function [theta, eta] = mixture_of_experts(q, data, num_iter, theta, eta)
% MIXTURE_OF_EXPERTS Fit a piecewise linear regression model using stochastic gradient descent.
% [theta, eta] = mixture_of_experts(q, data, num_iter)
% [theta, eta] = mixture_of_experts(q, data, num_iter, theta, eta)
%
% Inputs:
% q          = number of pieces (experts)
% data(l,:)  = example l; the last column is the regression target,
%              the preceding columns are the input
% num_iter   = number of passes over the data set
% theta, eta = optional initial parameter values (random if omitted)
%
% Outputs:
% theta(i,:) = regression vector for expert i
% eta(i,:)   = softmax (gating) params for expert i

[num_cases dim] = size(data);
data = [ones(num_cases,1) data]; % prepend with offset (bias) term
mu = 0.5;  % step size for the stochastic gradient updates
sigma = 1; % std. dev. of the assumed Gaussian observation noise

if nargin < 4
  theta = 0.1*rand(q, dim);
  eta = 0.1*rand(q, dim);
end

for t=1:num_iter
  for iter=1:num_cases
    x = data(iter, 1:dim);     % input, with leading 1 for the offset
    ystar = data(iter, dim+1); % target
    % yhat(i) = E[y | Q=i, x] = prediction of i'th expert
    yhat = theta * x';
    % gate_prior(i) = Pr(Q=i | x).
    % Subtract the max before exponentiating so exp() cannot overflow;
    % the softmax is invariant to a constant shift of its inputs.
    a = eta * x';
    gate_prior = exp(a - max(a));
    gate_prior = gate_prior / sum(gate_prior);
    % lik(i) = Pr(y | Q=i, x), Gaussian with mean yhat(i) and variance sigma^2
    lik = (1/(sqrt(2*pi)*sigma)) * exp(-(0.5/sigma^2) * ((ystar - yhat) .* (ystar - yhat)));
    % gate_posterior(i) = Pr(Q=i | x, y)
    gate_posterior = gate_prior .* lik;
    s = sum(gate_posterior);
    if s > 0
      gate_posterior = gate_posterior / s;
    else
      % Every expert's likelihood underflowed to 0 (huge residuals);
      % dividing would give NaNs, so fall back on the prior instead.
      gate_posterior = gate_prior;
    end
    % Stochastic gradient ascent on the log-likelihood
    eta = eta + mu*(gate_posterior - gate_prior)*x;
    theta = theta + mu*(gate_posterior .* (ystar - yhat))*x;
  end

  if mod(t,100)==0
    fprintf(1, 'iter %d\n', t);
  end

end
fprintf(1, '\n');