function [theta, eta] = mixture_of_experts(q, data, num_iter, theta, eta)
% MIXTURE_OF_EXPERTS Fit a piecewise linear regression model using stochastic gradient descent.
% [theta, eta] = mixture_of_experts(q, data, num_iter)
% [theta, eta] = mixture_of_experts(q, data, num_iter, theta, eta)
%
% Inputs:
% q          = number of pieces (experts)
% data(l,:)  = example l; the last column is the regression target,
%              the preceding columns are the input
% num_iter   = number of passes over the data set
% theta, eta = optional initial parameter values (random if omitted)
%
% Outputs:
% theta(i,:) = regression vector for expert i
% eta(i,:)   = softmax (gating) params for expert i

[num_cases dim] = size(data);
data = [ones(num_cases,1) data]; % prepend with offset (bias) term
mu = 0.5;  % step size for the stochastic gradient updates
sigma = 1; % std. dev. of the assumed Gaussian observation noise

if nargin < 4
  theta = 0.1*rand(q, dim);
  eta = 0.1*rand(q, dim);
end

for t=1:num_iter
  for iter=1:num_cases
    x = data(iter, 1:dim);     % input, with leading 1 for the offset
    ystar = data(iter, dim+1); % target
    % yhat(i) = E[y | Q=i, x] = prediction of i'th expert
    yhat = theta * x';
    % gate_prior(i) = Pr(Q=i | x).
    % Subtract the max before exponentiating so exp() cannot overflow;
    % the softmax is invariant to a constant shift of its inputs.
    a = eta * x';
    gate_prior = exp(a - max(a));
    gate_prior = gate_prior / sum(gate_prior);
    % lik(i) = Pr(y | Q=i, x), Gaussian with mean yhat(i) and variance sigma^2
    lik = (1/(sqrt(2*pi)*sigma)) * exp(-(0.5/sigma^2) * ((ystar - yhat) .* (ystar - yhat)));
    % gate_posterior(i) = Pr(Q=i | x, y)
    gate_posterior = gate_prior .* lik;
    s = sum(gate_posterior);
    if s > 0
      gate_posterior = gate_posterior / s;
    else
      % Every expert's likelihood underflowed to 0 (huge residuals);
      % dividing would give NaNs, so fall back on the prior instead.
      gate_posterior = gate_prior;
    end
    % Stochastic gradient ascent on the log-likelihood
    eta = eta + mu*(gate_posterior - gate_prior)*x;
    theta = theta + mu*(gate_posterior .* (ystar - yhat))*x;
  end

  if mod(t,100)==0
    fprintf(1, 'iter %d\n', t);
  end

end
fprintf(1, '\n');