function [net, options] = rbftrain(net, options, x, t)
%RBFTRAIN Two stage training of RBF network.
%
% Description
% NET = RBFTRAIN(NET, OPTIONS, X, T) uses a two stage training
% algorithm to set the weights in the RBF model structure NET. Each row
% of X corresponds to one input vector and each row of T contains the
% corresponding target vector. The centres are determined by fitting a
% Gaussian mixture model with circular covariances using the EM
% algorithm through a call to RBFSETBF. (The mixture model is
% initialised using a small number of iterations of the K-means
% algorithm.) If the activation functions are Gaussians, the basis
% function widths are then set to the maximum inter-centre squared
% distance.
%
% For linear outputs, the hidden to output weights that give rise to
% the least squares solution can then be determined using the pseudo-
% inverse. For neuroscale outputs, the hidden to output weights are
% determined using the iterative shadow targets algorithm. Although
% this two stage procedure may not give solutions with as low an error
% as using general purpose non-linear optimisers, it is much faster.
%
% The options vector may have two rows: if this is the case, then the
% second row is passed to RBFSETBF, which allows the user to specify a
% different number of iterations for RBF and GMM training. The optional
% parameters to RBFTRAIN have the following interpretations.
%
% OPTIONS(1) is set to 1 to display error values during training.
%
% OPTIONS(2) is a measure of the precision required for the value of
% the weights W at the solution.
%
% OPTIONS(3) is a measure of the precision required of the objective
% function at the solution. Both this and the previous condition must
% be satisfied for termination.
%
% OPTIONS(5) is set to 1 if the basis function parameters should
% remain unchanged; default 0.
%
% OPTIONS(6) is set to 1 if the output layer weights should be set
% using PCA. This is only relevant for Neuroscale outputs; default 0.
%
% OPTIONS(14) is the maximum number of iterations for the shadow
% targets algorithm; default 100.
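%
% Example (an illustrative sketch: NIN, NHIDDEN, NOUT and the data
% matrices X and T are assumed to be defined elsewhere):
%
%    net = rbf(nin, nhidden, nout, 'gaussian');
%    options = foptions;   % Default options vector
%    options(1) = 1;       % Display error values during training
%    net = rbftrain(net, options, x, t);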
%
% See also
% RBF, RBFERR, RBFFWD, RBFGRAD, RBFPAK, RBFUNPAK, RBFSETBF
%

% Copyright (c) Ian T Nabney (1996-2001)

% Check arguments for consistency
switch net.outfn
  case 'linear'
    errstring = consist(net, 'rbf', x, t);
  case 'neuroscale'
    errstring = consist(net, 'rbf', x);
  otherwise
    error(['Unknown output function ', net.outfn]);
end
if ~isempty(errstring)
  error(errstring);
end

% Allow options to have two rows: if this is the case, then the second row
% is passed to rbfsetbf
if size(options, 1) == 2
  setbfoptions = options(2, :);
  options = options(1, :);
else
  setbfoptions = options;
end

if (~options(14))
  options(14) = 100;
end
% Do we need to test for termination?
test = (options(2) | options(3));

% Set up the basis function parameters to model the input data density
% unless options(5) is set.
if ~(logical(options(5)))
  net = rbfsetbf(net, setbfoptions, x);
end

% Compute the design (or activations) matrix
[y, act] = rbffwd(net, x);
ndata = size(x, 1);

if strcmp(net.outfn, 'neuroscale') & options(6)
  % Initialise output layer weights by projecting data with PCA
  mu = mean(x);
  [pcvals, pcvecs] = pca(x, net.nout);
  xproj = (x - ones(ndata, 1)*mu)*pcvecs;
  % Now use projected data as targets to compute output layer weights
  temp = pinv([act ones(ndata, 1)]) * xproj;
  net.w2 = temp(1:net.nhidden, :);
  net.b2 = temp(net.nhidden+1, :);
  % Propagate again to compute revised outputs
  [y, act] = rbffwd(net, x);
end

switch net.outfn
  case 'linear'
    % Sum of squares error function in regression model
    % Solve for the weights and biases using the pseudo-inverse of the
    % activations
    Phi = [act ones(ndata, 1)];
    if ~isfield(net, 'alpha')
      % Unregularised solution: pseudo-inverse of the design matrix
      temp = pinv(Phi)*t;
    elseif all(size(net.alpha) == [1 1])
      % Use the normal equations with scalar weight decay alpha
      hessian = Phi'*Phi + net.alpha*eye(net.nhidden+1);
      temp = pinv(hessian)*(Phi'*t);
    else
      error('Only scalar alpha allowed');
    end
    net.w2 = temp(1:net.nhidden, :);
    net.b2 = temp(net.nhidden+1, :);

  case 'neuroscale'
    % Use the shadow targets training algorithm
    if nargin < 4
      % If optional input distances not passed in, then use
      % Euclidean distance
      x_dist = sqrt(dist2(x, x));
    else
      x_dist = t;
    end
    Phi = [act, ones(ndata, 1)];
    % Compute the pseudo-inverse of Phi
    PhiDag = pinv(Phi);
    % Compute y_dist, distances between image points
    y_dist = sqrt(dist2(y, y));

    % Save old weights so that we can check the termination criterion
    wold = netpak(net);
    % Compute initial error (stress) value
    errold = 0.5*(sum(sum((x_dist - y_dist).^2)));

    % Initial value for eta
    eta = 0.1;
    k_up = 1.2;
    k_down = 0.1;
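    % Shadow targets minimises the stress E = 0.5*sum_ij (d*_ij - d_ij)^2,
    % where d*_ij are the inter-point distances in the data space (x_dist)
    % and d_ij the distances between the corresponding outputs (y_dist).
    % Each cycle steps the outputs a distance eta down the stress gradient,
    % treats the moved points as 'shadow' targets, and refits the linear
    % output layer to them with the precomputed pseudo-inverse PhiDag.
    % The step size eta is adapted: scaled up by k_up after a successful
    % step and down by k_down after an unsuccessful one.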
    success = 1;  % Force initial gradient calculation

    for j = 1:options(14)
      if success
        % Compute the negative error gradient with respect to network outputs
        D = (x_dist - y_dist)./(y_dist+(y_dist==0));
        temp = y';
        neg_gradient = -2.*sum(kron(D, ones(1, net.nout)) .* ...
          (repmat(y, 1, ndata) - repmat((temp(:))', ndata, 1)), 1);
        neg_gradient = (reshape(neg_gradient, net.nout, ndata))';
      end
      % Compute the shadow targets
      t = y + eta*neg_gradient;
      % Solve for the weights and biases
      temp = PhiDag * t;
      net.w2 = temp(1:net.nhidden, :);
      net.b2 = temp(net.nhidden+1, :);

      % Do housekeeping and test for convergence
      ynew = rbffwd(net, x);
      y_distnew = sqrt(dist2(ynew, ynew));
      err = 0.5.*(sum(sum((x_dist-y_distnew).^2)));
      if err > errold
        success = 0;
        % Restore previous weights
        net = netunpak(net, wold);
        err = errold;
        eta = eta * k_down;
      else
        success = 1;
        eta = eta * k_up;
        y = ynew;
        y_dist = y_distnew;
        if test & j > 1
          w = netpak(net);
          % Compare weights and error with the previous cycle's values
          if (max(abs(w - wold)) < options(2) & abs(err - errold) < options(3))
            options(8) = err;
            return;
          end
        end
        errold = err;
        wold = netpak(net);
      end
      if options(1)
        fprintf(1, 'Cycle %4d Error %11.6f\n', j, err)
      end
    end
    options(8) = errold;
    if (options(1) >= 0)
      disp('Warning: Maximum number of iterations has been exceeded');
    end
  otherwise
    error(['Unknown output function ', net.outfn]);

end