function [net, options] = rbftrain(net, options, x, t)
%RBFTRAIN Two stage training of RBF network.
%
% Description
% NET = RBFTRAIN(NET, OPTIONS, X, T) uses a two stage training
% algorithm to set the weights in the RBF model structure NET. Each row
% of X corresponds to one input vector and each row of T contains the
% corresponding target vector. The centres are determined by fitting a
% Gaussian mixture model with circular covariances using the EM
% algorithm through a call to RBFSETBF. (The mixture model is
% initialised using a small number of iterations of the K-means
% algorithm.) If the activation functions are Gaussians, the basis
% function widths are then set to the maximum inter-centre squared
% distance.
%
% For linear outputs, the hidden to output weights that give rise to
% the least squares solution can then be determined using the pseudo-
% inverse. For neuroscale outputs, the hidden to output weights are
% determined using the iterative shadow targets algorithm. Although
% this two stage procedure may not give solutions with as low an error
% as using general purpose non-linear optimisers, it is much faster.
%
% The options vector may have two rows: if this is the case, then the
% second row is passed to RBFSETBF, which allows the user to specify a
% different number of iterations for RBF and GMM training. The optional
% parameters to RBFTRAIN have the following interpretations.
%
% OPTIONS(1) is set to 1 to display error values during training.
%
% OPTIONS(2) is a measure of the precision required for the value of
% the weights W at the solution.
%
% OPTIONS(3) is a measure of the precision required of the objective
% function at the solution. Both this and the previous condition must
% be satisfied for termination.
%
% OPTIONS(5) is set to 1 if the basis function parameters should
% remain unchanged; default 0.
%
% OPTIONS(6) is set to 1 if the output layer weights should be set
% using PCA. This is only relevant for Neuroscale outputs; default 0.
%
% OPTIONS(14) is the maximum number of iterations for the shadow
% targets algorithm; default 100.
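%
% Example (an illustrative sketch: NIN, NHIDDEN, NOUT and the data
% matrices X and T are assumed to be defined elsewhere):
%
%    net = rbf(nin, nhidden, nout, 'gaussian');
%    options = foptions;   % Default options vector
%    options(1) = 1;       % Display error values during training
%    net = rbftrain(net, options, x, t);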
%
% See also
% RBF, RBFERR, RBFFWD, RBFGRAD, RBFPAK, RBFUNPAK, RBFSETBF
%

% Copyright (c) Ian T Nabney (1996-2001)

% Check arguments for consistency
switch net.outfn
  case 'linear'
    errstring = consist(net, 'rbf', x, t);
  case 'neuroscale'
    errstring = consist(net, 'rbf', x);
  otherwise
    error(['Unknown output function ', net.outfn]);
end
if ~isempty(errstring)
  error(errstring);
end

% Allow options to have two rows: if this is the case, then the second row
% is passed to rbfsetbf
if size(options, 1) == 2
  setbfoptions = options(2, :);
  options = options(1, :);
else
  setbfoptions = options;
end

if (~options(14))
  options(14) = 100;
end
% Do we need to test for termination?
test = (options(2) | options(3));

% Set up the basis function parameters to model the input data density
% unless options(5) is set.
if ~(logical(options(5)))
  net = rbfsetbf(net, setbfoptions, x);
end

% Compute the design (or activations) matrix
[y, act] = rbffwd(net, x);
ndata = size(x, 1);

if strcmp(net.outfn, 'neuroscale') & options(6)
  % Initialise output layer weights by projecting data with PCA
  mu = mean(x);
  [pcvals, pcvecs] = pca(x, net.nout);
  xproj = (x - ones(ndata, 1)*mu)*pcvecs;
  % Now use projected data as targets to compute output layer weights
  temp = pinv([act ones(ndata, 1)]) * xproj;
  net.w2 = temp(1:net.nhidden, :);
  net.b2 = temp(net.nhidden+1, :);
  % Propagate again to compute revised outputs
  [y, act] = rbffwd(net, x);
end

switch net.outfn
  case 'linear'
    % Sum of squares error function in regression model
    % Solve for the weights and biases using the pseudo-inverse of the
    % activations
    Phi = [act ones(ndata, 1)];
    if ~isfield(net, 'alpha')
      % Unregularised solution: pseudo-inverse of the design matrix
      temp = pinv(Phi)*t;
    elseif all(size(net.alpha) == [1 1])
      % Use the normal equations with scalar weight decay alpha
      hessian = Phi'*Phi + net.alpha*eye(net.nhidden+1);
      temp = pinv(hessian)*(Phi'*t);
    else
      error('Only scalar alpha allowed');
    end
    net.w2 = temp(1:net.nhidden, :);
    net.b2 = temp(net.nhidden+1, :);

  case 'neuroscale'
    % Use the shadow targets training algorithm
    if nargin < 4
      % If optional input distances not passed in, then use
      % Euclidean distance
      x_dist = sqrt(dist2(x, x));
    else
      x_dist = t;
    end
    Phi = [act, ones(ndata, 1)];
    % Compute the pseudo-inverse of Phi
    PhiDag = pinv(Phi);
    % Compute y_dist, distances between image points
    y_dist = sqrt(dist2(y, y));

    % Save old weights so that we can check the termination criterion
    wold = netpak(net);
    % Compute initial error (stress) value
    errold = 0.5*(sum(sum((x_dist - y_dist).^2)));

    % Initial value for eta
    eta = 0.1;
    k_up = 1.2;
    k_down = 0.1;
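    % Shadow targets minimises the stress E = 0.5*sum_ij (d*_ij - d_ij)^2,
    % where d*_ij are the inter-point distances in the data space (x_dist)
    % and d_ij the distances between the corresponding outputs (y_dist).
    % Each cycle steps the outputs a distance eta down the stress gradient,
    % treats the moved points as 'shadow' targets, and refits the linear
    % output layer to them with the precomputed pseudo-inverse PhiDag.
    % The step size eta is adapted: scaled up by k_up after a successful
    % step and down by k_down after an unsuccessful one.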
    success = 1;  % Force initial gradient calculation

    for j = 1:options(14)
      if success
        % Compute the negative error gradient with respect to network outputs
        D = (x_dist - y_dist)./(y_dist+(y_dist==0));
        temp = y';
        neg_gradient = -2.*sum(kron(D, ones(1, net.nout)) .* ...
          (repmat(y, 1, ndata) - repmat((temp(:))', ndata, 1)), 1);
        neg_gradient = (reshape(neg_gradient, net.nout, ndata))';
      end
      % Compute the shadow targets
      t = y + eta*neg_gradient;
      % Solve for the weights and biases
      temp = PhiDag * t;
      net.w2 = temp(1:net.nhidden, :);
      net.b2 = temp(net.nhidden+1, :);

      % Do housekeeping and test for convergence
      ynew = rbffwd(net, x);
      y_distnew = sqrt(dist2(ynew, ynew));
      err = 0.5.*(sum(sum((x_dist-y_distnew).^2)));
      if err > errold
        success = 0;
        % Restore previous weights
        net = netunpak(net, wold);
        err = errold;
        eta = eta * k_down;
      else
        success = 1;
        eta = eta * k_up;
        y = ynew;
        y_dist = y_distnew;
        if test & j > 1
          w = netpak(net);
          % Compare weights and error with the previous cycle's values
          if (max(abs(w - wold)) < options(2) & abs(err - errold) < options(3))
            options(8) = err;
            return;
          end
        end
        errold = err;
        wold = netpak(net);
      end
      if options(1)
        fprintf(1, 'Cycle %4d Error %11.6f\n', j, err)
      end
    end
    options(8) = errold;
    if (options(1) >= 0)
      disp('Warning: Maximum number of iterations has been exceeded');
    end
  otherwise
    error(['Unknown output function ', net.outfn]);

end