%DEMOLGD1 Demonstrate simple MLP optimisation with on-line gradient descent
%
%	Description
%	The problem consists of one input variable X and one target variable
%	T, with data generated by sampling X at equal intervals and then
%	computing the targets as SIN(2*PI*X) plus Gaussian noise. A 2-layer
%	network with linear outputs is trained by minimising a sum-of-squares
%	error function using on-line gradient descent.
%
%	See also
%	DEMMLP1, OLGD
%

%	Copyright (c) Ian T Nabney (1996-2001)


% Generate the matrix of inputs x and targets t.

ndata = 20;		% Number of data points.
noise = 0.2;		% Standard deviation of noise distribution.
x = [0:1/(ndata - 1):1]';
randn('state', 42);
rand('state', 42);
t = sin(2*pi*x) + noise*randn(ndata, 1);

clc
disp('This demonstration illustrates the use of the on-line gradient')
disp('descent algorithm to train a Multi-Layer Perceptron network for')
disp('regression problems.  It is intended to illustrate the drawbacks')
disp('of this algorithm compared to more powerful non-linear optimisation')
disp('algorithms, such as conjugate gradients.')
disp(' ')
disp('First we generate the data from a noisy sine function and construct')
disp('the network.')
disp(' ')
disp('Press any key to continue.')
pause

% Set up network parameters.
nin = 1;		% Number of inputs.
nhidden = 3;		% Number of hidden units.
nout = 1;		% Number of outputs.
alpha = 0.01;		% Coefficient of weight-decay prior (not used by this demo).

% Create and initialise network weight vector.
net = mlp(nin, nhidden, nout, 'linear');
% Initialise weights reasonably close to 0
net = mlpinit(net, 10);

% Set up vector of options for the optimiser.
options = foptions;
options(1) = 1;		% Display error values.
options(14) = 20;	% Number of training cycles.
options(18) = 0.1;	% Learning rate.
options(17) = 0.4;	% Momentum.
options(5) = 1;		% Randomise pattern order.

clc
disp('Then we set the options for the training algorithm.')
disp(['In the first phase of training, which lasts for ',...
    num2str(options(14)), ' cycles,'])
disp(['the learning rate is ', num2str(options(18)), ...
    ' and the momentum is ', num2str(options(17)), '.'])
disp('The error values are displayed at the end of each pass through the')
disp('entire pattern set.')
disp(' ')
disp('Press any key to continue.')
pause
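% For orientation: the update OLGD applies after presenting pattern n can
% be sketched with Netlab's mlpgrad, mlppak and mlpunpak (a rough outline
% only, not olgd's actual internals; lrate, mmntm and dw are illustrative
% names):
%
%   g  = mlpgrad(net, x(n,:), t(n,:));	% Gradient on pattern n alone.
%   dw = mmntm*dw - lrate*g;		% Step with momentum.
%   net = mlpunpak(net, mlppak(net) + dw);
%
% In the second phase below, the learning rate is additionally decayed as
% 1/t over the pattern presentations.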
% Train using on-line gradient descent.
[net, options] = olgd(net, options, x, t);

% Now allow the learning rate to decay and remove the momentum.
options(2) = 0;		% Termination tolerance on weight change.
options(3) = 0;		% Termination tolerance on error change.
options(17) = 0;	% Turn off momentum.
options(5) = 1;		% Randomise pattern order.
options(6) = 1;		% Set learning rate decay on.
options(14) = 200;	% Number of training cycles.
options(18) = 0.1;	% Initial learning rate.

disp(['In the second phase of training, which lasts for up to ',...
    num2str(options(14)), ' cycles,'])
disp(['the learning rate starts at ', num2str(options(18)), ...
    ', decaying as 1/t, and the momentum is ', num2str(options(17)), '.'])
disp(' ')
disp('Press any key to continue.')
pause
[net, options] = olgd(net, options, x, t);

clc
disp('Now we plot the data, underlying function, and network outputs')
disp('on a single graph to compare the results.')
disp(' ')
disp('Press any key to continue.')
pause

% Plot the data, the original function, and the trained network function.
plotvals = [0:0.01:1]';
y = mlpfwd(net, plotvals);
fh1 = figure;
plot(x, t, 'ob')
hold on
axis([0 1 -1.5 1.5])
fplot('sin(2*pi*x)', [0 1], '--g')
plot(plotvals, y, '-r')
legend('data', 'function', 'network');
hold off

disp('Note the very poor fit to the data: this should be compared with')
disp('the results obtained in demmlp1.')
disp(' ')
disp('Press any key to exit.')
pause
close(fh1);
clear all;
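% For comparison (see DEMMLP1), the same network can be trained in batch
% mode with a more powerful optimiser such as scaled conjugate gradients.
% A minimal sketch, assuming x and t are regenerated as above (net2 and
% opts are illustrative names):
%
%   net2 = mlp(1, 3, 1, 'linear');
%   net2 = mlpinit(net2, 10);
%   opts = foptions;
%   opts(1) = 1;		% Display error values.
%   opts(14) = 100;	% Number of training cycles.
%   net2 = netopt(net2, opts, x, t, 'scg');
%   y2 = mlpfwd(net2, [0:0.01:1]');
%
% This typically fits the noisy sine far better than on-line gradient
% descent with a comparable training budget.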