%DEMOLGD1 Demonstrate simple MLP optimisation with on-line gradient descent
%
% Description
% The problem consists of one input variable X and one target variable
% T with data generated by sampling X at equal intervals and then
% generating target data by computing SIN(2*PI*X) and adding Gaussian
% noise. A 2-layer network with linear outputs is trained by minimizing
% a sum-of-squares error function using on-line gradient descent.
%
% See also
% DEMMLP1, OLGD
%

% Copyright (c) Ian T Nabney (1996-2001)


% Generate the matrix of inputs x and targets t.

ndata = 20;    % Number of data points.
noise = 0.2;   % Standard deviation of noise distribution.
x = [0:1/(ndata - 1):1]';
randn('state', 42);
rand('state', 42);
t = sin(2*pi*x) + noise*randn(ndata, 1);

clc
disp('This demonstration illustrates the use of the on-line gradient')
disp('descent algorithm to train a Multi-Layer Perceptron network for')
disp('regression problems. It is intended to illustrate the drawbacks')
disp('of this algorithm compared to more powerful non-linear optimisation')
disp('algorithms, such as conjugate gradients.')
disp(' ')
disp('First we generate the data from a noisy sine function and construct')
disp('the network.')
disp(' ')
disp('Press any key to continue.')
pause
% Set up network parameters.
nin = 1;      % Number of inputs.
nhidden = 3;  % Number of hidden units.
nout = 1;     % Number of outputs.
alpha = 0.01; % Coefficient of weight-decay prior (not used further in this demo).

% Create and initialize network weight vector.
net = mlp(nin, nhidden, nout, 'linear');
% Initialise weights reasonably close to 0 (drawn from a zero-mean
% Gaussian with inverse variance 10).
net = mlpinit(net, 10);

% Set up vector of options for the optimiser.
options = foptions;
options(1) = 1;    % This provides display of error values.
options(14) = 20;  % Number of training cycles.
options(18) = 0.1; % Learning rate
options(17) = 0.4; % Momentum
options(5) = 1;    % Do randomise pattern order
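
% For reference, a sketch of the update rule olgd applies to each pattern n
% within a pass (learning rate eta = options(18), momentum mu = options(17);
% see olgd for the exact implementation):
%
%   dw = mu*dw - eta*grad(E_n, w);
%   w  = w + dw;
%
% where E_n is the error evaluated on the single pattern n.
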
clc
disp('Then we set the options for the training algorithm.')
disp(['In the first phase of training, which lasts for ',...
  num2str(options(14)), ' cycles,'])
disp(['the learning rate is ', num2str(options(18)), ...
  ' and the momentum is ', num2str(options(17)), '.'])
disp('The error values are displayed at the end of each pass through the')
disp('entire pattern set.')
disp(' ')
disp('Press any key to continue.')
pause

% Train using online gradient descent
[net, options] = olgd(net, options, x, t);
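
% The optimiser returns the options vector with the final error value stored
% in options(8) (the usual Netlab foptions convention), so we can report it:
fprintf('Error after the first phase of training: %g\n', options(8));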

% Now allow learning rate to decay and remove momentum
options(2) = 0;    % Precision required of weights at termination.
options(3) = 0;    % Precision required of error at termination.
options(17) = 0;   % Turn off momentum
options(5) = 1;    % Randomise pattern order
options(6) = 1;    % Set learning rate decay on
options(14) = 200; % Maximum number of training cycles.
options(18) = 0.1; % Initial learning rate
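
% A sketch of the decay schedule used when options(6) is set: the effective
% learning rate falls off roughly as
%
%   eta_t = eta_0 / t
%
% so later updates take progressively smaller steps (see olgd for the exact
% schedule).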

disp(['In the second phase of training, which lasts for up to ',...
  num2str(options(14)), ' cycles,'])
disp(['the learning rate starts at ', num2str(options(18)), ...
  ', decaying at 1/t, and the momentum is ', num2str(options(17)), '.'])
disp(' ')
disp('Press any key to continue.')
pause
[net, options] = olgd(net, options, x, t);

clc
disp('Now we plot the data, underlying function, and network outputs')
disp('on a single graph to compare the results.')
disp(' ')
disp('Press any key to continue.')
pause

% Plot the data, the original function, and the trained network function.
plotvals = [0:0.01:1]';
y = mlpfwd(net, plotvals);
fh1 = figure;
plot(x, t, 'ob')
hold on
axis([0 1 -1.5 1.5])
fplot('sin(2*pi*x)', [0 1], '--g')
plot(plotvals, y, '-r')
legend('data', 'function', 'network');
hold off
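
% As a quantitative check (optional), Netlab's mlperr gives the
% sum-of-squares error of the trained network on the training data:
fprintf('Sum-of-squares error on the training data: %g\n', mlperr(net, x, t));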

disp('Note the very poor fit to the data: this should be compared with')
disp('the results obtained in demmlp1.')
disp(' ')
disp('Press any key to exit.')
pause
close(fh1);
clear all;