wolffd@0
|
1 %DEMEV2 Demonstrate Bayesian classification for the MLP.
|
wolffd@0
|
2 %
|
wolffd@0
|
3 % Description
|
wolffd@0
|
4 % A synthetic two class two-dimensional dataset X is sampled from a
|
wolffd@0
|
5 % mixture of four Gaussians. Each class is associated with two of the
|
wolffd@0
|
6 % Gaussians so that the optimal decision boundary is non-linear. A 2-
|
wolffd@0
|
7 % layer network with logistic outputs is trained by minimizing the
|
wolffd@0
|
8 % cross-entropy error function with isotropic Gaussian regularizer (one
|
wolffd@0
|
9 % hyperparameter for each of the four standard weight groups), using
|
wolffd@0
|
10 % the scaled conjugate gradient optimizer. The hyperparameter vectors
|
wolffd@0
|
11 % ALPHA and BETA are re-estimated using the function EVIDENCE. A graph
|
wolffd@0
|
12 % is plotted of the optimal, regularised, and unregularised decision
|
wolffd@0
|
13 % boundaries. A further plot of the moderated versus unmoderated
|
wolffd@0
|
14 % contours is generated.
|
wolffd@0
|
15 %
|
wolffd@0
|
16 % See also
|
wolffd@0
|
17 % EVIDENCE, MLP, SCG, DEMARD, DEMMLP2
|
wolffd@0
|
18 %
|
wolffd@0
|
19
|
wolffd@0
|
20 % Copyright (c) Ian T Nabney (1996-2001)
|
wolffd@0
|
21
|
wolffd@0
|
22
|
wolffd@0
|
% Clear the command window and describe what the demo will show.
clc;

disp('This program demonstrates the use of the evidence procedure on')
disp('a two-class problem.  It also shows the improved generalisation')
disp('performance that can be achieved with moderated outputs; that is')
disp('predictions where an approximate integration over the true')
disp('posterior distribution is carried out.')
disp(' ')
disp('First we generate a synthetic dataset with two-dimensional input')
disp('sampled from a mixture of four Gaussians.  Each class is')
disp('associated with two of the Gaussians so that the optimal decision')
disp('boundary is non-linear.')
disp(' ')
disp('Press any key to see a plot of the data.')
pause;
|
wolffd@0
|
38
|
wolffd@0
|
% Generate the matrix of inputs x and targets t.

% Fix the legacy RNG seeds so the demo is reproducible from run to run.
rand('state', 423);
randn('state', 423);

% Plotting attributes: class 1 in red, class 2 in yellow.
ClassSymbol1 = 'r.';
ClassSymbol2 = 'y.';
PointSize = 12;
titleSize = 10;

fh1 = figure;
set(fh1, 'Name', 'True Data Distribution');
whitebg(fh1, 'k');	% Light symbols show up better on a dark background.

%
% Generate the data
%
n=200;	% Number of points sampled from the mixture.

% Set up mixture model: 2d data with four centres
% Class 1 is first two centres, class 2 from the other two
mix = gmm(2, 4, 'full');
mix.priors = [0.25 0.25 0.25 0.25];
mix.centres = [0 -0.1; 1.5 0; 1 1; 1 -1];
mix.covars(:,:,1) = [0.625 -0.2165; -0.2165 0.875];
mix.covars(:,:,2) = [0.25 0; 0 0.25];
mix.covars(:,:,3) = [0.2241 -0.1368; -0.1368 0.9759];
mix.covars(:,:,4) = [0.2375 0.1516; 0.1516 0.4125];

% Sample n points; label(i) records which component generated data(i,:).
[data, label] = gmmsamp(mix, n);
|
wolffd@0
|
69
|
wolffd@0
|
%
% Calculate some useful axis limits from the sampled data.
%
x0 = min(data(:,1));
x1 = max(data(:,1));
y0 = min(data(:,2));
y1 = max(data(:,2));
dx = x1-x0;
dy = y1-y0;
expand = 5/100;			% Add on 5 percent each way
x0 = x0 - dx*expand;
x1 = x1 + dx*expand;
y0 = y0 - dy*expand;
y1 = y1 + dy*expand;
resolution = 100;
% NOTE(review): the x-extent alone sets the sampling step for BOTH axes,
% so the grid is square-celled but the y-axis point count varies with dy.
step = dx/resolution;
xrange = [x0:step:x1];
yrange = [y0:step:y1];
%
% Generate the grid
%
% Reuse xrange/yrange rather than repeating the range expressions, so the
% grid is guaranteed to stay consistent with the ranges passed to contour()
% later in the demo.
[X Y]=meshgrid(xrange, yrange);
%
% Calculate the class conditional densities, the unconditional densities and
% the posterior probabilities
%
px_j = gmmactiv(mix, [X(:) Y(:)]);
px = reshape(px_j*(mix.priors)',size(X));
post = gmmpost(mix, [X(:) Y(:)]);
% Class 1 comes from mixture components 1 and 2; class 2 from 3 and 4.
p1_x = reshape(post(:, 1) + post(:, 2), size(X));
p2_x = reshape(post(:, 3) + post(:, 4), size(X));
|
wolffd@0
|
101
|
wolffd@0
|
% Scatter-plot the sampled data, one symbol/colour per class.
plot(data((label<=2),1),data(label<=2,2),ClassSymbol1, 'MarkerSize', ...
  PointSize)
hold on
axis([x0 x1 y0 y1])
plot(data((label>2),1),data(label>2,2),ClassSymbol2, 'MarkerSize', ...
  PointSize)

% Convert targets to 0-1 encoding
% (1 = class 1, i.e. mixture components 1 and 2; 0 = class 2).
target=[label<=2];
disp(' ')
disp('Press any key to continue')
pause; clc;
|
wolffd@0
|
114
|
wolffd@0
|
disp('Next we create a two-layer MLP network with 6 hidden units and')
disp('one logistic output.  We use a separate inverse variance')
disp('hyperparameter for each group of weights (inputs, input bias,')
disp('outputs, output bias) and the weights are optimised with the')
disp('scaled conjugate gradient algorithm.  After each 100 iterations')
disp('the hyperparameters are re-estimated twice.  There are eight')
disp('cycles of the whole algorithm.')
disp(' ')
disp('Press any key to train the network and determine the hyperparameters.')
pause;

% Set up network parameters.
nin = 2;		% Number of inputs.
nhidden = 6;		% Number of hidden units.
nout = 1;		% Number of outputs.
alpha = 0.01;		% Initial prior hyperparameter.
aw1 = 0.01;		% Initial hyperparameters, one per weight group:
ab1 = 0.01;		% first-layer weights / first-layer biases /
aw2 = 0.01;		% second-layer weights / second-layer biases.
ab2 = 0.01;

% Create and initialize network weight vector, drawing the initial
% weights from the Gaussian prior defined by the four hyperparameters.
prior = mlpprior(nin, nhidden, nout, aw1, ab1, aw2, ab2);
net = mlp(nin, nhidden, nout, 'logistic', prior);

% Set up vector of options for the optimiser.
nouter = 8;			% Number of outer loops.
ninner = 2;			% Number of inner loops.
options = foptions;		% Default options vector.
options(1) = 1;			% This provides display of error values.
options(2) = 1.0e-5;		% Absolute precision for weights.
options(3) = 1.0e-5;		% Precision for objective function.
options(14) = 100;		% Number of training cycles in inner loop.

% Train using scaled conjugate gradients, re-estimating alpha and beta.
% Each outer cycle: optimise weights, then re-estimate the hyperparameters
% (ninner times) with the evidence procedure; gamma is the effective
% number of well-determined parameters reported by EVIDENCE.
for k = 1:nouter
  net = netopt(net, options, data, target, 'scg');
  [net, gamma] = evidence(net, data, target, ninner);
  fprintf(1, '\nRe-estimation cycle %d:\n', k);
  disp([' alpha = ', num2str(net.alpha')]);
  fprintf(1, ' gamma = %8.5f\n\n', gamma);
  disp(' ')
  disp('Press any key to continue.')
  pause;
end
|
wolffd@0
|
160
|
wolffd@0
|
disp(' ')
disp('Network training and hyperparameter re-estimation are now complete.')
disp('Notice that the final error value is close to the number of data')
disp(['points (', num2str(n), ') divided by two.'])
disp('Also, the hyperparameter values differ, which suggests that a single')
disp('hyperparameter would not be so effective.')
disp(' ')
% Fix: the Bayesian training has already happened at this point, so the
% original message's "First we train ..." was misleading.
disp('Now we train an MLP without Bayesian regularisation on the')
disp('same dataset using 400 iterations of scaled conjugate gradient')
disp(' ')
disp('Press any key to train the network by maximum likelihood.')
pause;
% Train standard (unregularised, maximum likelihood) network.
net2 = mlp(nin, nhidden, nout, 'logistic');
options(14) = 400;
net2 = netopt(net2, options, data, target, 'scg');
% Evaluate the unregularised network over the grid for the contour plots.
y2g = mlpfwd(net2, [X(:), Y(:)]);
y2g = reshape(y2g(:, 1), size(X));
|
wolffd@0
|
179
|
wolffd@0
|
disp(' ')
disp('We can now plot the function represented by the trained networks.')
disp('We show the decision boundaries (output = 0.5) and the optimal')
disp('decision boundary given by applying Bayes'' theorem to the true')
disp('data model.')
disp(' ')
disp('Press any key to add the boundaries to the plot.')
pause;

% Evaluate predictions.
% yg are the plain network outputs; ymodg are the moderated outputs
% (approximate integration over the weight posterior) from MLPEVFWD.
[yg, ymodg] = mlpevfwd(net, data, target, [X(:) Y(:)]);
yg = reshape(yg(:,1),size(X));
ymodg = reshape(ymodg(:,1),size(X));

% Bayesian decision boundary: the 0.5 contour of the true posterior,
% followed by the regularised and unregularised network boundaries.
[cB, hB] = contour(xrange,yrange,p1_x,[0.5 0.5],'b-');
[cNb, hNb] = contour(xrange,yrange,yg,[0.5 0.5],'r-');
[cN, hN] = contour(xrange,yrange,y2g,[0.5 0.5],'g-');
set(hB, 'LineWidth', 2);
set(hNb, 'LineWidth', 2);
set(hN, 'LineWidth', 2);
Chandles = [hB(1) hNb(1) hN(1)];
% The trailing 3 is the legacy legend position code (lower-left corner).
legend(Chandles, 'Bayes', ...
  'Reg. Network', 'Network', 3);
|
wolffd@0
|
204
|
wolffd@0
|
disp(' ')
disp('Note how the regularised network predictions are closer to the')
disp('optimal decision boundary, while the unregularised network is')
disp('overtrained.')

disp(' ')
disp('We will now compare moderated and unmoderated outputs for the');
disp('regularised network by showing the contour plot of the posterior')
disp('probability estimates.')
disp(' ')
disp('The first plot shows the regularised (moderated) predictions')
disp('and the second shows the standard predictions from the same network.')
disp('These agree at the level 0.5.')
disp('Press any key to continue')
pause
% Contour levels for the probability maps: 0 to 1 in steps of 0.1.
levels = 0:0.1:1;
fh4 = figure;
set(fh4, 'Name', 'Moderated outputs');
hold on
plot(data((label<=2),1),data(label<=2,2),'r.', 'MarkerSize', PointSize)
plot(data((label>2),1),data(label>2,2),'y.', 'MarkerSize', PointSize)

% Moderated predictions (ymodg) from the regularised network.
[cNby, hNby] = contour(xrange, yrange, ymodg, levels, 'k-');
set(hNby, 'LineWidth', 1);

fh5 = figure;
set(fh5, 'Name', 'Unmoderated outputs');
hold on
plot(data((label<=2),1),data(label<=2,2),'r.', 'MarkerSize', PointSize)
plot(data((label>2),1),data(label>2,2),'y.', 'MarkerSize', PointSize)

% Unmoderated predictions (yg) from the same regularised network.
[cNbm, hNbm] = contour(xrange, yrange, yg, levels, 'k-');
set(hNbm, 'LineWidth', 1);

disp(' ')
disp('Note how the moderated contours are more widely spaced.  This shows')
disp('that there is a larger region where the outputs are close to 0.5')
disp('and a smaller region where the outputs are close to 0 or 1.')
disp(' ')
disp('Press any key to exit')
pause
% Tidy up: close all figures opened by the demo.
close(fh1);
close(fh4);
close(fh5);