"""
This tutorial introduces the multilayer perceptron using Theano.

A multilayer perceptron is a logistic regressor where, instead of
feeding the input to the logistic regression directly, you insert an
intermediate layer, called the hidden layer, that has a nonlinear
activation function (usually tanh or sigmoid). One can use many such
hidden layers, making the architecture deep. The tutorial will also tackle
the problem of MNIST digit classification.

.. math::

    f(x) = G(b^{(2)} + W^{(2)}(s(b^{(1)} + W^{(1)} x))),

References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 5

"""

__docformat__ = 'restructuredtext en'


import os
import sys
import timeit

import numpy

import theano
import theano.tensor as T


from logistic_sgd import LogisticRegression, load_data


# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully connected and have
        a sigmoidal activation function. Weight matrix W is of shape
        (n_in, n_out) and the bias vector b is of shape (n_out,).

        NOTE: The nonlinearity used here is tanh.

        Hidden unit activation is given by: tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: nonlinearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from [-sqrt(6. / (n_in + n_hidden)), sqrt(6. / (n_in + n_hidden))]
        # for the tanh activation function.
        # The output of `uniform` is converted using `asarray` to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note: optimal initialization of weights depends on the
        # activation function used (among other things).
        # For example, results presented in [Xavier10] suggest that you
        # should use 4 times larger initial weights for sigmoid
        # compared to tanh.
        # We have no such information for other functions, so we use the
        # same scheme as for tanh.
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
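            # (with the MNIST defaults used below, n_in=784 and n_out=500,
            # so the sampling bound is sqrt(6. / 1284) ~= 0.068)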
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]


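# A minimal usage sketch for the class above (illustrative only; the names
# are placeholders, not part of the tutorial):
#
#     rng = numpy.random.RandomState(0)
#     x = T.matrix('x')
#     layer = HiddenLayer(rng=rng, input=x, n_in=784, n_out=500)
#     f = theano.function([x], layer.output)
#     h = f(numpy.zeros((2, 784), dtype=theano.config.floatX))  # shape (2, 500)
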
# start-snippet-2
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units with nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation function (defined here by a ``HiddenLayer`` class) while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # Since we are dealing with a one-hidden-layer MLP, this will
        # translate into a HiddenLayer with a tanh activation function
        # connected to the LogisticRegression layer; the activation function
        # can be replaced by sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # end-snippet-2 start-snippet-3
        # L1 norm; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )

        # square of L2 norm; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers
        # it is made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input
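

# A minimal sketch of using MLP outside of test_mlp() (illustrative only;
# this assumes the ``y_pred`` attribute defined by the tutorial's
# LogisticRegression class in logistic_sgd.py):
#
#     rng = numpy.random.RandomState(1234)
#     x = T.matrix('x')
#     clf = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)
#     predict = theano.function([x], clf.logRegressionLayer.y_pred)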


def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron.

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of
                         # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4
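    # written out, the regularized objective being minimized is
    #     cost = NLL + L1_reg * (|W_hidden| + |W_logreg|)
    #                + L2_reg * (||W_hidden||^2 + ||W_logreg||^2)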

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]
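    # for this model, gparams holds four symbolic expressions:
    #     [d(cost)/dW_hidden, d(cost)/db_hidden,
    #      d(cost)/dW_logreg, d(cost)/db_logreg]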

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]
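    # each pair implements one step of plain stochastic gradient descent:
    #     param <- param - learning_rate * d(cost)/d(param)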

    # compiling a Theano function `train_model` that returns the cost, but
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
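    # (with the defaults here: MNIST has 50000 training examples and
    # batch_size is 20, so n_train_batches = 2500 and validation_frequency
    # = min(2500, 5000) = 2500, i.e. we validate once per epoch)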

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


# Rectified Linear Unit
# Source: http://stackoverflow.com/questions/26497564/theano-hiddenlayer-activation-function
def relu(x):
    return T.maximum(0., x)
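# (illustrative, not exercised by the tutorial itself: relu can be passed to
# the hidden layer in place of tanh, e.g.
#     HiddenLayer(rng=rng, input=x, n_in=784, n_out=500, activation=relu)
# note that the 4x weight-scaling special case above applies only to sigmoid)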


if __name__ == '__main__':
    test_mlp()