comparison Code/genre_classification/classification/mlp.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
1 """
2 This tutorial introduces the multilayer perceptron using Theano.
3
4 A multilayer perceptron is a logistic regressor where
5 instead of feeding the input to the logistic regression you insert a
6 intermediate layer, called the hidden layer, that has a nonlinear
7 activation function (usually tanh or sigmoid) . One can use many such
8 hidden layers making the architecture deep. The tutorial will also tackle
9 the problem of MNIST digit classification.
10
11 .. math::
12
13 f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
14
15 References:
16
17 - textbooks: "Pattern Recognition and Machine Learning" -
18 Christopher M. Bishop, section 5
19
20 """
21 __docformat__ = 'restructedtext en'
22
23
24 import os
25 import sys
26 import timeit
27
28 import numpy
29
30 import theano
31 import theano.tensor as T
32
33
34 from logistic_sgd import LogisticRegression, load_data
35
36
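# A minimal NumPy sketch (illustration only, not used below) of the forward
# pass formula from the module docstring, assuming s = tanh and G = softmax,
# and using the row-vector convention of the code below (dot(x, W)).
def _forward_pass_sketch(x, W1, b1, W2, b2):
    """Compute f(x) = G(b2 + W2 . s(b1 + W1 . x)) for a single example x."""
    hidden = numpy.tanh(numpy.dot(x, W1) + b1)       # s(b1 + W1 x)
    scores = numpy.dot(hidden, W2) + b2              # b2 + W2 h
    exp_scores = numpy.exp(scores - scores.max())    # softmax, numerically stable
    return exp_scores / exp_scores.sum()
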
# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully-connected and have
        a sigmoidal activation function. Weight matrix W is of shape
        (n_in, n_out) and the bias vector b is of shape (n_out,).

        NOTE : The nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: non-linearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6. / (n_in + n_out)) to sqrt(6. / (n_in + n_out))
        # for the tanh activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note : optimal initialization of weights is dependent on the
        # activation function used (among other things).
        # For example, results presented in [Xavier10] suggest that you
        # should use 4 times larger initial weights for sigmoid
        # compared to tanh.
        # We have no info for other functions, so we use the same as
        # tanh.
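        # For instance (illustration only), with n_in=784 and n_out=500 the
        # sampling bound is sqrt(6. / 1284) ~= 0.068, so initial weights lie
        # roughly in [-0.068, 0.068].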
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]

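
# A minimal usage sketch (illustration only): compile the hidden layer's
# output into a callable function and run it on random data. The shapes
# (n_in=784, n_out=500) are chosen to match the MNIST setup used below.
def _hidden_layer_sketch():
    rng = numpy.random.RandomState(0)
    x = T.matrix('x')
    layer = HiddenLayer(rng=rng, input=x, n_in=784, n_out=500,
                        activation=T.tanh)
    f = theano.function(inputs=[x], outputs=layer.output)
    batch = numpy.random.rand(2, 784).astype(theano.config.floatX)
    return f(batch)  # array of shape (2, 500)
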

# start-snippet-2
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually have tanh or the sigmoid function as
    activation function (defined here by a ``HiddenLayer`` class), while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # Since we are dealing with a one-hidden-layer MLP, this will
        # translate into a HiddenLayer with a tanh activation function
        # connected to the LogisticRegression layer; the activation function
        # can be replaced by sigmoid or any other nonlinear function.
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # end-snippet-2 start-snippet-3
        # L1 norm; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )

        # square of L2 norm; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )
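
        # Note that both penalties are computed over the weight matrices
        # only; the bias vectors are left unregularized. These terms are
        # scaled by L1_reg and L2_reg and added to the training cost in
        # test_mlp below.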

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers
        # it is made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input

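
# A minimal usage sketch (illustration only): build an MLP on symbolic input
# and evaluate its regularized cost on a handful of random examples. This
# assumes, as in the Theano tutorial, that LogisticRegression exposes a
# negative_log_likelihood method taking a vector of integer labels.
def _mlp_sketch():
    rng = numpy.random.RandomState(1234)
    x = T.matrix('x')
    y = T.ivector('y')
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=50, n_out=10)
    cost = (classifier.negative_log_likelihood(y)
            + 0.0001 * classifier.L2_sqr)
    f = theano.function(inputs=[x, y], outputs=cost)
    data = numpy.random.rand(4, 28 * 28).astype(theano.config.floatX)
    labels = numpy.asarray([0, 1, 2, 3], dtype='int32')
    return f(data, labels)
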

def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
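    # (Python 2 integer division: with the standard MNIST split of 50,000
    # training examples and batch_size=20 this gives 2,500 training
    # minibatches; any leftover examples that do not fill a complete
    # minibatch are silently dropped.)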

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of
                         # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); the cost is
    # expressed here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
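    # With the default arguments (L1_reg=0.0, L2_reg=0.0001) this reduces to
    # the negative log likelihood plus 0.0001 * L2_sqr; the L1 term is
    # effectively switched off.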
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]
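    # For example (illustration only), since classifier.params is
    # [hidden W, hidden b, logistic W, logistic b], the first entry of
    # `updates` is (W_hidden, W_hidden - learning_rate * dcost/dW_hidden),
    # i.e. one plain gradient-descent step per parameter.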

    # compiling a Theano function `train_model` that returns the cost, and
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
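    # (Concretely, with 2,500 training minibatches and patience=10000,
    # validation_frequency = min(2500, 5000) = 2500, i.e. the model is
    # validated once per epoch.)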

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    test_mlp()
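    # Hyperparameters can be overridden here if desired, e.g. (illustration
    # only) test_mlp(learning_rate=0.05, n_hidden=256, n_epochs=100).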


# Rectified Linear Unit
# Source: http://stackoverflow.com/questions/26497564/theano-hiddenlayer-activation-function
def relu(x):
    return T.maximum(0., x)
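

# To try this nonlinearity (illustration only), pass it as the activation
# when building a hidden layer, e.g.
#     HiddenLayer(rng=rng, input=x, n_in=784, n_out=500, activation=relu)
# Note that the Xavier-style initialization above was derived for tanh/sigmoid
# units, so for relu it is only a heuristic choice.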