Repository: hybrid-music-recommender-using-content-based-and-social-information (Mercurial)
File: Code/genre_classification/classification/mlp.py @ 24:68a62ca32441 (comparison with parent 23:45e6f85d0ba4)
Commit message: Organized python scripts
Author: Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
Date: Sat, 15 Aug 2015 19:16:17 +0100
1 """ | |
2 This tutorial introduces the multilayer perceptron using Theano. | |
3 | |
4 A multilayer perceptron is a logistic regressor where | |
5 instead of feeding the input to the logistic regression you insert a | |
6 intermediate layer, called the hidden layer, that has a nonlinear | |
7 activation function (usually tanh or sigmoid) . One can use many such | |
8 hidden layers making the architecture deep. The tutorial will also tackle | |
9 the problem of MNIST digit classification. | |
10 | |
11 .. math:: | |
12 | |
13 f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), | |
14 | |
15 References: | |
16 | |
17 - textbooks: "Pattern Recognition and Machine Learning" - | |
18 Christopher M. Bishop, section 5 | |
19 | |
20 """ | |
__docformat__ = 'restructuredtext en'


import os
import sys
import timeit

import numpy

import theano
import theano.tensor as T


from logistic_sgd import LogisticRegression, load_data

# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully-connected and have
        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
        and the bias vector b is of shape (n_out,).

        NOTE: The nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input,W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: non-linearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6./(n_in+n_out)) to sqrt(6./(n_in+n_out))
        # for the tanh activation function
        # the output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU
        # Note: optimal initialization of weights is dependent on the
        # activation function used (among other things).
        # For example, results presented in [Xavier10] suggest that you
        # should use 4 times larger initial weights for sigmoid
        # compared to tanh
        # We have no info for other functions, so we use the same as
        # tanh.
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]

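# A minimal usage sketch (not part of the original tutorial code): it builds
# a single HiddenLayer on a symbolic minibatch, mirroring the docstring
# formula tanh(dot(input, W) + b).  The sizes below are illustrative only.
def _example_hidden_layer():
    """Illustration: construct one HiddenLayer on a symbolic input matrix."""
    rng = numpy.random.RandomState(0)
    x = T.matrix('x')  # minibatch of examples, one example per row
    layer = HiddenLayer(rng=rng, input=x, n_in=784, n_out=500,
                        activation=T.tanh)
    # layer.output is the symbolic expression tanh(T.dot(x, layer.W) + layer.b)
    return layer

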
# start-snippet-2
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units with nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation (defined here by a ``HiddenLayer`` class), while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # Since we are dealing with a one-hidden-layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # end-snippet-2 start-snippet-3
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers it
        # is made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input

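
# The module docstring points out that several hidden layers can be stacked
# to make the architecture deep.  A minimal sketch of that idea (not used by
# test_mlp below; the chaining of h1.output into h2 is the only point):
def _example_deep_mlp(rng, input, n_in=28 * 28, n_hidden=500, n_out=10):
    """Illustration: two HiddenLayers feeding a LogisticRegression layer."""
    h1 = HiddenLayer(rng=rng, input=input, n_in=n_in, n_out=n_hidden,
                     activation=T.tanh)
    h2 = HiddenLayer(rng=rng, input=h1.output, n_in=n_hidden, n_out=n_hidden,
                     activation=T.tanh)
    top = LogisticRegression(input=h2.output, n_in=n_hidden, n_out=n_out)
    # parameters of all three layers, in the same spirit as MLP.params
    params = h1.params + h2.params + top.params
    return top, params
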

def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
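    # Worked example of the value above (an assumption, based on the standard
    # mnist.pkl.gz split of 50,000 training examples): with batch_size=20
    # there are 50000 / 20 = 2500 training minibatches, so
    # validation_frequency = min(2500, 10000 / 2) = 2500, i.e. validation
    # runs once per epoch.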

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break


    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    test_mlp()

# Rectified Linear Unit
# Source: http://stackoverflow.com/questions/26497564/theano-hiddenlayer-activation-function
def relu(x):
    return T.maximum(0., x)
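

# A minimal sketch (an addition, not from the original script) of how relu
# could be passed as the hidden-layer nonlinearity.  Note that the weight
# initialisation in HiddenLayer was tuned for tanh/sigmoid, so reusing it
# unchanged with relu is an assumption rather than a recommendation.
def _example_relu_hidden_layer():
    """Illustration: a HiddenLayer that uses relu instead of tanh."""
    rng = numpy.random.RandomState(1234)
    x = T.matrix('x')
    return HiddenLayer(rng=rng, input=x, n_in=28 * 28, n_out=500,
                       activation=relu)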