hybrid-music-recommender-using-content-based-and-social-information (Mercurial repository)

comparison: Code/genre_classification/learning/logistic_sgd.py @ 24:68a62ca32441
summary:    Organized python scripts
author:     Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date:       Sat, 15 Aug 2015 19:16:17 +0100
comparing revisions 23:45e6f85d0ba4 and 24:68a62ca32441
"""
This tutorial introduces logistic regression using Theano and stochastic
gradient descent.

Logistic regression is a probabilistic, linear classifier. It is parametrized
by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
done by projecting data points onto a set of hyperplanes, the distance to
which is used to determine a class membership probability.

Mathematically, this can be written as:

.. math::
  P(Y=i|x, W,b) &= softmax_i(W x + b) \\
                &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}


The model's prediction is the class whose probability is maximal, i.e. the
argmax of the vector whose i'th element is P(Y=i|x).

.. math::

  y_{pred} = argmax_i P(Y=i|x,W,b)


This tutorial presents a stochastic gradient descent optimization method
suitable for large datasets.


References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 4.3.2

"""
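# A minimal numeric sketch of the two formulas above (illustrative only; the
# names and shapes here are not part of this script). With plain NumPy, for a
# single example x of shape (n_in,), W of shape (n_in, n_out) and b (n_out,):
#
#     scores = np.dot(x, W) + b                    # one score per class
#     p = np.exp(scores) / np.exp(scores).sum()    # softmax_i(W x + b)
#     y_pred = np.argmax(p)                        # predicted class label
#
# The Theano graph built below performs the same computation, but symbolically
# and for a whole minibatch at once.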
__docformat__ = 'restructuredtext en'

import cPickle
import gzip
import os
import sys
import timeit

import numpy

import theano
import theano.tensor as T


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """
        # start-snippet-1
        # initialize the weights W as a matrix of zeros, shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        # symbolic expression for computing the matrix of class-membership
        # probabilities
        # Where:
        # W is a matrix where column-k represents the separation hyperplane
        # for class-k
        # x is a matrix where row-j represents input training sample-j
        # b is a vector where element-k represents the free parameter of
        # hyperplane-k
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
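        # Shape check (a sketch, not part of the original comments): `input`
        # is (batch_size, n_in), self.W is (n_in, n_out) and self.b is
        # (n_out,), so T.dot(input, self.W) + self.b has shape
        # (batch_size, n_out). T.nnet.softmax normalises each row, so every
        # row of self.p_y_given_x sums to 1 and entry [j, k] is P(Y=k | x_j).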

        # symbolic description of how to compute prediction as class whose
        # probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        # end-snippet-1

        # parameters of the model
        self.params = [self.W, self.b]

        # keep track of model input
        self.input = input
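
        # Typical construction (this mirrors the usage in
        # sgd_optimization_mnist below; shown here only as a sketch):
        #
        #     x = T.matrix('x')
        #     classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
        #
        # `classifier.p_y_given_x` and `classifier.y_pred` are then symbolic
        # expressions that depend on `x`.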

    def negative_log_likelihood(self, y):
        """Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::

            \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                    \log(P(Y=y^{(i)}|x^{(i)}, W, b)) \\
            \ell (\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L} (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        # start-snippet-2
        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
        # the number of examples (call it n) in the minibatch.
        # T.arange(y.shape[0]) is a symbolic vector which will contain
        # [0, 1, 2, ..., n-1]. T.log(self.p_y_given_x) is a matrix of
        # log-probabilities (call it LP) with one row per example and
        # one column per class. LP[T.arange(y.shape[0]), y] is a vector
        # v containing [LP[0, y[0]], LP[1, y[1]], LP[2, y[2]], ...,
        # LP[n-1, y[n-1]]], and T.mean(LP[T.arange(y.shape[0]), y]) is
        # the mean (across minibatch examples) of the elements in v,
        # i.e., the mean log-likelihood across the minibatch.
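        # Concrete (non-symbolic) illustration of the indexing trick above,
        # with made-up numbers (a sketch only): if n = 2, y = [2, 0] and
        #
        #     LP = [[-2.3, -1.6, -0.4],
        #           [-0.1, -3.0, -4.2]]
        #
        # then LP[[0, 1], [2, 0]] = [-0.4, -0.1], whose mean is -0.25, and
        # the returned loss is 0.25.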
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
        # end-snippet-2

    def errors(self, y):
        """Return a float representing the number of errors in the minibatch
        over the total number of examples in the minibatch, i.e. the zero-one
        loss averaged over the size of the minibatch.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
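            # For example (a sketch): if y_pred = [3, 7, 2] and y = [3, 1, 2],
            # T.neq gives [0, 1, 0] and the mean error is 1/3.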
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()


def load_data(dataset):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    '''
    #############
    # LOAD DATA #
    #############
    '''
    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print 'Downloading data from %s' % origin
        urllib.urlretrieve(origin, dataset)

    print '... loading data'

    # Load the dataset
    f = gzip.open(dataset, 'rb')
    '''
    f = open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    # train_set, valid_set, test_set format: tuple(input, target)
    # input is a numpy.ndarray of 2 dimensions (a matrix) in which each row
    # corresponds to an example. target is a numpy.ndarray of 1 dimension
    # (a vector) that has the same length as the number of rows in the
    # input. It gives the target for the example with the same index in
    # the input.

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats,
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as indices, and if they were
        # floats that wouldn't make sense), therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval


def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
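    # For the standard MNIST pickle (50 000 / 10 000 / 10 000 examples) and
    # the default batch_size of 600, this gives 83 training batches and 16
    # validation / test batches (integer division discards the remainder);
    # with a different dataset the counts will of course differ.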

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]
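    # In equations (a restatement of the two pairs above, not extra logic),
    # each call of the training function built below performs one SGD step:
    #
    #     W <- W - learning_rate * dcost/dW
    #     b <- b - learning_rate * dcost/db
    #
    # where cost is the minibatch negative log-likelihood.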

    # compiling a Theano function `train_model` that returns the cost, and at
    # the same time updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3
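    # Usage sketch (hypothetical index): train_model(3) evaluates the cost on
    # the fourth training minibatch and applies the SGD updates to W and b as
    # a side effect; validate_model(i) and test_model(i) only report the
    # zero-one error on their respective minibatches and modify nothing.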

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
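    # Concretely (assuming the MNIST numbers above): patience / 2 = 2500 and
    # n_train_batches = 83, so validation_frequency = 83, i.e. the validation
    # error is measured once per pass over the training set.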

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        cPickle.dump(classifier, f)
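                    # Note (an observation, not original code): cPickle
                    # stores the whole LogisticRegression instance, so
                    # unpickling it in predict() below only works where this
                    # class definition is importable under the same module
                    # path as when it was saved.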

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            ' with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code ran for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))


def predict():
    """
    An example of how to load a trained model and use it
    to predict labels.
    """

    # load the saved model
    classifier = cPickle.load(open('best_model.pkl', 'rb'))

    # compile a predictor function
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred)

    # We can test it on some examples from the test set
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x[:10])
    print("Predicted values for the first 10 examples in test set:")
    print predicted_values
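
    # To compare against the ground truth (a sketch; test_set_y is the
    # symbolic int32 cast returned by shared_dataset, so it has to be
    # evaluated first):
    #
    #     print test_set_y.eval()[:10]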


if __name__ == '__main__':
    sgd_optimization_mnist()
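    # Once training has written best_model.pkl, the saved classifier could be
    # exercised with the helper above, e.g. (a sketch, not run by default):
    # predict()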