comparison Code/genre_classification/learning/logistic_sgd.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
1 """
2 This tutorial introduces logistic regression using Theano and stochastic
3 gradient descent.
4
5 Logistic regression is a probabilistic, linear classifier. It is parametrized
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 done by projecting data points onto a set of hyperplanes, the distance to
8 which is used to determine a class membership probability.
9
10 Mathematically, this can be written as:
11
12 .. math::
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15
16
17 The model's prediction is then made by taking the argmax of
18 the vector whose i-th element is P(Y=i|x).
19
20 .. math::
21
22 y_{pred} = argmax_i P(Y=i|x,W,b)
23
24
25 This tutorial presents a stochastic gradient descent optimization method
26 suitable for large datasets.
27
28
29 References:
30
31 - textbooks: "Pattern Recognition and Machine Learning" -
32 Christopher M. Bishop, section 4.3.2
33
34 """
35 __docformat__ = 'restructuredtext en'
36
37 import cPickle
38 import gzip
39 import os
40 import sys
41 import timeit
42
43 import numpy
44
45 import theano
46 import theano.tensor as T
47
48
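# Illustrative sketch: a minimal numpy version of the prediction rule from the
# module docstring, P(Y=i|x, W, b) = softmax_i(W x + b) followed by an argmax.
# The helper name is hypothetical and nothing in this script calls it.
def softmax_prediction_sketch(x, W, b):
    """Return (predicted class, class probabilities) for one example x."""
    scores = numpy.dot(x, W) + b          # class scores, shape (n_out,)
    scores = scores - scores.max()        # shift by the max for numerical stability
    exp_scores = numpy.exp(scores)
    p_y_given_x = exp_scores / exp_scores.sum()
    return numpy.argmax(p_y_given_x), p_y_given_x
# For a minibatch the same computation is applied row-wise, which is what
# T.nnet.softmax(T.dot(input, self.W) + self.b) expresses symbolically below.
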
49 class LogisticRegression(object):
50 """Multi-class Logistic Regression Class
51
52 The logistic regression is fully described by a weight matrix :math:`W`
53 and bias vector :math:`b`. Classification is done by projecting data
54 points onto a set of hyperplanes, the distance to which is used to
55 determine a class membership probability.
56 """
57
58 def __init__(self, input, n_in, n_out):
59 """ Initialize the parameters of the logistic regression
60
61 :type input: theano.tensor.TensorType
62 :param input: symbolic variable that describes the input of the
63 architecture (one minibatch)
64
65 :type n_in: int
66 :param n_in: number of input units, the dimension of the space in
67 which the datapoints lie
68
69 :type n_out: int
70 :param n_out: number of output units, the dimension of the space in
71 which the labels lie
72
73 """
74 # start-snippet-1
75 # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
76 self.W = theano.shared(
77 value=numpy.zeros(
78 (n_in, n_out),
79 dtype=theano.config.floatX
80 ),
81 name='W',
82 borrow=True
83 )
84 # initialize the biases b as a vector of n_out 0s
85 self.b = theano.shared(
86 value=numpy.zeros(
87 (n_out,),
88 dtype=theano.config.floatX
89 ),
90 name='b',
91 borrow=True
92 )
93
94 # symbolic expression for computing the matrix of class-membership
95 # probabilities
96 # Where:
97 # W is a matrix where column-k represents the separation hyperplane for
98 # class-k
99 # x is a matrix where row-j represents input training sample-j
100 # b is a vector where element-k represents the free parameter of
101 # hyperplane-k
102 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
103
104 # symbolic description of how to compute prediction as class whose
105 # probability is maximal
106 self.y_pred = T.argmax(self.p_y_given_x, axis=1)
107 # end-snippet-1
108
109 # parameters of the model
110 self.params = [self.W, self.b]
111
112 # keep track of model input
113 self.input = input
114
115 def negative_log_likelihood(self, y):
116 """Return the mean of the negative log-likelihood of the prediction
117 of this model under a given target distribution.
118
119 .. math::
120
121 \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
122 \sum_{i=0}^{|\mathcal{D}|-1}
123 \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
124 \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D})
125
126 :type y: theano.tensor.TensorType
127 :param y: corresponds to a vector that gives for each example the
128 correct label
129
130 Note: we use the mean instead of the sum so that
131 the learning rate is less dependent on the batch size
132 """
133 # start-snippet-2
134 # y.shape[0] is (symbolically) the number of rows in y, i.e.,
135 # number of examples (call it n) in the minibatch
136 # T.arange(y.shape[0]) is a symbolic vector which will contain
137 # [0,1,2,... n-1]. T.log(self.p_y_given_x) is a matrix of
138 # log-probabilities (call it LP) with one row per example and
139 # one column per class. LP[T.arange(y.shape[0]),y] is a vector
140 # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
141 # LP[n-1,y[n-1]]], and T.mean(LP[T.arange(y.shape[0]),y]) is
142 # the mean (across minibatch examples) of the elements in v,
143 # i.e., the mean log-likelihood across the minibatch.
144 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
145 # end-snippet-2
146
147 def errors(self, y):
148 """Return a float representing the number of errors in the minibatch
149 over the total number of examples of the minibatch ; zero one
150 loss over the size of the minibatch
151
152 :type y: theano.tensor.TensorType
153 :param y: corresponds to a vector that gives for each example the
154 correct label
155 """
156
157 # check if y has same dimension of y_pred
158 if y.ndim != self.y_pred.ndim:
159 raise TypeError(
160 'y should have the same shape as self.y_pred',
161 ('y', y.type, 'y_pred', self.y_pred.type)
162 )
163 # check if y is of the correct datatype
164 if y.dtype.startswith('int'):
165 # the T.neq operator returns a vector of 0s and 1s, where 1
166 # represents a mistake in prediction
167 return T.mean(T.neq(self.y_pred, y))
168 else:
169 raise NotImplementedError()
170
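# Illustrative sketch: the indexing trick used in negative_log_likelihood,
# shown on plain numpy arrays. LP[arange(n), y] picks, for each row i, the
# log-probability of the correct class y[i]; the loss is minus its mean.
# The helper name is hypothetical and nothing in this script calls it.
def mean_nll_sketch(p_y_given_x, y):
    """p_y_given_x: (n, n_classes) probabilities; y: (n,) integer labels."""
    LP = numpy.log(p_y_given_x)                   # log-probabilities, one row per example
    picked = LP[numpy.arange(y.shape[0]), y]      # LP[i, y[i]] for every example i
    return -picked.mean()                         # mean negative log-likelihood
# e.g. mean_nll_sketch(numpy.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]]),
#                      numpy.array([0, 1])) == -(log(0.7) + log(0.8)) / 2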
171
172 def load_data(dataset):
173 ''' Loads the dataset
174
175 :type dataset: string
176 :param dataset: the path to the pickled dataset
177 '''
178 #############
179 # LOAD DATA #
180 #############
181 '''
182 # Download the MNIST dataset if it is not present
183 data_dir, data_file = os.path.split(dataset)
184 if data_dir == "" and not os.path.isfile(dataset):
185 # Check if dataset is in the data directory.
186 new_path = os.path.join(
187 os.path.split(__file__)[0],
188 "..",
189 "data",
190 dataset
191 )
192 if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
193 dataset = new_path
194
195 if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
196 import urllib
197 origin = (
198 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
199 )
200 print 'Downloading data from %s' % origin
201 urllib.urlretrieve(origin, dataset)
202
203 print '... loading data'
204
205 # Load the dataset
206 f = gzip.open(dataset, 'rb')
207 '''
208 f = open(dataset, 'rb')
209 train_set, valid_set, test_set = cPickle.load(f)
210 f.close()
211 # train_set, valid_set, test_set format: tuple(input, target)
212 # input is a numpy.ndarray of 2 dimensions (a matrix) in which each
213 # row corresponds to an example. target is a numpy.ndarray of
214 # 1 dimension (a vector) that has the same length as the number of
215 # rows in the input. It gives the target for the example with the
216 # same index in the input.
217
218 def shared_dataset(data_xy, borrow=True):
219 """ Function that loads the dataset into shared variables
220
221 The reason we store our dataset in shared variables is to allow
222 Theano to copy it into the GPU memory (when code is run on GPU).
223 Since copying data into the GPU is slow, copying a minibatch every time
224 one is needed (the default behaviour if the data is not in a shared
225 variable) would lead to a large decrease in performance.
226 """
227 data_x, data_y = data_xy
228 shared_x = theano.shared(numpy.asarray(data_x,
229 dtype=theano.config.floatX),
230 borrow=borrow)
231 shared_y = theano.shared(numpy.asarray(data_y,
232 dtype=theano.config.floatX),
233 borrow=borrow)
234 # When storing data on the GPU it has to be stored as floats
235 # therefore we will store the labels as ``floatX`` as well
236 # (``shared_y`` does exactly that). But during our computations
237 # we need them as ints (we use labels as indices, and if they are
238 # floats it doesn't make sense); therefore, instead of returning
239 # ``shared_y`` we will have to cast it to int. This little hack
240 # lets us get around this issue.
241 return shared_x, T.cast(shared_y, 'int32')
242
243 test_set_x, test_set_y = shared_dataset(test_set)
244 valid_set_x, valid_set_y = shared_dataset(valid_set)
245 train_set_x, train_set_y = shared_dataset(train_set)
246
247 rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
248 (test_set_x, test_set_y)]
249 return rval
250
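# Illustrative sketch: what shared_dataset does for the labels, as a standalone
# snippet. Values are stored on the device as floatX (a GPU requirement) and
# the graph is handed an int32 view via T.cast. The helper name is hypothetical
# and nothing in this script calls it.
def shared_labels_sketch(labels):
    shared_y = theano.shared(numpy.asarray(labels, dtype=theano.config.floatX),
                             borrow=True)
    # shared_y stores floats; the cast yields a symbolic int32 variable usable
    # as labels/indices, while the underlying storage stays floatX.
    return shared_y, T.cast(shared_y, 'int32')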
251
252 def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
253 dataset='mnist.pkl.gz',
254 batch_size=600):
255 """
256 Demonstrate stochastic gradient descent optimization of a log-linear
257 model
258
259 This is demonstrated on MNIST.
260
261 :type learning_rate: float
262 :param learning_rate: learning rate used (factor for the stochastic
263 gradient)
264
265 :type n_epochs: int
266 :param n_epochs: maximal number of epochs to run the optimizer
267
268 :type dataset: string
269 :param dataset: the path of the MNIST dataset file from
270 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
271
272 """
273 datasets = load_data(dataset)
274
275 train_set_x, train_set_y = datasets[0]
276 valid_set_x, valid_set_y = datasets[1]
277 test_set_x, test_set_y = datasets[2]
278
279 # compute number of minibatches for training, validation and testing
280 n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
281 n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
282 n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
283
284 ######################
285 # BUILD ACTUAL MODEL #
286 ######################
287 print '... building the model'
288
289 # allocate symbolic variables for the data
290 index = T.lscalar() # index to a [mini]batch
291
292 # generate symbolic variables for input (x and y represent a
293 # minibatch)
294 x = T.matrix('x') # data, presented as rasterized images
295 y = T.ivector('y') # labels, presented as 1D vector of [int] labels
296
297 # construct the logistic regression class
298 # Each MNIST image has size 28*28
299 classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
300
301 # the cost we minimize during training is the negative log likelihood of
302 # the model in symbolic format
303 cost = classifier.negative_log_likelihood(y)
304
305 # compiling a Theano function that computes the mistakes that are made by
306 # the model on a minibatch
307 test_model = theano.function(
308 inputs=[index],
309 outputs=classifier.errors(y),
310 givens={
311 x: test_set_x[index * batch_size: (index + 1) * batch_size],
312 y: test_set_y[index * batch_size: (index + 1) * batch_size]
313 }
314 )
315
316 validate_model = theano.function(
317 inputs=[index],
318 outputs=classifier.errors(y),
319 givens={
320 x: valid_set_x[index * batch_size: (index + 1) * batch_size],
321 y: valid_set_y[index * batch_size: (index + 1) * batch_size]
322 }
323 )
324
325 # compute the gradient of cost with respect to theta = (W,b)
326 g_W = T.grad(cost=cost, wrt=classifier.W)
327 g_b = T.grad(cost=cost, wrt=classifier.b)
328
329 # start-snippet-3
330 # specify how to update the parameters of the model as a list of
331 # (variable, update expression) pairs.
332 updates = [(classifier.W, classifier.W - learning_rate * g_W),
333 (classifier.b, classifier.b - learning_rate * g_b)]
334
335 # compiling a Theano function `train_model` that returns the cost but, at
336 # the same time, updates the parameters of the model based on the rules
337 # defined in `updates`
338 train_model = theano.function(
339 inputs=[index],
340 outputs=cost,
341 updates=updates,
342 givens={
343 x: train_set_x[index * batch_size: (index + 1) * batch_size],
344 y: train_set_y[index * batch_size: (index + 1) * batch_size]
345 }
346 )
347 # end-snippet-3
348
349 ###############
350 # TRAIN MODEL #
351 ###############
352 print '... training the model'
353 # early-stopping parameters
354 patience = 5000 # look at this many examples regardless
355 patience_increase = 2 # wait this much longer when a new best is
356 # found
357 improvement_threshold = 0.995 # a relative improvement of this much is
358 # considered significant
359 validation_frequency = min(n_train_batches, patience / 2)
360 # go through this many
361 # minibatches before checking the network
362 # on the validation set; in this case we
363 # check every epoch
364
365 best_validation_loss = numpy.inf
366 test_score = 0.
367 start_time = timeit.default_timer()
368
369 done_looping = False
370 epoch = 0
371 while (epoch < n_epochs) and (not done_looping):
372 epoch = epoch + 1
373 for minibatch_index in xrange(n_train_batches):
374
375 minibatch_avg_cost = train_model(minibatch_index)
376 # iteration number
377 iter = (epoch - 1) * n_train_batches + minibatch_index
378
379 if (iter + 1) % validation_frequency == 0:
380 # compute zero-one loss on validation set
381 validation_losses = [validate_model(i)
382 for i in xrange(n_valid_batches)]
383 this_validation_loss = numpy.mean(validation_losses)
384
385 print(
386 'epoch %i, minibatch %i/%i, validation error %f %%' %
387 (
388 epoch,
389 minibatch_index + 1,
390 n_train_batches,
391 this_validation_loss * 100.
392 )
393 )
394
395 # if we got the best validation score until now
396 if this_validation_loss < best_validation_loss:
397 #improve patience if loss improvement is good enough
398 if this_validation_loss < best_validation_loss * \
399 improvement_threshold:
400 patience = max(patience, iter * patience_increase)
401
402 best_validation_loss = this_validation_loss
403 # test it on the test set
404
405 test_losses = [test_model(i)
406 for i in xrange(n_test_batches)]
407 test_score = numpy.mean(test_losses)
408
409 print(
410 (
411 ' epoch %i, minibatch %i/%i, test error of'
412 ' best model %f %%'
413 ) %
414 (
415 epoch,
416 minibatch_index + 1,
417 n_train_batches,
418 test_score * 100.
419 )
420 )
421
422 # save the best model
423 with open('best_model.pkl', 'wb') as f:
424 cPickle.dump(classifier, f)
425
426 if patience <= iter:
427 done_looping = True
428 break
429
430 end_time = timeit.default_timer()
431 print(
432 (
433 'Optimization complete with best validation score of %f %%,'
434 ' with test performance %f %%'
435 )
436 % (best_validation_loss * 100., test_score * 100.)
437 )
438 print 'The code ran for %d epochs, with %f epochs/sec' % (
439 epoch, 1. * epoch / (end_time - start_time))
440 print >> sys.stderr, ('The code for file ' +
441 os.path.split(__file__)[1] +
442 ' ran for %.1fs' % ((end_time - start_time)))
443
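# Illustrative sketches of the two ideas driving the training loop above,
# written as plain Python. The helper names are hypothetical and nothing in
# this script calls them.

# (1) The updates compiled into `train_model` are vanilla gradient descent:
#     each parameter moves against its gradient, scaled by the learning rate.
def sgd_step_sketch(W, b, g_W, g_b, learning_rate=0.13):
    return W - learning_rate * g_W, b - learning_rate * g_b

# (2) Early stopping: a significant improvement of the validation loss (by more
#     than improvement_threshold) extends the patience; training stops once the
#     iteration count exceeds it.
def patience_update_sketch(patience, iteration, this_loss, best_loss,
                           patience_increase=2, improvement_threshold=0.995):
    if this_loss < best_loss * improvement_threshold:
        patience = max(patience, iteration * patience_increase)
    done_looping = patience <= iteration
    return patience, done_looping
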
444
445 def predict():
446 """
447 An example of how to load a trained model and use it
448 to predict labels.
449 """
450
451 # load the saved model
452 classifier = cPickle.load(open('best_model.pkl', 'rb'))
453
454 # compile a predictor function
455 predict_model = theano.function(
456 inputs=[classifier.input],
457 outputs=classifier.y_pred)
458
459 # We can test it on some examples from the test set
460 dataset='mnist.pkl.gz'
461 datasets = load_data(dataset)
462 test_set_x, test_set_y = datasets[2]
463 test_set_x = test_set_x.get_value()
464
465 predicted_values = predict_model(test_set_x[:10])
466 print ("Predicted values for the first 10 examples in test set:")
467 print predicted_values
468
469
470 if __name__ == '__main__':
471 sgd_optimization_mnist()
472
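# Typical usage, assuming a compatible pickled dataset at the default path:
# running the script trains the model and writes best_model.pkl; predict()
# can then be called from an interactive session, e.g.
#
#     python logistic_sgd.py
#     >>> from logistic_sgd import predict
#     >>> predict()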