annotate Code/genre_classification/learning/logistic_sgd.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents 68a62ca32441
children
rev   line source
p@24 1 """
p@24 2 This tutorial introduces logistic regression using Theano and stochastic
p@24 3 gradient descent.
p@24 4
p@24 5 Logistic regression is a probabilistic, linear classifier. It is parametrized
p@24 6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
p@24 7 done by projecting data points onto a set of hyperplanes, the distance to
p@24 8 which is used to determine a class membership probability.
p@24 9
p@24 10 Mathematically, this can be written as:
p@24 11
p@24 12 .. math::
p@24 13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\
p@24 14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
p@24 15
p@24 16
p@24 17 The model's prediction is then obtained by taking the argmax of
p@24 18 the vector whose i'th element is P(Y=i|x).
p@24 19
p@24 20 .. math::
p@24 21
p@24 22 y_{pred} = argmax_i P(Y=i|x,W,b)
p@24 23
p@24 24
p@24 25 This tutorial presents a stochastic gradient descent optimization method
p@24 26 suitable for large datasets.
p@24 27
p@24 28
p@24 29 References:
p@24 30
p@24 31 - textbooks: "Pattern Recognition and Machine Learning" -
p@24 32 Christopher M. Bishop, section 4.3.2
p@24 33
p@24 34 """
p@24 35 __docformat__ = 'restructuredtext en'
p@24 36
p@24 37 import cPickle
p@24 38 import gzip
p@24 39 import os
p@24 40 import sys
p@24 41 import timeit
p@24 42
p@24 43 import numpy
p@24 44
p@24 45 import theano
p@24 46 import theano.tensor as T
p@24 47
p@24 48
p@24 49 class LogisticRegression(object):
p@24 50 """Multi-class Logistic Regression Class
p@24 51
p@24 52 The logistic regression is fully described by a weight matrix :math:`W`
p@24 53 and bias vector :math:`b`. Classification is done by projecting data
p@24 54 points onto a set of hyperplanes, the distance to which is used to
p@24 55 determine a class membership probability.
p@24 56 """
p@24 57
p@24 58 def __init__(self, input, n_in, n_out):
p@24 59 """ Initialize the parameters of the logistic regression
p@24 60
p@24 61 :type input: theano.tensor.TensorType
p@24 62 :param input: symbolic variable that describes the input of the
p@24 63 architecture (one minibatch)
p@24 64
p@24 65 :type n_in: int
p@24 66 :param n_in: number of input units, the dimension of the space in
p@24 67 which the datapoints lie
p@24 68
p@24 69 :type n_out: int
p@24 70 :param n_out: number of output units, the dimension of the space in
p@24 71 which the labels lie
p@24 72
p@24 73 """
p@24 74 # start-snippet-1
p@24 75 # initialize the weights W with zeros, as a matrix of shape (n_in, n_out)
p@24 76 self.W = theano.shared(
p@24 77 value=numpy.zeros(
p@24 78 (n_in, n_out),
p@24 79 dtype=theano.config.floatX
p@24 80 ),
p@24 81 name='W',
p@24 82 borrow=True
p@24 83 )
p@24 84 # initialize the biases b as a vector of n_out 0s
p@24 85 self.b = theano.shared(
p@24 86 value=numpy.zeros(
p@24 87 (n_out,),
p@24 88 dtype=theano.config.floatX
p@24 89 ),
p@24 90 name='b',
p@24 91 borrow=True
p@24 92 )
p@24 93
p@24 94 # symbolic expression for computing the matrix of class-membership
p@24 95 # probabilities
p@24 96 # Where:
p@24 97 # W is a matrix where column-k represents the separation hyperplane for
p@24 98 # class-k
p@24 99 # x is a matrix where row-j represents input training sample-j
p@24 100 # b is a vector where element-k represents the free parameter of
p@24 101 # hyperplane-k
p@24 102 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
p@24 103
p@24 104 # symbolic description of how to compute prediction as class whose
p@24 105 # probability is maximal
p@24 106 self.y_pred = T.argmax(self.p_y_given_x, axis=1)
p@24 107 # end-snippet-1
p@24 108
p@24 109 # parameters of the model
p@24 110 self.params = [self.W, self.b]
p@24 111
p@24 112 # keep track of model input
p@24 113 self.input = input
p@24 114
p@24 115 def negative_log_likelihood(self, y):
p@24 116 """Return the mean of the negative log-likelihood of the prediction
p@24 117 of this model under a given target distribution.
p@24 118
p@24 119 .. math::
p@24 120
p@24 121 \ell (\theta=\{W,b\}, \mathcal{D}) =
p@24 122 - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
p@24 123 - \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
p@24 124 \log(P(Y=y^{(i)}|x^{(i)}, W, b))
p@24 125
p@24 126 :type y: theano.tensor.TensorType
p@24 127 :param y: corresponds to a vector that gives for each example the
p@24 128 correct label
p@24 129
p@24 130 Note: we use the mean instead of the sum so that
p@24 131 the learning rate is less dependent on the batch size
p@24 132 """
p@24 133 # start-snippet-2
p@24 134 # y.shape[0] is (symbolically) the number of rows in y, i.e.,
p@24 135 # number of examples (call it n) in the minibatch
p@24 136 # T.arange(y.shape[0]) is a symbolic vector which will contain
p@24 137 # [0,1,2,... n-1]. T.log(self.p_y_given_x) is a matrix of
p@24 138 # Log-Probabilities (call it LP) with one row per example and
p@24 139 # one column per class. LP[T.arange(y.shape[0]),y] is a vector
p@24 140 # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
p@24 141 # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
p@24 142 # the mean (across minibatch examples) of the elements in v,
p@24 143 # i.e., the mean log-likelihood across the minibatch.
p@24 144 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
p@24 145 # end-snippet-2
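# A quick numeric check of the indexing above (illustrative, not part of the
# original comments): with n = 2 examples and labels y = [1, 0],
#   >>> LP = numpy.log([[0.2, 0.8], [0.6, 0.4]])
#   >>> -numpy.mean(LP[numpy.arange(2), [1, 0]])
# selects LP[0, 1] and LP[1, 0], giving -(log 0.8 + log 0.6) / 2, roughly 0.367.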
p@24 146
p@24 147 def errors(self, y):
p@24 148 """Return a float representing the number of errors in the minibatch
p@24 149 over the total number of examples of the minibatch ; zero one
p@24 150 loss over the size of the minibatch
p@24 151
p@24 152 :type y: theano.tensor.TensorType
p@24 153 :param y: corresponds to a vector that gives for each example the
p@24 154 correct label
p@24 155 """
p@24 156
p@24 157 # check if y has the same dimension as y_pred
p@24 158 if y.ndim != self.y_pred.ndim:
p@24 159 raise TypeError(
p@24 160 'y should have the same shape as self.y_pred',
p@24 161 ('y', y.type, 'y_pred', self.y_pred.type)
p@24 162 )
p@24 163 # check if y is of the correct datatype
p@24 164 if y.dtype.startswith('int'):
p@24 165 # the T.neq operator returns a vector of 0s and 1s, where 1
p@24 166 # represents a mistake in prediction
p@24 167 return T.mean(T.neq(self.y_pred, y))
p@24 168 else:
p@24 169 raise NotImplementedError()
p@24 170
p@24 171
p@24 172 def load_data(dataset):
p@24 173 ''' Loads the dataset
p@24 174
p@24 175 :type dataset: string
p@24 176 :param dataset: the path to the dataset (here MNIST)
p@24 177 '''
p@24 178 #############
p@24 179 # LOAD DATA #
p@24 180 #############
p@24 181 '''
p@24 182 # Download the MNIST dataset if it is not present
p@24 183 data_dir, data_file = os.path.split(dataset)
p@24 184 if data_dir == "" and not os.path.isfile(dataset):
p@24 185 # Check if dataset is in the data directory.
p@24 186 new_path = os.path.join(
p@24 187 os.path.split(__file__)[0],
p@24 188 "..",
p@24 189 "data",
p@24 190 dataset
p@24 191 )
p@24 192 if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
p@24 193 dataset = new_path
p@24 194
p@24 195 if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
p@24 196 import urllib
p@24 197 origin = (
p@24 198 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
p@24 199 )
p@24 200 print 'Downloading data from %s' % origin
p@24 201 urllib.urlretrieve(origin, dataset)
p@24 202
p@24 203 print '... loading data'
p@24 204
p@24 205 # Load the dataset
p@24 206 f = gzip.open(dataset, 'rb')
p@24 207 '''
p@24 208 f = open(dataset, 'rb')
p@24 209 train_set, valid_set, test_set = cPickle.load(f)
p@24 210 f.close()
p@24 211 # train_set, valid_set, test_set format: tuple(input, target)
p@24 212 # input is a numpy.ndarray of 2 dimensions (a matrix) in which
p@24 213 # each row corresponds to an example. target is a
p@24 214 # numpy.ndarray of 1 dimension (a vector) that has the same length as
p@24 215 # the number of rows in the input. It gives the target
p@24 216 # to the example with the same index in the input.
p@24 217
p@24 218 def shared_dataset(data_xy, borrow=True):
p@24 219 """ Function that loads the dataset into shared variables
p@24 220
p@24 221 The reason we store our dataset in shared variables is to allow
p@24 222 Theano to copy it into the GPU memory (when code is run on GPU).
p@24 223 Since copying data into the GPU is slow, copying a minibatch every time
p@24 224 it is needed (the default behaviour if the data is not in a shared
p@24 225 variable) would lead to a large decrease in performance.
p@24 226 """
p@24 227 data_x, data_y = data_xy
p@24 228 shared_x = theano.shared(numpy.asarray(data_x,
p@24 229 dtype=theano.config.floatX),
p@24 230 borrow=borrow)
p@24 231 shared_y = theano.shared(numpy.asarray(data_y,
p@24 232 dtype=theano.config.floatX),
p@24 233 borrow=borrow)
p@24 234 # When storing data on the GPU it has to be stored as floats
p@24 235 # therefore we will store the labels as ``floatX`` as well
p@24 236 # (``shared_y`` does exactly that). But during our computations
p@24 237 # we need them as ints (we use labels as index, and if they are
p@24 238 # floats it doesn't make sense) therefore instead of returning
p@24 239 # ``shared_y`` we will have to cast it to int. This little hack
p@24 240 # lets us get around this issue
p@24 241 return shared_x, T.cast(shared_y, 'int32')
p@24 242
p@24 243 test_set_x, test_set_y = shared_dataset(test_set)
p@24 244 valid_set_x, valid_set_y = shared_dataset(valid_set)
p@24 245 train_set_x, train_set_y = shared_dataset(train_set)
p@24 246
p@24 247 rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
p@24 248 (test_set_x, test_set_y)]
p@24 249 return rval
p@24 250
p@24 251
p@24 252 def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
p@24 253 dataset='mnist.pkl.gz',
p@24 254 batch_size=600):
p@24 255 """
p@24 256 Demonstrate stochastic gradient descent optimization of a log-linear
p@24 257 model
p@24 258
p@24 259 This is demonstrated on MNIST.
p@24 260
p@24 261 :type learning_rate: float
p@24 262 :param learning_rate: learning rate used (factor for the stochastic
p@24 263 gradient)
p@24 264
p@24 265 :type n_epochs: int
p@24 266 :param n_epochs: maximal number of epochs to run the optimizer
p@24 267
p@24 268 :type dataset: string
p@24 269 :param dataset: the path of the MNIST dataset file from
p@24 270 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
p@24 271
p@24 272 """
p@24 273 datasets = load_data(dataset)
p@24 274
p@24 275 train_set_x, train_set_y = datasets[0]
p@24 276 valid_set_x, valid_set_y = datasets[1]
p@24 277 test_set_x, test_set_y = datasets[2]
p@24 278
p@24 279 # compute number of minibatches for training, validation and testing
p@24 280 n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
p@24 281 n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
p@24 282 n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
p@24 283
p@24 284 ######################
p@24 285 # BUILD ACTUAL MODEL #
p@24 286 ######################
p@24 287 print '... building the model'
p@24 288
p@24 289 # allocate symbolic variables for the data
p@24 290 index = T.lscalar() # index to a [mini]batch
p@24 291
p@24 292 # generate symbolic variables for input (x and y represent a
p@24 293 # minibatch)
p@24 294 x = T.matrix('x') # data, presented as rasterized images
p@24 295 y = T.ivector('y') # labels, presented as 1D vector of [int] labels
p@24 296
p@24 297 # construct the logistic regression class
p@24 298 # Each MNIST image has size 28*28
p@24 299 classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
p@24 300
p@24 301 # the cost we minimize during training is the negative log likelihood of
p@24 302 # the model in symbolic format
p@24 303 cost = classifier.negative_log_likelihood(y)
p@24 304
p@24 305 # compiling a Theano function that computes the mistakes that are made by
p@24 306 # the model on a minibatch
p@24 307 test_model = theano.function(
p@24 308 inputs=[index],
p@24 309 outputs=classifier.errors(y),
p@24 310 givens={
p@24 311 x: test_set_x[index * batch_size: (index + 1) * batch_size],
p@24 312 y: test_set_y[index * batch_size: (index + 1) * batch_size]
p@24 313 }
p@24 314 )
p@24 315
p@24 316 validate_model = theano.function(
p@24 317 inputs=[index],
p@24 318 outputs=classifier.errors(y),
p@24 319 givens={
p@24 320 x: valid_set_x[index * batch_size: (index + 1) * batch_size],
p@24 321 y: valid_set_y[index * batch_size: (index + 1) * batch_size]
p@24 322 }
p@24 323 )
p@24 324
p@24 325 # compute the gradient of cost with respect to theta = (W,b)
p@24 326 g_W = T.grad(cost=cost, wrt=classifier.W)
p@24 327 g_b = T.grad(cost=cost, wrt=classifier.b)
p@24 328
p@24 329 # start-snippet-3
p@24 330 # specify how to update the parameters of the model as a list of
p@24 331 # (variable, update expression) pairs.
p@24 332 updates = [(classifier.W, classifier.W - learning_rate * g_W),
p@24 333 (classifier.b, classifier.b - learning_rate * g_b)]
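# (Illustrative note, not part of the original comments: each pair applies the
# plain SGD rule  param := param - learning_rate * d(cost)/d(param)  every time
# `train_model` below is called.)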
p@24 334
p@24 335 # compiling a Theano function `train_model` that returns the cost and, at
p@24 336 # the same time, updates the parameters of the model based on the rules
p@24 337 # defined in `updates`
p@24 338 train_model = theano.function(
p@24 339 inputs=[index],
p@24 340 outputs=cost,
p@24 341 updates=updates,
p@24 342 givens={
p@24 343 x: train_set_x[index * batch_size: (index + 1) * batch_size],
p@24 344 y: train_set_y[index * batch_size: (index + 1) * batch_size]
p@24 345 }
p@24 346 )
p@24 347 # end-snippet-3
p@24 348
p@24 349 ###############
p@24 350 # TRAIN MODEL #
p@24 351 ###############
p@24 352 print '... training the model'
p@24 353 # early-stopping parameters
p@24 354 patience = 5000 # look at this many examples regardless
p@24 355 patience_increase = 2 # wait this much longer when a new best is
p@24 356 # found
p@24 357 improvement_threshold = 0.995 # a relative improvement of this much is
p@24 358 # considered significant
p@24 359 validation_frequency = min(n_train_batches, patience / 2)
p@24 360 # go through this many
p@24 361 # minibatches before checking the network
p@24 362 # on the validation set; in this case we
p@24 363 # check every epoch
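# For example (illustrative numbers, not in the original comments): with the
# 50,000-example MNIST training split and batch_size=600,
# n_train_batches = 50000 / 600 = 83, so validation_frequency =
# min(83, 5000 / 2) = 83 and validation runs once per epoch.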
p@24 364
p@24 365 best_validation_loss = numpy.inf
p@24 366 test_score = 0.
p@24 367 start_time = timeit.default_timer()
p@24 368
p@24 369 done_looping = False
p@24 370 epoch = 0
p@24 371 while (epoch < n_epochs) and (not done_looping):
p@24 372 epoch = epoch + 1
p@24 373 for minibatch_index in xrange(n_train_batches):
p@24 374
p@24 375 minibatch_avg_cost = train_model(minibatch_index)
p@24 376 # iteration number
p@24 377 iter = (epoch - 1) * n_train_batches + minibatch_index
p@24 378
p@24 379 if (iter + 1) % validation_frequency == 0:
p@24 380 # compute zero-one loss on validation set
p@24 381 validation_losses = [validate_model(i)
p@24 382 for i in xrange(n_valid_batches)]
p@24 383 this_validation_loss = numpy.mean(validation_losses)
p@24 384
p@24 385 print(
p@24 386 'epoch %i, minibatch %i/%i, validation error %f %%' %
p@24 387 (
p@24 388 epoch,
p@24 389 minibatch_index + 1,
p@24 390 n_train_batches,
p@24 391 this_validation_loss * 100.
p@24 392 )
p@24 393 )
p@24 394
p@24 395 # if we got the best validation score until now
p@24 396 if this_validation_loss < best_validation_loss:
p@24 397 # improve patience if loss improvement is good enough
p@24 398 if this_validation_loss < best_validation_loss * \
p@24 399 improvement_threshold:
p@24 400 patience = max(patience, iter * patience_increase)
p@24 401
p@24 402 best_validation_loss = this_validation_loss
p@24 403 # test it on the test set
p@24 404
p@24 405 test_losses = [test_model(i)
p@24 406 for i in xrange(n_test_batches)]
p@24 407 test_score = numpy.mean(test_losses)
p@24 408
p@24 409 print(
p@24 410 (
p@24 411 ' epoch %i, minibatch %i/%i, test error of'
p@24 412 ' best model %f %%'
p@24 413 ) %
p@24 414 (
p@24 415 epoch,
p@24 416 minibatch_index + 1,
p@24 417 n_train_batches,
p@24 418 test_score * 100.
p@24 419 )
p@24 420 )
p@24 421
p@24 422 # save the best model
p@24 423 with open('best_model.pkl', 'wb') as f:
p@24 424 cPickle.dump(classifier, f)
p@24 425
p@24 426 if patience <= iter:
p@24 427 done_looping = True
p@24 428 break
p@24 429
p@24 430 end_time = timeit.default_timer()
p@24 431 print(
p@24 432 (
p@24 433 'Optimization complete with best validation score of %f %%,'
p@24 434 ' with test performance %f %%'
p@24 435 )
p@24 436 % (best_validation_loss * 100., test_score * 100.)
p@24 437 )
p@24 438 print 'The code ran for %d epochs, with %f epochs/sec' % (
p@24 439 epoch, 1. * epoch / (end_time - start_time))
p@24 440 print >> sys.stderr, ('The code for file ' +
p@24 441 os.path.split(__file__)[1] +
p@24 442 ' ran for %.1fs' % ((end_time - start_time)))
p@24 443
p@24 444
p@24 445 def predict():
p@24 446 """
p@24 447 An example of how to load a trained model and use it
p@24 448 to predict labels.
p@24 449 """
p@24 450
p@24 451 # load the saved model
p@24 452 classifier = cPickle.load(open('best_model.pkl', 'rb'))
p@24 453
p@24 454 # compile a predictor function
p@24 455 predict_model = theano.function(
p@24 456 inputs=[classifier.input],
p@24 457 outputs=classifier.y_pred)
p@24 458
p@24 459 # We can test it on some examples from the test set
p@24 460 dataset = 'mnist.pkl.gz'
p@24 461 datasets = load_data(dataset)
p@24 462 test_set_x, test_set_y = datasets[2]
p@24 463 test_set_x = test_set_x.get_value()
p@24 464
p@24 465 predicted_values = predict_model(test_set_x[:10])
p@24 466 print ("Predicted values for the first 10 examples in test set:")
p@24 467 print predicted_values
p@24 468
p@24 469
p@24 470 if __name__ == '__main__':
p@24 471 sgd_optimization_mnist()
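# Once training has written best_model.pkl, the saved classifier can be tried
# out by also calling predict() here; this call is an optional, illustrative
# addition and is not part of the original script.
# predict()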
p@24 472