p@24: """ p@24: This tutorial introduces logistic regression using Theano and stochastic p@24: gradient descent. p@24: p@24: Logistic regression is a probabilistic, linear classifier. It is parametrized p@24: by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is p@24: done by projecting data points onto a set of hyperplanes, the distance to p@24: which is used to determine a class membership probability. p@24: p@24: Mathematically, this can be written as: p@24: p@24: .. math:: p@24: P(Y=i|x, W,b) &= softmax_i(W x + b) \\ p@24: &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} p@24: p@24: p@24: The output of the model or prediction is then done by taking the argmax of p@24: the vector whose i'th element is P(Y=i|x). p@24: p@24: .. math:: p@24: p@24: y_{pred} = argmax_i P(Y=i|x,W,b) p@24: p@24: p@24: This tutorial presents a stochastic gradient descent optimization method p@24: suitable for large datasets. p@24: p@24: p@24: References: p@24: p@24: - textbooks: "Pattern Recognition and Machine Learning" - p@24: Christopher M. Bishop, section 4.3.2 p@24: p@24: """ p@24: __docformat__ = 'restructedtext en' p@24: p@24: import cPickle p@24: import gzip p@24: import os p@24: import sys p@24: import timeit p@24: p@24: import numpy p@24: p@24: import theano p@24: import theano.tensor as T p@24: p@24: p@24: class LogisticRegression(object): p@24: """Multi-class Logistic Regression Class p@24: p@24: The logistic regression is fully described by a weight matrix :math:`W` p@24: and bias vector :math:`b`. Classification is done by projecting data p@24: points onto a set of hyperplanes, the distance to which is used to p@24: determine a class membership probability. p@24: """ p@24: p@24: def __init__(self, input, n_in, n_out): p@24: """ Initialize the parameters of the logistic regression p@24: p@24: :type input: theano.tensor.TensorType p@24: :param input: symbolic variable that describes the input of the p@24: architecture (one minibatch) p@24: p@24: :type n_in: int p@24: :param n_in: number of input units, the dimension of the space in p@24: which the datapoints lie p@24: p@24: :type n_out: int p@24: :param n_out: number of output units, the dimension of the space in p@24: which the labels lie p@24: p@24: """ p@24: # start-snippet-1 p@24: # initialize with 0 the weights W as a matrix of shape (n_in, n_out) p@24: self.W = theano.shared( p@24: value=numpy.zeros( p@24: (n_in, n_out), p@24: dtype=theano.config.floatX p@24: ), p@24: name='W', p@24: borrow=True p@24: ) p@24: # initialize the baises b as a vector of n_out 0s p@24: self.b = theano.shared( p@24: value=numpy.zeros( p@24: (n_out,), p@24: dtype=theano.config.floatX p@24: ), p@24: name='b', p@24: borrow=True p@24: ) p@24: p@24: # symbolic expression for computing the matrix of class-membership p@24: # probabilities p@24: # Where: p@24: # W is a matrix where column-k represent the separation hyperplane for p@24: # class-k p@24: # x is a matrix where row-j represents input training sample-j p@24: # b is a vector where element-k represent the free parameter of p@24: # hyperplane-k p@24: self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) p@24: p@24: # symbolic description of how to compute prediction as class whose p@24: # probability is maximal p@24: self.y_pred = T.argmax(self.p_y_given_x, axis=1) p@24: # end-snippet-1 p@24: p@24: # parameters of the model p@24: self.params = [self.W, self.b] p@24: p@24: # keep track of model input p@24: self.input = input p@24: p@24: def negative_log_likelihood(self, y): p@24: 
"""Return the mean of the negative log-likelihood of the prediction p@24: of this model under a given target distribution. p@24: p@24: .. math:: p@24: p@24: \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = p@24: \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} p@24: \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ p@24: \ell (\theta=\{W,b\}, \mathcal{D}) p@24: p@24: :type y: theano.tensor.TensorType p@24: :param y: corresponds to a vector that gives for each example the p@24: correct label p@24: p@24: Note: we use the mean instead of the sum so that p@24: the learning rate is less dependent on the batch size p@24: """ p@24: # start-snippet-2 p@24: # y.shape[0] is (symbolically) the number of rows in y, i.e., p@24: # number of examples (call it n) in the minibatch p@24: # T.arange(y.shape[0]) is a symbolic vector which will contain p@24: # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of p@24: # Log-Probabilities (call it LP) with one row per example and p@24: # one column per class LP[T.arange(y.shape[0]),y] is a vector p@24: # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., p@24: # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is p@24: # the mean (across minibatch examples) of the elements in v, p@24: # i.e., the mean log-likelihood across the minibatch. p@24: return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) p@24: # end-snippet-2 p@24: p@24: def errors(self, y): p@24: """Return a float representing the number of errors in the minibatch p@24: over the total number of examples of the minibatch ; zero one p@24: loss over the size of the minibatch p@24: p@24: :type y: theano.tensor.TensorType p@24: :param y: corresponds to a vector that gives for each example the p@24: correct label p@24: """ p@24: p@24: # check if y has same dimension of y_pred p@24: if y.ndim != self.y_pred.ndim: p@24: raise TypeError( p@24: 'y should have the same shape as self.y_pred', p@24: ('y', y.type, 'y_pred', self.y_pred.type) p@24: ) p@24: # check if y is of the correct datatype p@24: if y.dtype.startswith('int'): p@24: # the T.neq operator returns a vector of 0s and 1s, where 1 p@24: # represents a mistake in prediction p@24: return T.mean(T.neq(self.y_pred, y)) p@24: else: p@24: raise NotImplementedError() p@24: p@24: p@24: def load_data(dataset): p@24: ''' Loads the dataset p@24: p@24: :type dataset: string p@24: :param dataset: the path to the dataset (here MNIST) p@24: ''' p@24: ############# p@24: # LOAD DATA # p@24: ############# p@24: ''' p@24: # Download the MNIST dataset if it is not present p@24: data_dir, data_file = os.path.split(dataset) p@24: if data_dir == "" and not os.path.isfile(dataset): p@24: # Check if dataset is in the data directory. p@24: new_path = os.path.join( p@24: os.path.split(__file__)[0], p@24: "..", p@24: "data", p@24: dataset p@24: ) p@24: if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': p@24: dataset = new_path p@24: p@24: if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': p@24: import urllib p@24: origin = ( p@24: 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' p@24: ) p@24: print 'Downloading data from %s' % origin p@24: urllib.urlretrieve(origin, dataset) p@24: p@24: print '... 
def load_data(dataset):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    '''

    #############
    # LOAD DATA #
    #############

    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            "..",
            "data",
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print 'Downloading data from %s' % origin
        urllib.urlretrieve(origin, dataset)

    print '... loading data'

    # Load the dataset
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    # train_set, valid_set, test_set format: tuple(input, target)
    # input is a numpy.ndarray of 2 dimensions (a matrix) in which each row
    # corresponds to an example. target is a numpy.ndarray of 1 dimension
    # (a vector) that has the same length as the number of rows in the
    # input. It gives the target label for the example with the same index
    # in the input.

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats,
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as indices, and if they are
        # floats it doesn't make sense), therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue.
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
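

# Illustrative usage of `load_data` (a sketch; it assumes `mnist.pkl.gz` is
# present locally or can be downloaded by the function above).  The returned
# entries are Theano shared variables, so the raw NumPy arrays are obtained
# with `get_value()`, and the int-cast labels can be inspected with `eval()`.
def _load_data_demo():
    datasets = load_data('mnist.pkl.gz')
    train_set_x, train_set_y = datasets[0]
    # the images are stored as an (n_examples, 784) floatX matrix
    print train_set_x.get_value(borrow=True).shape
    # labels come back as a symbolic int32 cast; evaluate to inspect them
    print train_set_y.eval()[:10]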
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
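    # Worked example of the numbers above (illustrative; they change with
    # batch_size and dataset size): with batch_size=600, patience=5000 and
    # the standard MNIST split of 50,000 training examples, n_train_batches
    # is 50000 // 600 = 83 and validation_frequency = min(83, 5000 / 2) = 83,
    # so the validation set is evaluated once per epoch.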
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        cPickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code ran for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))


def predict():
    """
    An example of how to load a trained model and use it
    to predict labels.
    """

    # load the saved model
    classifier = cPickle.load(open('best_model.pkl', 'rb'))

    # compile a predictor function
    predict_model = theano.function(
        inputs=[classifier.input],
        outputs=classifier.y_pred)

    # We can test it on some examples from the test set
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x[:10])
    print "Predicted values for the first 10 examples in test set:"
    print predicted_values


if __name__ == '__main__':
    sgd_optimization_mnist()
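
# Example usage (a sketch; `logistic_sgd.py` is an assumed file name for this
# module, and `mnist.pkl.gz` must be available locally or downloadable):
#
#   $ python logistic_sgd.py        # trains the model and saves best_model.pkl
#
#   >>> from logistic_sgd import predict
#   >>> predict()                   # prints y_pred for 10 held-out test images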