p@24: """
p@24: This tutorial introduces logistic regression using Theano and stochastic
p@24: gradient descent.
p@24: 
p@24: Logistic regression is a probabilistic, linear classifier. It is parametrized
p@24: by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
p@24: done by projecting data points onto a set of hyperplanes, the distance to
p@24: which is used to determine a class membership probability.
p@24: 
p@24: Mathematically, this can be written as:
p@24: 
p@24: .. math::
p@24:   P(Y=i|x, W,b) &= softmax_i(W x + b) \\
p@24:                 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
p@24: 
p@24: 
p@24: The output of the model or prediction is then done by taking the argmax of
p@24: the vector whose i'th element is P(Y=i|x).
p@24: 
p@24: .. math::
p@24: 
p@24:   y_{pred} = argmax_i P(Y=i|x,W,b)
p@24: 
p@24: 
p@24: This tutorial presents a stochastic gradient descent optimization method
p@24: suitable for large datasets.
p@24: 
p@24: 
p@24: References:
p@24: 
p@24:     - textbooks: "Pattern Recognition and Machine Learning" -
p@24:                  Christopher M. Bishop, section 4.3.2
p@24: 
p@24: """
p@24: __docformat__ = 'restructedtext en'
p@24: 
p@24: import cPickle
p@24: import gzip
p@24: import os
p@24: import sys
p@24: import timeit
p@24: 
p@24: import numpy
p@24: 
p@24: import theano
p@24: import theano.tensor as T
p@24: 
p@24: 
p@24: class LogisticRegression(object):
p@24:     """Multi-class Logistic Regression Class
p@24: 
p@24:     The logistic regression is fully described by a weight matrix :math:`W`
p@24:     and bias vector :math:`b`. Classification is done by projecting data
p@24:     points onto a set of hyperplanes, the distance to which is used to
p@24:     determine a class membership probability.
p@24:     """
p@24: 
p@24:     def __init__(self, input, n_in, n_out):
p@24:         """ Initialize the parameters of the logistic regression
p@24: 
p@24:         :type input: theano.tensor.TensorType
p@24:         :param input: symbolic variable that describes the input of the
p@24:                       architecture (one minibatch)
p@24: 
p@24:         :type n_in: int
p@24:         :param n_in: number of input units, the dimension of the space in
p@24:                      which the datapoints lie
p@24: 
p@24:         :type n_out: int
p@24:         :param n_out: number of output units, the dimension of the space in
p@24:                       which the labels lie
p@24: 
p@24:         """
p@24:         # start-snippet-1
p@24:         # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
p@24:         self.W = theano.shared(
p@24:             value=numpy.zeros(
p@24:                 (n_in, n_out),
p@24:                 dtype=theano.config.floatX
p@24:             ),
p@24:             name='W',
p@24:             borrow=True
p@24:         )
p@24:         # initialize the baises b as a vector of n_out 0s
p@24:         self.b = theano.shared(
p@24:             value=numpy.zeros(
p@24:                 (n_out,),
p@24:                 dtype=theano.config.floatX
p@24:             ),
p@24:             name='b',
p@24:             borrow=True
p@24:         )
p@24: 
p@24:         # symbolic expression for computing the matrix of class-membership
p@24:         # probabilities
p@24:         # Where:
p@24:         # W is a matrix where column-k represent the separation hyperplane for
p@24:         # class-k
p@24:         # x is a matrix where row-j  represents input training sample-j
p@24:         # b is a vector where element-k represent the free parameter of
p@24:         # hyperplane-k
p@24:         self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
p@24: 
p@24:         # symbolic description of how to compute prediction as class whose
p@24:         # probability is maximal
p@24:         self.y_pred = T.argmax(self.p_y_given_x, axis=1)
p@24:         # end-snippet-1
p@24: 
p@24:         # parameters of the model
p@24:         self.params = [self.W, self.b]
p@24: 
p@24:         # keep track of model input
p@24:         self.input = input
p@24: 
p@24:     def negative_log_likelihood(self, y):
p@24:         """Return the mean of the negative log-likelihood of the prediction
p@24:         of this model under a given target distribution.
p@24: 
p@24:         .. math::
p@24: 
p@24:             \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
p@24:             \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
p@24:                 \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
p@24:             \ell (\theta=\{W,b\}, \mathcal{D})
p@24: 
p@24:         :type y: theano.tensor.TensorType
p@24:         :param y: corresponds to a vector that gives for each example the
p@24:                   correct label
p@24: 
p@24:         Note: we use the mean instead of the sum so that
p@24:               the learning rate is less dependent on the batch size
p@24:         """
p@24:         # start-snippet-2
p@24:         # y.shape[0] is (symbolically) the number of rows in y, i.e.,
p@24:         # number of examples (call it n) in the minibatch
p@24:         # T.arange(y.shape[0]) is a symbolic vector which will contain
p@24:         # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
p@24:         # Log-Probabilities (call it LP) with one row per example and
p@24:         # one column per class LP[T.arange(y.shape[0]),y] is a vector
p@24:         # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
p@24:         # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
p@24:         # the mean (across minibatch examples) of the elements in v,
p@24:         # i.e., the mean log-likelihood across the minibatch.
p@24:         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
p@24:         # end-snippet-2
p@24: 
p@24:     def errors(self, y):
p@24:         """Return a float representing the number of errors in the minibatch
p@24:         over the total number of examples of the minibatch ; zero one
p@24:         loss over the size of the minibatch
p@24: 
p@24:         :type y: theano.tensor.TensorType
p@24:         :param y: corresponds to a vector that gives for each example the
p@24:                   correct label
p@24:         """
p@24: 
p@24:         # check if y has same dimension of y_pred
p@24:         if y.ndim != self.y_pred.ndim:
p@24:             raise TypeError(
p@24:                 'y should have the same shape as self.y_pred',
p@24:                 ('y', y.type, 'y_pred', self.y_pred.type)
p@24:             )
p@24:         # check if y is of the correct datatype
p@24:         if y.dtype.startswith('int'):
p@24:             # the T.neq operator returns a vector of 0s and 1s, where 1
p@24:             # represents a mistake in prediction
p@24:             return T.mean(T.neq(self.y_pred, y))
p@24:         else:
p@24:             raise NotImplementedError()
p@24: 
p@24: 
p@24: def load_data(dataset):
p@24:     ''' Loads the dataset
p@24: 
p@24:     :type dataset: string
p@24:     :param dataset: the path to the dataset (here MNIST)
p@24:     '''
p@24:     #############
p@24:     # LOAD DATA #
p@24:     #############
p@24:     '''
p@24:     # Download the MNIST dataset if it is not present
p@24:     data_dir, data_file = os.path.split(dataset)
p@24:     if data_dir == "" and not os.path.isfile(dataset):
p@24:         # Check if dataset is in the data directory.
p@24:         new_path = os.path.join(
p@24:             os.path.split(__file__)[0],
p@24:             "..",
p@24:             "data",
p@24:             dataset
p@24:         )
p@24:         if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
p@24:             dataset = new_path
p@24: 
p@24:     if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
p@24:         import urllib
p@24:         origin = (
p@24:             'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
p@24:         )
p@24:         print 'Downloading data from %s' % origin
p@24:         urllib.urlretrieve(origin, dataset)
p@24: 
p@24:     print '... loading data'
p@24:     
p@24:     # Load the dataset
p@24:     f = gzip.open(dataset, 'rb')
p@24:     '''
p@24:     f = file(dataset, 'rb')
p@24:     train_set, valid_set, test_set = cPickle.load(f)
p@24:     f.close()
p@24:     #train_set, valid_set, test_set format: tuple(input, target)
p@24:     #input is an numpy.ndarray of 2 dimensions (a matrix)
p@24:     #witch row's correspond to an example. target is a
p@24:     #numpy.ndarray of 1 dimensions (vector)) that have the same length as
p@24:     #the number of rows in the input. It should give the target
p@24:     #target to the example with the same index in the input.
p@24: 
p@24:     def shared_dataset(data_xy, borrow=True):
p@24:         """ Function that loads the dataset into shared variables
p@24: 
p@24:         The reason we store our dataset in shared variables is to allow
p@24:         Theano to copy it into the GPU memory (when code is run on GPU).
p@24:         Since copying data into the GPU is slow, copying a minibatch everytime
p@24:         is needed (the default behaviour if the data is not in a shared
p@24:         variable) would lead to a large decrease in performance.
p@24:         """
p@24:         data_x, data_y = data_xy
p@24:         shared_x = theano.shared(numpy.asarray(data_x,
p@24:                                                dtype=theano.config.floatX),
p@24:                                  borrow=borrow)
p@24:         shared_y = theano.shared(numpy.asarray(data_y,
p@24:                                                dtype=theano.config.floatX),
p@24:                                  borrow=borrow)
p@24:         # When storing data on the GPU it has to be stored as floats
p@24:         # therefore we will store the labels as ``floatX`` as well
p@24:         # (``shared_y`` does exactly that). But during our computations
p@24:         # we need them as ints (we use labels as index, and if they are
p@24:         # floats it doesn't make sense) therefore instead of returning
p@24:         # ``shared_y`` we will have to cast it to int. This little hack
p@24:         # lets ous get around this issue
p@24:         return shared_x, T.cast(shared_y, 'int32')
p@24: 
p@24:     test_set_x, test_set_y = shared_dataset(test_set)
p@24:     valid_set_x, valid_set_y = shared_dataset(valid_set)
p@24:     train_set_x, train_set_y = shared_dataset(train_set)
p@24: 
p@24:     rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
p@24:             (test_set_x, test_set_y)]
p@24:     return rval
p@24: 
p@24: 
p@24: def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
p@24:                            dataset='mnist.pkl.gz',
p@24:                            batch_size=600):
p@24:     """
p@24:     Demonstrate stochastic gradient descent optimization of a log-linear
p@24:     model
p@24: 
p@24:     This is demonstrated on MNIST.
p@24: 
p@24:     :type learning_rate: float
p@24:     :param learning_rate: learning rate used (factor for the stochastic
p@24:                           gradient)
p@24: 
p@24:     :type n_epochs: int
p@24:     :param n_epochs: maximal number of epochs to run the optimizer
p@24: 
p@24:     :type dataset: string
p@24:     :param dataset: the path of the MNIST dataset file from
p@24:                  http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
p@24: 
p@24:     """
p@24:     datasets = load_data(dataset)
p@24: 
p@24:     train_set_x, train_set_y = datasets[0]
p@24:     valid_set_x, valid_set_y = datasets[1]
p@24:     test_set_x, test_set_y = datasets[2]
p@24: 
p@24:     # compute number of minibatches for training, validation and testing
p@24:     n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
p@24:     n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
p@24:     n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
p@24: 
p@24:     ######################
p@24:     # BUILD ACTUAL MODEL #
p@24:     ######################
p@24:     print '... building the model'
p@24: 
p@24:     # allocate symbolic variables for the data
p@24:     index = T.lscalar()  # index to a [mini]batch
p@24: 
p@24:     # generate symbolic variables for input (x and y represent a
p@24:     # minibatch)
p@24:     x = T.matrix('x')  # data, presented as rasterized images
p@24:     y = T.ivector('y')  # labels, presented as 1D vector of [int] labels
p@24: 
p@24:     # construct the logistic regression class
p@24:     # Each MNIST image has size 28*28
p@24:     classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
p@24: 
p@24:     # the cost we minimize during training is the negative log likelihood of
p@24:     # the model in symbolic format
p@24:     cost = classifier.negative_log_likelihood(y)
p@24: 
p@24:     # compiling a Theano function that computes the mistakes that are made by
p@24:     # the model on a minibatch
p@24:     test_model = theano.function(
p@24:         inputs=[index],
p@24:         outputs=classifier.errors(y),
p@24:         givens={
p@24:             x: test_set_x[index * batch_size: (index + 1) * batch_size],
p@24:             y: test_set_y[index * batch_size: (index + 1) * batch_size]
p@24:         }
p@24:     )
p@24: 
p@24:     validate_model = theano.function(
p@24:         inputs=[index],
p@24:         outputs=classifier.errors(y),
p@24:         givens={
p@24:             x: valid_set_x[index * batch_size: (index + 1) * batch_size],
p@24:             y: valid_set_y[index * batch_size: (index + 1) * batch_size]
p@24:         }
p@24:     )
p@24: 
p@24:     # compute the gradient of cost with respect to theta = (W,b)
p@24:     g_W = T.grad(cost=cost, wrt=classifier.W)
p@24:     g_b = T.grad(cost=cost, wrt=classifier.b)
p@24: 
p@24:     # start-snippet-3
p@24:     # specify how to update the parameters of the model as a list of
p@24:     # (variable, update expression) pairs.
p@24:     updates = [(classifier.W, classifier.W - learning_rate * g_W),
p@24:                (classifier.b, classifier.b - learning_rate * g_b)]
p@24: 
p@24:     # compiling a Theano function `train_model` that returns the cost, but in
p@24:     # the same time updates the parameter of the model based on the rules
p@24:     # defined in `updates`
p@24:     train_model = theano.function(
p@24:         inputs=[index],
p@24:         outputs=cost,
p@24:         updates=updates,
p@24:         givens={
p@24:             x: train_set_x[index * batch_size: (index + 1) * batch_size],
p@24:             y: train_set_y[index * batch_size: (index + 1) * batch_size]
p@24:         }
p@24:     )
p@24:     # end-snippet-3
p@24: 
p@24:     ###############
p@24:     # TRAIN MODEL #
p@24:     ###############
p@24:     print '... training the model'
p@24:     # early-stopping parameters
p@24:     patience = 5000  # look as this many examples regardless
p@24:     patience_increase = 2  # wait this much longer when a new best is
p@24:                                   # found
p@24:     improvement_threshold = 0.995  # a relative improvement of this much is
p@24:                                   # considered significant
p@24:     validation_frequency = min(n_train_batches, patience / 2)
p@24:                                   # go through this many
p@24:                                   # minibatche before checking the network
p@24:                                   # on the validation set; in this case we
p@24:                                   # check every epoch
p@24: 
p@24:     best_validation_loss = numpy.inf
p@24:     test_score = 0.
p@24:     start_time = timeit.default_timer()
p@24: 
p@24:     done_looping = False
p@24:     epoch = 0
p@24:     while (epoch < n_epochs) and (not done_looping):
p@24:         epoch = epoch + 1
p@24:         for minibatch_index in xrange(n_train_batches):
p@24: 
p@24:             minibatch_avg_cost = train_model(minibatch_index)
p@24:             # iteration number
p@24:             iter = (epoch - 1) * n_train_batches + minibatch_index
p@24: 
p@24:             if (iter + 1) % validation_frequency == 0:
p@24:                 # compute zero-one loss on validation set
p@24:                 validation_losses = [validate_model(i)
p@24:                                      for i in xrange(n_valid_batches)]
p@24:                 this_validation_loss = numpy.mean(validation_losses)
p@24: 
p@24:                 print(
p@24:                     'epoch %i, minibatch %i/%i, validation error %f %%' %
p@24:                     (
p@24:                         epoch,
p@24:                         minibatch_index + 1,
p@24:                         n_train_batches,
p@24:                         this_validation_loss * 100.
p@24:                     )
p@24:                 )
p@24: 
p@24:                 # if we got the best validation score until now
p@24:                 if this_validation_loss < best_validation_loss:
p@24:                     #improve patience if loss improvement is good enough
p@24:                     if this_validation_loss < best_validation_loss *  \
p@24:                        improvement_threshold:
p@24:                         patience = max(patience, iter * patience_increase)
p@24: 
p@24:                     best_validation_loss = this_validation_loss
p@24:                     # test it on the test set
p@24: 
p@24:                     test_losses = [test_model(i)
p@24:                                    for i in xrange(n_test_batches)]
p@24:                     test_score = numpy.mean(test_losses)
p@24: 
p@24:                     print(
p@24:                         (
p@24:                             '     epoch %i, minibatch %i/%i, test error of'
p@24:                             ' best model %f %%'
p@24:                         ) %
p@24:                         (
p@24:                             epoch,
p@24:                             minibatch_index + 1,
p@24:                             n_train_batches,
p@24:                             test_score * 100.
p@24:                         )
p@24:                     )
p@24: 
p@24:                     # save the best model
p@24:                     with open('best_model.pkl', 'w') as f:
p@24:                         cPickle.dump(classifier, f)
p@24: 
p@24:             if patience <= iter:
p@24:                 done_looping = True
p@24:                 break
p@24: 
p@24:     end_time = timeit.default_timer()
p@24:     print(
p@24:         (
p@24:             'Optimization complete with best validation score of %f %%,'
p@24:             'with test performance %f %%'
p@24:         )
p@24:         % (best_validation_loss * 100., test_score * 100.)
p@24:     )
p@24:     print 'The code run for %d epochs, with %f epochs/sec' % (
p@24:         epoch, 1. * epoch / (end_time - start_time))
p@24:     print >> sys.stderr, ('The code for file ' +
p@24:                           os.path.split(__file__)[1] +
p@24:                           ' ran for %.1fs' % ((end_time - start_time)))
p@24: 
p@24: 
p@24: def predict():
p@24:     """
p@24:     An example of how to load a trained model and use it
p@24:     to predict labels.
p@24:     """
p@24: 
p@24:     # load the saved model
p@24:     classifier = cPickle.load(open('best_model.pkl'))
p@24: 
p@24:     # compile a predictor function
p@24:     predict_model = theano.function(
p@24:         inputs=[classifier.input],
p@24:         outputs=classifier.y_pred)
p@24: 
p@24:     # We can test it on some examples from test test
p@24:     dataset='mnist.pkl.gz'
p@24:     datasets = load_data(dataset)
p@24:     test_set_x, test_set_y = datasets[2]
p@24:     test_set_x = test_set_x.get_value()
p@24: 
p@24:     predicted_values = predict_model(test_set_x[:10])
p@24:     print ("Predicted values for the first 10 examples in test set:")
p@24:     print predicted_values
p@24: 
p@24: 
p@24: if __name__ == '__main__':
p@24:     sgd_optimization_mnist()
p@24: