p@24: """
p@24: This tutorial introduces the multilayer perceptron using Theano.
p@24: 
p@24:  A multilayer perceptron is a logistic regressor where
p@24: instead of feeding the input to the logistic regression you insert a
p@24: intermediate layer, called the hidden layer, that has a nonlinear
p@24: activation function (usually tanh or sigmoid) . One can use many such
p@24: hidden layers making the architecture deep. The tutorial will also tackle
p@24: the problem of MNIST digit classification.
p@24: 
p@24: .. math::
p@24: 
p@24:     f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
p@24: 
p@24: References:
p@24: 
p@24:     - textbooks: "Pattern Recognition and Machine Learning" -
p@24:                  Christopher M. Bishop, section 5
p@24: 
p@24: """
p@24: __docformat__ = 'restructedtext en'
p@24: 
p@24: 
p@24: import os
p@24: import sys
p@24: import timeit
p@24: 
p@24: import numpy
p@24: 
p@24: import theano
p@24: import theano.tensor as T
p@24: 
p@24: 
p@24: from logistic_sgd import LogisticRegression, load_data
p@24: 
p@24: 
p@24: # start-snippet-1
p@24: class HiddenLayer(object):
p@24:     def __init__(self, rng, input, n_in, n_out, W=None, b=None,
p@24:                  activation=T.tanh):
p@24:         """
p@24:         Typical hidden layer of a MLP: units are fully-connected and have
p@24:         sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
p@24:         and the bias vector b is of shape (n_out,).
p@24: 
p@24:         NOTE : The nonlinearity used here is tanh
p@24: 
p@24:         Hidden unit activation is given by: tanh(dot(input,W) + b)
p@24: 
p@24:         :type rng: numpy.random.RandomState
p@24:         :param rng: a random number generator used to initialize weights
p@24: 
p@24:         :type input: theano.tensor.dmatrix
p@24:         :param input: a symbolic tensor of shape (n_examples, n_in)
p@24: 
p@24:         :type n_in: int
p@24:         :param n_in: dimensionality of input
p@24: 
p@24:         :type n_out: int
p@24:         :param n_out: number of hidden units
p@24: 
p@24:         :type activation: theano.Op or function
p@24:         :param activation: Non linearity to be applied in the hidden
p@24:                            layer
p@24:         """
p@24:         self.input = input
p@24:         # end-snippet-1
p@24: 
p@24:         # `W` is initialized with `W_values` which is uniformely sampled
p@24:         # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
p@24:         # for tanh activation function
p@24:         # the output of uniform if converted using asarray to dtype
p@24:         # theano.config.floatX so that the code is runable on GPU
p@24:         # Note : optimal initialization of weights is dependent on the
p@24:         #        activation function used (among other things).
p@24:         #        For example, results presented in [Xavier10] suggest that you
p@24:         #        should use 4 times larger initial weights for sigmoid
p@24:         #        compared to tanh
p@24:         #        We have no info for other function, so we use the same as
p@24:         #        tanh.
p@24:         if W is None:
p@24:             W_values = numpy.asarray(
p@24:                 rng.uniform(
p@24:                     low=-numpy.sqrt(6. / (n_in + n_out)),
p@24:                     high=numpy.sqrt(6. / (n_in + n_out)),
p@24:                     size=(n_in, n_out)
p@24:                 ),
p@24:                 dtype=theano.config.floatX
p@24:             )
p@24:             if activation == theano.tensor.nnet.sigmoid:
p@24:                 W_values *= 4
p@24: 
p@24:             W = theano.shared(value=W_values, name='W', borrow=True)
p@24: 
p@24:         if b is None:
p@24:             b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
p@24:             b = theano.shared(value=b_values, name='b', borrow=True)
p@24: 
p@24:         self.W = W
p@24:         self.b = b
p@24: 
p@24:         lin_output = T.dot(input, self.W) + self.b
p@24:         self.output = (
p@24:             lin_output if activation is None
p@24:             else activation(lin_output)
p@24:         )
p@24:         # parameters of the model
p@24:         self.params = [self.W, self.b]
p@24: 
p@24: 
p@24: # start-snippet-2
p@24: class MLP(object):
p@24:     """Multi-Layer Perceptron Class
p@24: 
p@24:     A multilayer perceptron is a feedforward artificial neural network model
p@24:     that has one layer or more of hidden units and nonlinear activations.
p@24:     Intermediate layers usually have as activation function tanh or the
p@24:     sigmoid function (defined here by a ``HiddenLayer`` class)  while the
p@24:     top layer is a softmax layer (defined here by a ``LogisticRegression``
p@24:     class).
p@24:     """
p@24: 
p@24:     def __init__(self, rng, input, n_in, n_hidden, n_out):
p@24:         """Initialize the parameters for the multilayer perceptron
p@24: 
p@24:         :type rng: numpy.random.RandomState
p@24:         :param rng: a random number generator used to initialize weights
p@24: 
p@24:         :type input: theano.tensor.TensorType
p@24:         :param input: symbolic variable that describes the input of the
p@24:         architecture (one minibatch)
p@24: 
p@24:         :type n_in: int
p@24:         :param n_in: number of input units, the dimension of the space in
p@24:         which the datapoints lie
p@24: 
p@24:         :type n_hidden: int
p@24:         :param n_hidden: number of hidden units
p@24: 
p@24:         :type n_out: int
p@24:         :param n_out: number of output units, the dimension of the space in
p@24:         which the labels lie
p@24: 
p@24:         """
p@24: 
p@24:         # Since we are dealing with a one hidden layer MLP, this will translate
p@24:         # into a HiddenLayer with a tanh activation function connected to the
p@24:         # LogisticRegression layer; the activation function can be replaced by
p@24:         # sigmoid or any other nonlinear function
p@24:         self.hiddenLayer = HiddenLayer(
p@24:             rng=rng,
p@24:             input=input,
p@24:             n_in=n_in,
p@24:             n_out=n_hidden,
p@24:             activation=T.tanh
p@24:         )
p@24: 
p@24:         # The logistic regression layer gets as input the hidden units
p@24:         # of the hidden layer
p@24:         self.logRegressionLayer = LogisticRegression(
p@24:             input=self.hiddenLayer.output,
p@24:             n_in=n_hidden,
p@24:             n_out=n_out
p@24:         )
p@24:         # end-snippet-2 start-snippet-3
p@24:         # L1 norm ; one regularization option is to enforce L1 norm to
p@24:         # be small
p@24:         self.L1 = (
p@24:             abs(self.hiddenLayer.W).sum()
p@24:             + abs(self.logRegressionLayer.W).sum()
p@24:         )
p@24: 
p@24:         # square of L2 norm ; one regularization option is to enforce
p@24:         # square of L2 norm to be small
p@24:         self.L2_sqr = (
p@24:             (self.hiddenLayer.W ** 2).sum()
p@24:             + (self.logRegressionLayer.W ** 2).sum()
p@24:         )
p@24: 
p@24:         # negative log likelihood of the MLP is given by the negative
p@24:         # log likelihood of the output of the model, computed in the
p@24:         # logistic regression layer
p@24:         self.negative_log_likelihood = (
p@24:             self.logRegressionLayer.negative_log_likelihood
p@24:         )
p@24:         # same holds for the function computing the number of errors
p@24:         self.errors = self.logRegressionLayer.errors
p@24: 
p@24:         # the parameters of the model are the parameters of the two layer it is
p@24:         # made out of
p@24:         self.params = self.hiddenLayer.params + self.logRegressionLayer.params
p@24:         # end-snippet-3
p@24: 
p@24:         # keep track of model input
p@24:         self.input = input
p@24: 
p@24: 
p@24: def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
p@24:              dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
p@24:     """
p@24:     Demonstrate stochastic gradient descent optimization for a multilayer
p@24:     perceptron
p@24: 
p@24:     This is demonstrated on MNIST.
p@24: 
p@24:     :type learning_rate: float
p@24:     :param learning_rate: learning rate used (factor for the stochastic
p@24:     gradient
p@24: 
p@24:     :type L1_reg: float
p@24:     :param L1_reg: L1-norm's weight when added to the cost (see
p@24:     regularization)
p@24: 
p@24:     :type L2_reg: float
p@24:     :param L2_reg: L2-norm's weight when added to the cost (see
p@24:     regularization)
p@24: 
p@24:     :type n_epochs: int
p@24:     :param n_epochs: maximal number of epochs to run the optimizer
p@24: 
p@24:     :type dataset: string
p@24:     :param dataset: the path of the MNIST dataset file from
p@24:                  http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
p@24: 
p@24: 
p@24:    """
p@24:     datasets = load_data(dataset)
p@24: 
p@24:     train_set_x, train_set_y = datasets[0]
p@24:     valid_set_x, valid_set_y = datasets[1]
p@24:     test_set_x, test_set_y = datasets[2]
p@24: 
p@24:     # compute number of minibatches for training, validation and testing
p@24:     n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
p@24:     n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
p@24:     n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
p@24: 
p@24:     ######################
p@24:     # BUILD ACTUAL MODEL #
p@24:     ######################
p@24:     print '... building the model'
p@24: 
p@24:     # allocate symbolic variables for the data
p@24:     index = T.lscalar()  # index to a [mini]batch
p@24:     x = T.matrix('x')  # the data is presented as rasterized images
p@24:     y = T.ivector('y')  # the labels are presented as 1D vector of
p@24:                         # [int] labels
p@24: 
p@24:     rng = numpy.random.RandomState(1234)
p@24: 
p@24:     # construct the MLP class
p@24:     classifier = MLP(
p@24:         rng=rng,
p@24:         input=x,
p@24:         n_in=28 * 28,
p@24:         n_hidden=n_hidden,
p@24:         n_out=10
p@24:     )
p@24: 
p@24:     # start-snippet-4
p@24:     # the cost we minimize during training is the negative log likelihood of
p@24:     # the model plus the regularization terms (L1 and L2); cost is expressed
p@24:     # here symbolically
p@24:     cost = (
p@24:         classifier.negative_log_likelihood(y)
p@24:         + L1_reg * classifier.L1
p@24:         + L2_reg * classifier.L2_sqr
p@24:     )
p@24:     # end-snippet-4
p@24: 
p@24:     # compiling a Theano function that computes the mistakes that are made
p@24:     # by the model on a minibatch
p@24:     test_model = theano.function(
p@24:         inputs=[index],
p@24:         outputs=classifier.errors(y),
p@24:         givens={
p@24:             x: test_set_x[index * batch_size:(index + 1) * batch_size],
p@24:             y: test_set_y[index * batch_size:(index + 1) * batch_size]
p@24:         }
p@24:     )
p@24: 
p@24:     validate_model = theano.function(
p@24:         inputs=[index],
p@24:         outputs=classifier.errors(y),
p@24:         givens={
p@24:             x: valid_set_x[index * batch_size:(index + 1) * batch_size],
p@24:             y: valid_set_y[index * batch_size:(index + 1) * batch_size]
p@24:         }
p@24:     )
p@24: 
p@24:     # start-snippet-5
p@24:     # compute the gradient of cost with respect to theta (sotred in params)
p@24:     # the resulting gradients will be stored in a list gparams
p@24:     gparams = [T.grad(cost, param) for param in classifier.params]
p@24: 
p@24:     # specify how to update the parameters of the model as a list of
p@24:     # (variable, update expression) pairs
p@24: 
p@24:     # given two lists of the same length, A = [a1, a2, a3, a4] and
p@24:     # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
p@24:     # element is a pair formed from the two lists :
p@24:     #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
p@24:     updates = [
p@24:         (param, param - learning_rate * gparam)
p@24:         for param, gparam in zip(classifier.params, gparams)
p@24:     ]
p@24: 
p@24:     # compiling a Theano function `train_model` that returns the cost, but
p@24:     # in the same time updates the parameter of the model based on the rules
p@24:     # defined in `updates`
p@24:     train_model = theano.function(
p@24:         inputs=[index],
p@24:         outputs=cost,
p@24:         updates=updates,
p@24:         givens={
p@24:             x: train_set_x[index * batch_size: (index + 1) * batch_size],
p@24:             y: train_set_y[index * batch_size: (index + 1) * batch_size]
p@24:         }
p@24:     )
p@24:     # end-snippet-5
p@24: 
p@24:     ###############
p@24:     # TRAIN MODEL #
p@24:     ###############
p@24:     print '... training'
p@24: 
p@24:     # early-stopping parameters
p@24:     patience = 10000  # look as this many examples regardless
p@24:     patience_increase = 2  # wait this much longer when a new best is
p@24:                            # found
p@24:     improvement_threshold = 0.995  # a relative improvement of this much is
p@24:                                    # considered significant
p@24:     validation_frequency = min(n_train_batches, patience / 2)
p@24:                                   # go through this many
p@24:                                   # minibatche before checking the network
p@24:                                   # on the validation set; in this case we
p@24:                                   # check every epoch
p@24: 
p@24:     best_validation_loss = numpy.inf
p@24:     best_iter = 0
p@24:     test_score = 0.
p@24:     start_time = timeit.default_timer()
p@24: 
p@24:     epoch = 0
p@24:     done_looping = False
p@24: 
p@24:     while (epoch < n_epochs) and (not done_looping):
p@24:         epoch = epoch + 1
p@24:         for minibatch_index in xrange(n_train_batches):
p@24: 
p@24:             minibatch_avg_cost = train_model(minibatch_index)
p@24:             # iteration number
p@24:             iter = (epoch - 1) * n_train_batches + minibatch_index
p@24: 
p@24:             if (iter + 1) % validation_frequency == 0:
p@24:                 # compute zero-one loss on validation set
p@24:                 validation_losses = [validate_model(i) for i
p@24:                                      in xrange(n_valid_batches)]
p@24:                 this_validation_loss = numpy.mean(validation_losses)
p@24: 
p@24:                 print(
p@24:                     'epoch %i, minibatch %i/%i, validation error %f %%' %
p@24:                     (
p@24:                         epoch,
p@24:                         minibatch_index + 1,
p@24:                         n_train_batches,
p@24:                         this_validation_loss * 100.
p@24:                     )
p@24:                 )
p@24: 
p@24:                 # if we got the best validation score until now
p@24:                 if this_validation_loss < best_validation_loss:
p@24:                     #improve patience if loss improvement is good enough
p@24:                     if (
p@24:                         this_validation_loss < best_validation_loss *
p@24:                         improvement_threshold
p@24:                     ):
p@24:                         patience = max(patience, iter * patience_increase)
p@24: 
p@24:                     best_validation_loss = this_validation_loss
p@24:                     best_iter = iter
p@24: 
p@24:                     # test it on the test set
p@24:                     test_losses = [test_model(i) for i
p@24:                                    in xrange(n_test_batches)]
p@24:                     test_score = numpy.mean(test_losses)
p@24: 
p@24:                     print(('     epoch %i, minibatch %i/%i, test error of '
p@24:                            'best model %f %%') %
p@24:                           (epoch, minibatch_index + 1, n_train_batches,
p@24:                            test_score * 100.))
p@24: 
p@24:             if patience <= iter:
p@24:                 done_looping = True
p@24:                 break
p@24: 
p@24:     end_time = timeit.default_timer()
p@24:     print(('Optimization complete. Best validation score of %f %% '
p@24:            'obtained at iteration %i, with test performance %f %%') %
p@24:           (best_validation_loss * 100., best_iter + 1, test_score * 100.))
p@24:     print >> sys.stderr, ('The code for file ' +
p@24:                           os.path.split(__file__)[1] +
p@24:                           ' ran for %.2fm' % ((end_time - start_time) / 60.))
p@24: 
p@24: 
p@24: if __name__ == '__main__':
p@24:     test_mlp()
p@24: 
p@24: # Rectifier Linear Unit
p@24: #Source: http://stackoverflow.com/questions/26497564/theano-hiddenlayer-activation-function
p@24: def relu(x):
p@24:     return T.maximum(0.,x)