p@24: """ p@24: This tutorial introduces the multilayer perceptron using Theano. p@24: p@24: A multilayer perceptron is a logistic regressor where p@24: instead of feeding the input to the logistic regression you insert a p@24: intermediate layer, called the hidden layer, that has a nonlinear p@24: activation function (usually tanh or sigmoid) . One can use many such p@24: hidden layers making the architecture deep. The tutorial will also tackle p@24: the problem of MNIST digit classification. p@24: p@24: .. math:: p@24: p@24: f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))), p@24: p@24: References: p@24: p@24: - textbooks: "Pattern Recognition and Machine Learning" - p@24: Christopher M. Bishop, section 5 p@24: p@24: """ p@24: __docformat__ = 'restructedtext en' p@24: p@24: p@24: import os p@24: import sys p@24: import timeit p@24: p@24: import numpy p@24: p@24: import theano p@24: import theano.tensor as T p@24: p@24: p@24: from logistic_sgd import LogisticRegression, load_data p@24: p@24: p@24: # start-snippet-1 p@24: class HiddenLayer(object): p@24: def __init__(self, rng, input, n_in, n_out, W=None, b=None, p@24: activation=T.tanh): p@24: """ p@24: Typical hidden layer of a MLP: units are fully-connected and have p@24: sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) p@24: and the bias vector b is of shape (n_out,). p@24: p@24: NOTE : The nonlinearity used here is tanh p@24: p@24: Hidden unit activation is given by: tanh(dot(input,W) + b) p@24: p@24: :type rng: numpy.random.RandomState p@24: :param rng: a random number generator used to initialize weights p@24: p@24: :type input: theano.tensor.dmatrix p@24: :param input: a symbolic tensor of shape (n_examples, n_in) p@24: p@24: :type n_in: int p@24: :param n_in: dimensionality of input p@24: p@24: :type n_out: int p@24: :param n_out: number of hidden units p@24: p@24: :type activation: theano.Op or function p@24: :param activation: Non linearity to be applied in the hidden p@24: layer p@24: """ p@24: self.input = input p@24: # end-snippet-1 p@24: p@24: # `W` is initialized with `W_values` which is uniformely sampled p@24: # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) p@24: # for tanh activation function p@24: # the output of uniform if converted using asarray to dtype p@24: # theano.config.floatX so that the code is runable on GPU p@24: # Note : optimal initialization of weights is dependent on the p@24: # activation function used (among other things). p@24: # For example, results presented in [Xavier10] suggest that you p@24: # should use 4 times larger initial weights for sigmoid p@24: # compared to tanh p@24: # We have no info for other function, so we use the same as p@24: # tanh. p@24: if W is None: p@24: W_values = numpy.asarray( p@24: rng.uniform( p@24: low=-numpy.sqrt(6. / (n_in + n_out)), p@24: high=numpy.sqrt(6. / (n_in + n_out)), p@24: size=(n_in, n_out) p@24: ), p@24: dtype=theano.config.floatX p@24: ) p@24: if activation == theano.tensor.nnet.sigmoid: p@24: W_values *= 4 p@24: p@24: W = theano.shared(value=W_values, name='W', borrow=True) p@24: p@24: if b is None: p@24: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) p@24: b = theano.shared(value=b_values, name='b', borrow=True) p@24: p@24: self.W = W p@24: self.b = b p@24: p@24: lin_output = T.dot(input, self.W) + self.b p@24: self.output = ( p@24: lin_output if activation is None p@24: else activation(lin_output) p@24: ) p@24: # parameters of the model p@24: self.params = [self.W, self.b] p@24: p@24: p@24: # start-snippet-2 p@24: class MLP(object): p@24: """Multi-Layer Perceptron Class p@24: p@24: A multilayer perceptron is a feedforward artificial neural network model p@24: that has one layer or more of hidden units and nonlinear activations. p@24: Intermediate layers usually have as activation function tanh or the p@24: sigmoid function (defined here by a ``HiddenLayer`` class) while the p@24: top layer is a softmax layer (defined here by a ``LogisticRegression`` p@24: class). p@24: """ p@24: p@24: def __init__(self, rng, input, n_in, n_hidden, n_out): p@24: """Initialize the parameters for the multilayer perceptron p@24: p@24: :type rng: numpy.random.RandomState p@24: :param rng: a random number generator used to initialize weights p@24: p@24: :type input: theano.tensor.TensorType p@24: :param input: symbolic variable that describes the input of the p@24: architecture (one minibatch) p@24: p@24: :type n_in: int p@24: :param n_in: number of input units, the dimension of the space in p@24: which the datapoints lie p@24: p@24: :type n_hidden: int p@24: :param n_hidden: number of hidden units p@24: p@24: :type n_out: int p@24: :param n_out: number of output units, the dimension of the space in p@24: which the labels lie p@24: p@24: """ p@24: p@24: # Since we are dealing with a one hidden layer MLP, this will translate p@24: # into a HiddenLayer with a tanh activation function connected to the p@24: # LogisticRegression layer; the activation function can be replaced by p@24: # sigmoid or any other nonlinear function p@24: self.hiddenLayer = HiddenLayer( p@24: rng=rng, p@24: input=input, p@24: n_in=n_in, p@24: n_out=n_hidden, p@24: activation=T.tanh p@24: ) p@24: p@24: # The logistic regression layer gets as input the hidden units p@24: # of the hidden layer p@24: self.logRegressionLayer = LogisticRegression( p@24: input=self.hiddenLayer.output, p@24: n_in=n_hidden, p@24: n_out=n_out p@24: ) p@24: # end-snippet-2 start-snippet-3 p@24: # L1 norm ; one regularization option is to enforce L1 norm to p@24: # be small p@24: self.L1 = ( p@24: abs(self.hiddenLayer.W).sum() p@24: + abs(self.logRegressionLayer.W).sum() p@24: ) p@24: p@24: # square of L2 norm ; one regularization option is to enforce p@24: # square of L2 norm to be small p@24: self.L2_sqr = ( p@24: (self.hiddenLayer.W ** 2).sum() p@24: + (self.logRegressionLayer.W ** 2).sum() p@24: ) p@24: p@24: # negative log likelihood of the MLP is given by the negative p@24: # log likelihood of the output of the model, computed in the p@24: # logistic regression layer p@24: self.negative_log_likelihood = ( p@24: self.logRegressionLayer.negative_log_likelihood p@24: ) p@24: # same holds for the function computing the number of errors p@24: self.errors = self.logRegressionLayer.errors p@24: p@24: # the parameters of the model are the parameters of the two layer it is p@24: # made out of p@24: self.params = self.hiddenLayer.params + self.logRegressionLayer.params p@24: # end-snippet-3 p@24: p@24: # keep track of model input p@24: self.input = input p@24: p@24: p@24: def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, p@24: dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): p@24: """ p@24: Demonstrate stochastic gradient descent optimization for a multilayer p@24: perceptron p@24: p@24: This is demonstrated on MNIST. p@24: p@24: :type learning_rate: float p@24: :param learning_rate: learning rate used (factor for the stochastic p@24: gradient p@24: p@24: :type L1_reg: float p@24: :param L1_reg: L1-norm's weight when added to the cost (see p@24: regularization) p@24: p@24: :type L2_reg: float p@24: :param L2_reg: L2-norm's weight when added to the cost (see p@24: regularization) p@24: p@24: :type n_epochs: int p@24: :param n_epochs: maximal number of epochs to run the optimizer p@24: p@24: :type dataset: string p@24: :param dataset: the path of the MNIST dataset file from p@24: http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz p@24: p@24: p@24: """ p@24: datasets = load_data(dataset) p@24: p@24: train_set_x, train_set_y = datasets[0] p@24: valid_set_x, valid_set_y = datasets[1] p@24: test_set_x, test_set_y = datasets[2] p@24: p@24: # compute number of minibatches for training, validation and testing p@24: n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size p@24: n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size p@24: n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size p@24: p@24: ###################### p@24: # BUILD ACTUAL MODEL # p@24: ###################### p@24: print '... building the model' p@24: p@24: # allocate symbolic variables for the data p@24: index = T.lscalar() # index to a [mini]batch p@24: x = T.matrix('x') # the data is presented as rasterized images p@24: y = T.ivector('y') # the labels are presented as 1D vector of p@24: # [int] labels p@24: p@24: rng = numpy.random.RandomState(1234) p@24: p@24: # construct the MLP class p@24: classifier = MLP( p@24: rng=rng, p@24: input=x, p@24: n_in=28 * 28, p@24: n_hidden=n_hidden, p@24: n_out=10 p@24: ) p@24: p@24: # start-snippet-4 p@24: # the cost we minimize during training is the negative log likelihood of p@24: # the model plus the regularization terms (L1 and L2); cost is expressed p@24: # here symbolically p@24: cost = ( p@24: classifier.negative_log_likelihood(y) p@24: + L1_reg * classifier.L1 p@24: + L2_reg * classifier.L2_sqr p@24: ) p@24: # end-snippet-4 p@24: p@24: # compiling a Theano function that computes the mistakes that are made p@24: # by the model on a minibatch p@24: test_model = theano.function( p@24: inputs=[index], p@24: outputs=classifier.errors(y), p@24: givens={ p@24: x: test_set_x[index * batch_size:(index + 1) * batch_size], p@24: y: test_set_y[index * batch_size:(index + 1) * batch_size] p@24: } p@24: ) p@24: p@24: validate_model = theano.function( p@24: inputs=[index], p@24: outputs=classifier.errors(y), p@24: givens={ p@24: x: valid_set_x[index * batch_size:(index + 1) * batch_size], p@24: y: valid_set_y[index * batch_size:(index + 1) * batch_size] p@24: } p@24: ) p@24: p@24: # start-snippet-5 p@24: # compute the gradient of cost with respect to theta (sotred in params) p@24: # the resulting gradients will be stored in a list gparams p@24: gparams = [T.grad(cost, param) for param in classifier.params] p@24: p@24: # specify how to update the parameters of the model as a list of p@24: # (variable, update expression) pairs p@24: p@24: # given two lists of the same length, A = [a1, a2, a3, a4] and p@24: # B = [b1, b2, b3, b4], zip generates a list C of same size, where each p@24: # element is a pair formed from the two lists : p@24: # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] p@24: updates = [ p@24: (param, param - learning_rate * gparam) p@24: for param, gparam in zip(classifier.params, gparams) p@24: ] p@24: p@24: # compiling a Theano function `train_model` that returns the cost, but p@24: # in the same time updates the parameter of the model based on the rules p@24: # defined in `updates` p@24: train_model = theano.function( p@24: inputs=[index], p@24: outputs=cost, p@24: updates=updates, p@24: givens={ p@24: x: train_set_x[index * batch_size: (index + 1) * batch_size], p@24: y: train_set_y[index * batch_size: (index + 1) * batch_size] p@24: } p@24: ) p@24: # end-snippet-5 p@24: p@24: ############### p@24: # TRAIN MODEL # p@24: ############### p@24: print '... training' p@24: p@24: # early-stopping parameters p@24: patience = 10000 # look as this many examples regardless p@24: patience_increase = 2 # wait this much longer when a new best is p@24: # found p@24: improvement_threshold = 0.995 # a relative improvement of this much is p@24: # considered significant p@24: validation_frequency = min(n_train_batches, patience / 2) p@24: # go through this many p@24: # minibatche before checking the network p@24: # on the validation set; in this case we p@24: # check every epoch p@24: p@24: best_validation_loss = numpy.inf p@24: best_iter = 0 p@24: test_score = 0. p@24: start_time = timeit.default_timer() p@24: p@24: epoch = 0 p@24: done_looping = False p@24: p@24: while (epoch < n_epochs) and (not done_looping): p@24: epoch = epoch + 1 p@24: for minibatch_index in xrange(n_train_batches): p@24: p@24: minibatch_avg_cost = train_model(minibatch_index) p@24: # iteration number p@24: iter = (epoch - 1) * n_train_batches + minibatch_index p@24: p@24: if (iter + 1) % validation_frequency == 0: p@24: # compute zero-one loss on validation set p@24: validation_losses = [validate_model(i) for i p@24: in xrange(n_valid_batches)] p@24: this_validation_loss = numpy.mean(validation_losses) p@24: p@24: print( p@24: 'epoch %i, minibatch %i/%i, validation error %f %%' % p@24: ( p@24: epoch, p@24: minibatch_index + 1, p@24: n_train_batches, p@24: this_validation_loss * 100. p@24: ) p@24: ) p@24: p@24: # if we got the best validation score until now p@24: if this_validation_loss < best_validation_loss: p@24: #improve patience if loss improvement is good enough p@24: if ( p@24: this_validation_loss < best_validation_loss * p@24: improvement_threshold p@24: ): p@24: patience = max(patience, iter * patience_increase) p@24: p@24: best_validation_loss = this_validation_loss p@24: best_iter = iter p@24: p@24: # test it on the test set p@24: test_losses = [test_model(i) for i p@24: in xrange(n_test_batches)] p@24: test_score = numpy.mean(test_losses) p@24: p@24: print((' epoch %i, minibatch %i/%i, test error of ' p@24: 'best model %f %%') % p@24: (epoch, minibatch_index + 1, n_train_batches, p@24: test_score * 100.)) p@24: p@24: if patience <= iter: p@24: done_looping = True p@24: break p@24: p@24: end_time = timeit.default_timer() p@24: print(('Optimization complete. Best validation score of %f %% ' p@24: 'obtained at iteration %i, with test performance %f %%') % p@24: (best_validation_loss * 100., best_iter + 1, test_score * 100.)) p@24: print >> sys.stderr, ('The code for file ' + p@24: os.path.split(__file__)[1] + p@24: ' ran for %.2fm' % ((end_time - start_time) / 60.)) p@24: p@24: p@24: if __name__ == '__main__': p@24: test_mlp() p@24: p@24: # Rectifier Linear Unit p@24: #Source: http://stackoverflow.com/questions/26497564/theano-hiddenlayer-activation-function p@24: def relu(x): p@24: return T.maximum(0.,x)