annotate Code/genre_classification/classification/mlp.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents 68a62ca32441
children
"""
This tutorial introduces the multilayer perceptron using Theano.

A multilayer perceptron is a logistic regressor in which, instead of
feeding the input directly to the logistic regression, you insert an
intermediate layer, called the hidden layer, that has a nonlinear
activation function (usually tanh or sigmoid). One can use many such
hidden layers, making the architecture deep. The tutorial will also tackle
the problem of MNIST digit classification.

.. math::

    f(x) = G(b^{(2)} + W^{(2)}(s(b^{(1)} + W^{(1)} x))),

References:

    - textbooks: "Pattern Recognition and Machine Learning" -
                 Christopher M. Bishop, section 5

"""
__docformat__ = 'restructuredtext en'


import os
import sys
import timeit

import numpy

import theano
import theano.tensor as T


from logistic_sgd import LogisticRegression, load_data


# start-snippet-1
class HiddenLayer(object):
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of an MLP: units are fully connected and have a
        sigmoidal activation function. Weight matrix W is of shape
        (n_in, n_out) and the bias vector b is of shape (n_out,).

        NOTE: the nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input, W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: non-linearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6. / (n_in + n_out)) to sqrt(6. / (n_in + n_out))
        # for the tanh activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on a GPU.
        # Note: the optimal initialization of weights depends on the
        #       activation function used (among other things).
        #       For example, results presented in [Xavier10] suggest that you
        #       should use 4 times larger initial weights for sigmoid
        #       compared to tanh.
        #       We have no such information for other functions, so we use
        #       the same bounds as for tanh.
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]


# start-snippet-2
class MLP(object):
    """Multi-Layer Perceptron Class

    A multilayer perceptron is a feedforward artificial neural network model
    that has one or more layers of hidden units and nonlinear activations.
    Intermediate layers usually use tanh or the sigmoid function as their
    activation (defined here by a ``HiddenLayer`` class), while the
    top layer is a softmax layer (defined here by a ``LogisticRegression``
    class).
    """

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # end-snippet-2 start-snippet-3
        # L1 norm; one regularization option is to enforce the L1 norm to
        # be small
        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )

        # square of the L2 norm; one regularization option is to enforce the
        # square of the L2 norm to be small
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )

        # the negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # the same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers it
        # is made of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of the model input
        self.input = input


def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
                   regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
                   regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    # start-snippet-5
    # compute the gradient of the cost with respect to theta (stored in
    # params); the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    test_mlp()

# Rectified Linear Unit (ReLU)
# Source: http://stackoverflow.com/questions/26497564/theano-hiddenlayer-activation-function
def relu(x):
    return T.maximum(0., x)
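

# A minimal usage sketch: `relu` can be passed as the `activation` argument of
# HiddenLayer in place of T.tanh. The sizes below (28 * 28 inputs, 500 hidden
# units) simply mirror the defaults of test_mlp and are illustrative
# placeholders, not values this file prescribes; note that HiddenLayer will
# still draw its initial weights from the tanh-style uniform bounds (see the
# note in its __init__).
def _relu_hidden_layer_example():
    rng = numpy.random.RandomState(1234)
    x = T.matrix('x')
    # build a hidden layer whose nonlinearity is the relu defined above
    return HiddenLayer(
        rng=rng,
        input=x,
        n_in=28 * 28,
        n_out=500,
        activation=relu
    )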