comparison Code/genre_classification/learning/logistic_sgd.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
1 """
2 This tutorial introduces logistic regression using Theano and stochastic
3 gradient descent.
4
5 Logistic regression is a probabilistic, linear classifier. It is parametrized
6 by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 done by projecting data points onto a set of hyperplanes, the distance to
8 which is used to determine a class membership probability.
9
10 Mathematically, this can be written as:
11
12 .. math::
13 P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15
16
17 The model's prediction is then made by taking the argmax of
18 the vector whose i-th element is P(Y=i|x).
19
20 .. math::
21
22 y_{pred} = argmax_i P(Y=i|x,W,b)
23
24
25 This tutorial presents a stochastic gradient descent optimization method
26 suitable for large datasets.
27
28
29 References:
30
31 - textbooks: "Pattern Recognition and Machine Learning" -
32 Christopher M. Bishop, section 4.3.2
33
34 """
35 __docformat__ = 'restructuredtext en'
36
37 import cPickle
38 import gzip
39 import os
40 import sys
41 import timeit
42
43 import numpy
44
45 import theano
46 import theano.tensor as T
47
48
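# Illustrative sketch: a minimal numpy version of the prediction rule from the
# module docstring, P(Y=i|x, W, b) = softmax_i(W x + b) followed by an argmax.
# The helper name is hypothetical and nothing in this script calls it.
def softmax_prediction_sketch(x, W, b):
    """Return (predicted class, class probabilities) for one example x."""
    scores = numpy.dot(x, W) + b          # class scores, shape (n_out,)
    scores = scores - scores.max()        # shift by the max for numerical stability
    exp_scores = numpy.exp(scores)
    p_y_given_x = exp_scores / exp_scores.sum()
    return numpy.argmax(p_y_given_x), p_y_given_x
# For a minibatch the same computation is applied row-wise, which is what
# T.nnet.softmax(T.dot(input, self.W) + self.b) expresses symbolically below.
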
49 class LogisticRegression(object):
50 """Multi-class Logistic Regression Class
51
52 The logistic regression is fully described by a weight matrix :math:`W`
53 and bias vector :math:`b`. Classification is done by projecting data
54 points onto a set of hyperplanes, the distance to which is used to
55 determine a class membership probability.
56 """
57
58 def __init__(self, input, n_in, n_out):
59 """ Initialize the parameters of the logistic regression
60
61 :type input: theano.tensor.TensorType
62 :param input: symbolic variable that describes the input of the
63 architecture (one minibatch)
64
65 :type n_in: int
66 :param n_in: number of input units, the dimension of the space in
67 which the datapoints lie
68
69 :type n_out: int
70 :param n_out: number of output units, the dimension of the space in
71 which the labels lie
72
73 """
74 # start-snippet-1
75 # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
76 self.W = theano.shared(
77 value=numpy.zeros(
78 (n_in, n_out),
79 dtype=theano.config.floatX
80 ),
81 name='W',
82 borrow=True
83 )
84 # initialize the biases b as a vector of n_out 0s
85 self.b = theano.shared(
86 value=numpy.zeros(
87 (n_out,),
88 dtype=theano.config.floatX
89 ),
90 name='b',
91 borrow=True
92 )
93
94 # symbolic expression for computing the matrix of class-membership
95 # probabilities
96 # Where:
97 # W is a matrix where column-k represents the separation hyperplane for
98 # class-k
99 # x is a matrix where row-j represents input training sample-j
100 # b is a vector where element-k represents the free parameter of
101 # hyperplane-k
102 self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
103
104 # symbolic description of how to compute prediction as class whose
105 # probability is maximal
106 self.y_pred = T.argmax(self.p_y_given_x, axis=1)
107 # end-snippet-1
108
109 # parameters of the model
110 self.params = [self.W, self.b]
111
112 # keep track of model input
113 self.input = input
114
115 def negative_log_likelihood(self, y):
116 """Return the mean of the negative log-likelihood of the prediction
117 of this model under a given target distribution.
118
119 .. math::
120
121 \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
122 \sum_{i=0}^{|\mathcal{D}|-1}
123 \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
124 \ell (\theta=\{W,b\}, \mathcal{D}) = - \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D})
125
126 :type y: theano.tensor.TensorType
127 :param y: corresponds to a vector that gives for each example the
128 correct label
129
130 Note: we use the mean instead of the sum so that
131 the learning rate is less dependent on the batch size
132 """
133 # start-snippet-2
134 # y.shape[0] is (symbolically) the number of rows in y, i.e.,
135 # number of examples (call it n) in the minibatch
136 # T.arange(y.shape[0]) is a symbolic vector which will contain
137 # [0,1,2,... n-1]. T.log(self.p_y_given_x) is a matrix of
138 # log-probabilities (call it LP) with one row per example and
139 # one column per class. LP[T.arange(y.shape[0]),y] is a vector
140 # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
141 # LP[n-1,y[n-1]]], and T.mean(LP[T.arange(y.shape[0]),y]) is
142 # the mean (across minibatch examples) of the elements in v,
143 # i.e., the mean log-likelihood across the minibatch.
144 return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
145 # end-snippet-2
146
147 def errors(self, y):
148 """Return a float representing the number of errors in the minibatch
149 over the total number of examples of the minibatch ; zero one
150 loss over the size of the minibatch
151
152 :type y: theano.tensor.TensorType
153 :param y: corresponds to a vector that gives for each example the
154 correct label
155 """
156
157 # check if y has same dimension of y_pred
158 if y.ndim != self.y_pred.ndim:
159 raise TypeError(
160 'y should have the same shape as self.y_pred',
161 ('y', y.type, 'y_pred', self.y_pred.type)
162 )
163 # check if y is of the correct datatype
164 if y.dtype.startswith('int'):
165 # the T.neq operator returns a vector of 0s and 1s, where 1
166 # represents a mistake in prediction
167 return T.mean(T.neq(self.y_pred, y))
168 else:
169 raise NotImplementedError()
170
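# Illustrative sketch: the indexing trick used in negative_log_likelihood,
# shown on plain numpy arrays. LP[arange(n), y] picks, for each row i, the
# log-probability of the correct class y[i]; the loss is minus its mean.
# The helper name is hypothetical and nothing in this script calls it.
def mean_nll_sketch(p_y_given_x, y):
    """p_y_given_x: (n, n_classes) probabilities; y: (n,) integer labels."""
    LP = numpy.log(p_y_given_x)                   # log-probabilities, one row per example
    picked = LP[numpy.arange(y.shape[0]), y]      # LP[i, y[i]] for every example i
    return -picked.mean()                         # mean negative log-likelihood
# e.g. mean_nll_sketch(numpy.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]]),
#                      numpy.array([0, 1])) == -(log(0.7) + log(0.8)) / 2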
171
172 def load_data(dataset):
173 ''' Loads the dataset
174
175 :type dataset: string
176 :param dataset: the path to the pickled dataset
177 '''
178 #############
179 # LOAD DATA #
180 #############
181 '''
182 # Download the MNIST dataset if it is not present
183 data_dir, data_file = os.path.split(dataset)
184 if data_dir == "" and not os.path.isfile(dataset):
185 # Check if dataset is in the data directory.
186 new_path = os.path.join(
187 os.path.split(__file__)[0],
188 "..",
189 "data",
190 dataset
191 )
192 if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
193 dataset = new_path
194
195 if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
196 import urllib
197 origin = (
198 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
199 )
200 print 'Downloading data from %s' % origin
201 urllib.urlretrieve(origin, dataset)
202
203 print '... loading data'
204
205 # Load the dataset
206 f = gzip.open(dataset, 'rb')
207 '''
208 f = open(dataset, 'rb')
209 train_set, valid_set, test_set = cPickle.load(f)
210 f.close()
211 # train_set, valid_set, test_set format: tuple(input, target)
212 # input is a numpy.ndarray of 2 dimensions (a matrix) in which each
213 # row corresponds to an example. target is a numpy.ndarray of
214 # 1 dimension (a vector) that has the same length as the number of
215 # rows in the input. It gives the target for the example with the
216 # same index in the input.
217
218 def shared_dataset(data_xy, borrow=True):
219 """ Function that loads the dataset into shared variables
220
221 The reason we store our dataset in shared variables is to allow
222 Theano to copy it into the GPU memory (when code is run on GPU).
223 Since copying data into the GPU is slow, copying a minibatch every time
224 one is needed (the default behaviour if the data is not in a shared
225 variable) would lead to a large decrease in performance.
226 """
227 data_x, data_y = data_xy
228 shared_x = theano.shared(numpy.asarray(data_x,
229 dtype=theano.config.floatX),
230 borrow=borrow)
231 shared_y = theano.shared(numpy.asarray(data_y,
232 dtype=theano.config.floatX),
233 borrow=borrow)
234 # When storing data on the GPU it has to be stored as floats
235 # therefore we will store the labels as ``floatX`` as well
236 # (``shared_y`` does exactly that). But during our computations
237 # we need them as ints (we use labels as indices, and if they are
238 # floats it doesn't make sense); therefore, instead of returning
239 # ``shared_y`` we will have to cast it to int. This little hack
240 # lets us get around this issue.
241 return shared_x, T.cast(shared_y, 'int32')
242
243 test_set_x, test_set_y = shared_dataset(test_set)
244 valid_set_x, valid_set_y = shared_dataset(valid_set)
245 train_set_x, train_set_y = shared_dataset(train_set)
246
247 rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
248 (test_set_x, test_set_y)]
249 return rval
250
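# Illustrative sketch: what shared_dataset does for the labels, as a standalone
# snippet. Values are stored on the device as floatX (a GPU requirement) and
# the graph is handed an int32 view via T.cast. The helper name is hypothetical
# and nothing in this script calls it.
def shared_labels_sketch(labels):
    shared_y = theano.shared(numpy.asarray(labels, dtype=theano.config.floatX),
                             borrow=True)
    # shared_y stores floats; the cast yields a symbolic int32 variable usable
    # as labels/indices, while the underlying storage stays floatX.
    return shared_y, T.cast(shared_y, 'int32')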
251
252 def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000,
253 dataset='mnist.pkl.gz',
254 batch_size=600):
255 """
256 Demonstrate stochastic gradient descent optimization of a log-linear
257 model
258
259 This is demonstrated on MNIST.
260
261 :type learning_rate: float
262 :param learning_rate: learning rate used (factor for the stochastic
263 gradient)
264
265 :type n_epochs: int
266 :param n_epochs: maximal number of epochs to run the optimizer
267
268 :type dataset: string
269 :param dataset: the path of the MNIST dataset file from
270 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
271
272 """
273 datasets = load_data(dataset)
274
275 train_set_x, train_set_y = datasets[0]
276 valid_set_x, valid_set_y = datasets[1]
277 test_set_x, test_set_y = datasets[2]
278
279 # compute number of minibatches for training, validation and testing
280 n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
281 n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
282 n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
283
284 ######################
285 # BUILD ACTUAL MODEL #
286 ######################
287 print '... building the model'
288
289 # allocate symbolic variables for the data
290 index = T.lscalar() # index to a [mini]batch
291
292 # generate symbolic variables for input (x and y represent a
293 # minibatch)
294 x = T.matrix('x') # data, presented as rasterized images
295 y = T.ivector('y') # labels, presented as 1D vector of [int] labels
296
297 # construct the logistic regression class
298 # Each MNIST image has size 28*28
299 classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
300
301 # the cost we minimize during training is the negative log likelihood of
302 # the model in symbolic format
303 cost = classifier.negative_log_likelihood(y)
304
305 # compiling a Theano function that computes the mistakes that are made by
306 # the model on a minibatch
307 test_model = theano.function(
308 inputs=[index],
309 outputs=classifier.errors(y),
310 givens={
311 x: test_set_x[index * batch_size: (index + 1) * batch_size],
312 y: test_set_y[index * batch_size: (index + 1) * batch_size]
313 }
314 )
315
316 validate_model = theano.function(
317 inputs=[index],
318 outputs=classifier.errors(y),
319 givens={
320 x: valid_set_x[index * batch_size: (index + 1) * batch_size],
321 y: valid_set_y[index * batch_size: (index + 1) * batch_size]
322 }
323 )
324
325 # compute the gradient of cost with respect to theta = (W,b)
326 g_W = T.grad(cost=cost, wrt=classifier.W)
327 g_b = T.grad(cost=cost, wrt=classifier.b)
328
329 # start-snippet-3
330 # specify how to update the parameters of the model as a list of
331 # (variable, update expression) pairs.
332 updates = [(classifier.W, classifier.W - learning_rate * g_W),
333 (classifier.b, classifier.b - learning_rate * g_b)]
334
335 # compiling a Theano function `train_model` that returns the cost but, at
336 # the same time, updates the parameters of the model based on the rules
337 # defined in `updates`
338 train_model = theano.function(
339 inputs=[index],
340 outputs=cost,
341 updates=updates,
342 givens={
343 x: train_set_x[index * batch_size: (index + 1) * batch_size],
344 y: train_set_y[index * batch_size: (index + 1) * batch_size]
345 }
346 )
347 # end-snippet-3
348
349 ###############
350 # TRAIN MODEL #
351 ###############
352 print '... training the model'
353 # early-stopping parameters
354 patience = 5000 # look at this many examples regardless
355 patience_increase = 2 # wait this much longer when a new best is
356 # found
357 improvement_threshold = 0.995 # a relative improvement of this much is
358 # considered significant
359 validation_frequency = min(n_train_batches, patience / 2)
360 # go through this many
361 # minibatches before checking the network
362 # on the validation set; in this case we
363 # check every epoch
364
365 best_validation_loss = numpy.inf
366 test_score = 0.
367 start_time = timeit.default_timer()
368
369 done_looping = False
370 epoch = 0
371 while (epoch < n_epochs) and (not done_looping):
372 epoch = epoch + 1
373 for minibatch_index in xrange(n_train_batches):
374
375 minibatch_avg_cost = train_model(minibatch_index)
376 # iteration number
377 iter = (epoch - 1) * n_train_batches + minibatch_index
378
379 if (iter + 1) % validation_frequency == 0:
380 # compute zero-one loss on validation set
381 validation_losses = [validate_model(i)
382 for i in xrange(n_valid_batches)]
383 this_validation_loss = numpy.mean(validation_losses)
384
385 print(
386 'epoch %i, minibatch %i/%i, validation error %f %%' %
387 (
388 epoch,
389 minibatch_index + 1,
390 n_train_batches,
391 this_validation_loss * 100.
392 )
393 )
394
395 # if we got the best validation score until now
396 if this_validation_loss < best_validation_loss:
397 #improve patience if loss improvement is good enough
398 if this_validation_loss < best_validation_loss * \
399 improvement_threshold:
400 patience = max(patience, iter * patience_increase)
401
402 best_validation_loss = this_validation_loss
403 # test it on the test set
404
405 test_losses = [test_model(i)
406 for i in xrange(n_test_batches)]
407 test_score = numpy.mean(test_losses)
408
409 print(
410 (
411 ' epoch %i, minibatch %i/%i, test error of'
412 ' best model %f %%'
413 ) %
414 (
415 epoch,
416 minibatch_index + 1,
417 n_train_batches,
418 test_score * 100.
419 )
420 )
421
422 # save the best model
423 with open('best_model.pkl', 'wb') as f:
424 cPickle.dump(classifier, f)
425
426 if patience <= iter:
427 done_looping = True
428 break
429
430 end_time = timeit.default_timer()
431 print(
432 (
433 'Optimization complete with best validation score of %f %%,'
434 ' with test performance %f %%'
435 )
436 % (best_validation_loss * 100., test_score * 100.)
437 )
438 print 'The code ran for %d epochs, with %f epochs/sec' % (
439 epoch, 1. * epoch / (end_time - start_time))
440 print >> sys.stderr, ('The code for file ' +
441 os.path.split(__file__)[1] +
442 ' ran for %.1fs' % ((end_time - start_time)))
443
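# Illustrative sketches of the two ideas driving the training loop above,
# written as plain Python. The helper names are hypothetical and nothing in
# this script calls them.

# (1) The updates compiled into `train_model` are vanilla gradient descent:
#     each parameter moves against its gradient, scaled by the learning rate.
def sgd_step_sketch(W, b, g_W, g_b, learning_rate=0.13):
    return W - learning_rate * g_W, b - learning_rate * g_b

# (2) Early stopping: a significant improvement of the validation loss (by more
#     than improvement_threshold) extends the patience; training stops once the
#     iteration count exceeds it.
def patience_update_sketch(patience, iteration, this_loss, best_loss,
                           patience_increase=2, improvement_threshold=0.995):
    if this_loss < best_loss * improvement_threshold:
        patience = max(patience, iteration * patience_increase)
    done_looping = patience <= iteration
    return patience, done_looping
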
444
445 def predict():
446 """
447 An example of how to load a trained model and use it
448 to predict labels.
449 """
450
451 # load the saved model
452 classifier = cPickle.load(open('best_model.pkl', 'rb'))
453
454 # compile a predictor function
455 predict_model = theano.function(
456 inputs=[classifier.input],
457 outputs=classifier.y_pred)
458
459 # We can test it on some examples from the test set
460 dataset='mnist.pkl.gz'
461 datasets = load_data(dataset)
462 test_set_x, test_set_y = datasets[2]
463 test_set_x = test_set_x.get_value()
464
465 predicted_values = predict_model(test_set_x[:10])
466 print ("Predicted values for the first 10 examples in test set:")
467 print predicted_values
468
469
470 if __name__ == '__main__':
471 sgd_optimization_mnist()
472
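# Typical usage, assuming a compatible pickled dataset at the default path:
# running the script trains the model and writes best_model.pkl; predict()
# can then be called from an interactive session, e.g.
#
#     python logistic_sgd.py
#     >>> from logistic_sgd import predict
#     >>> predict()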