Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
changeset 12:20589ba1908a
Uploading Weight Matrix Factorization script
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Mon, 20 Jul 2015 11:22:01 +0100 |
parents | 38f44dd7e54b |
children | 0a0d6203638a |
files | Code/preview_clip.py Code/preview_mp3.py Code/wmf.py |
diffstat | 3 files changed, 167 insertions(+), 36 deletions(-) [+] |
line wrap: on
line diff
--- a/Code/preview_clip.py Fri Jul 17 21:52:18 2015 +0100 +++ b/Code/preview_clip.py Mon Jul 20 11:22:01 2015 +0100 @@ -52,7 +52,7 @@ except: time.sleep(22) print([next[:-2], 'NA']) - writer.writerow([next[:-2], 'NA', s.artist_name, s.title]) + writer.writerow([next[:-2], 'NA', s.artist_name.encode("utf-8"), s.title.encode("utf-8")]) else: time.sleep(22) print([next[:-2], preview_url, s.artist_name, s.title])
--- a/Code/preview_mp3.py Fri Jul 17 21:52:18 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Spyder Editor - -This is a temporary script file. -""" - - - -import oauth2 as oauth -import csv -import time -import webbrowser - -consumer_key = '7ds28qendsk9' -consumer_secret = 'm5nsktn3hu6x45cy' -consumer = oauth.Consumer(consumer_key, consumer_secret) - -with open('C:\Users\Paulo\Documents\Queen Mary\MSc Project\Dataset\echonest.txt', 'rb') as input: - idreader = csv.reader(input) - for row in idreader: - request_url = row[1] - req = oauth.Request(method="GET", url=request_url,is_form_encoded=True) - req['oauth_timestamp'] = oauth.Request.make_timestamp() - req['oauth_nonce'] = oauth.Request.make_nonce() - req['country'] = "GB" - sig_method = oauth.SignatureMethod_HMAC_SHA1() - req.sign_request(sig_method, consumer, token=None) - #print req.to_url() - new = 2 # open in a new tab, if possible - # open a public URL, in this case, the webbrowser docs - webbrowser.open(req.to_url(),new=new) - time.sleep(22) - -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Code/wmf.py Mon Jul 20 11:22:01 2015 +0100 @@ -0,0 +1,166 @@ +import numpy as np +import time +import itertools + + + +def linear_surplus_confidence_matrix(B, alpha): + # To construct the surplus confidence matrix, we need to operate only on the nonzero elements. + # This is not possible: S = alpha * B + S = B.copy() + S.data = alpha * S.data + return S + + +def log_surplus_confidence_matrix(B, alpha, epsilon): + # To construct the surplus confidence matrix, we need to operate only on the nonzero elements. + # This is not possible: S = alpha * np.log(1 + B / epsilon) + S = B.copy() + S.data = alpha * np.log(1 + S.data / epsilon) + return S + + + +def iter_rows(S): + """ + Helper function to iterate quickly over the data and indices of the + rows of the S matrix. A naive implementation using indexing + on S is much, much slower. + """ + for i in xrange(S.shape[0]): + lo, hi = S.indptr[i], S.indptr[i + 1] + yield i, S.data[lo:hi], S.indices[lo:hi] + + +def recompute_factors(Y, S, lambda_reg, dtype='float32'): + """ + recompute matrix X from Y. + X = recompute_factors(Y, S, lambda_reg) + This can also be used for the reverse operation as follows: + Y = recompute_factors(X, ST, lambda_reg) + + The comments are in terms of X being the users and Y being the items. + """ + m = S.shape[0] # m = number of users + f = Y.shape[1] # f = number of factors + YTY = np.dot(Y.T, Y) # precompute this + YTYpI = YTY + lambda_reg * np.eye(f) + X_new = np.zeros((m, f), dtype=dtype) + + for k, s_u, i_u in iter_rows(S): + Y_u = Y[i_u] # exploit sparsity + A = np.dot(s_u + 1, Y_u) + YTSY = np.dot(Y_u.T, (Y_u * s_u.reshape(-1, 1))) + B = YTSY + YTYpI + + # Binv = np.linalg.inv(B) + # X_new[k] = np.dot(A, Binv) + X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e + + return X_new + + + +def recompute_factors_bias(Y, S, lambda_reg, dtype='float32'): + """ + Like recompute_factors, but the last column of X and Y is treated as + a bias vector. + """ + m = S.shape[0] # m = number of users + f = Y.shape[1] - 1 # f = number of factors + + b_y = Y[:, f] # vector of biases + + Y_e = Y.copy() + Y_e[:, f] = 1 # factors with added column of ones + + YTY = np.dot(Y_e.T, Y_e) # precompute this + + R = np.eye(f + 1) # regularization matrix + R[f, f] = 0 # don't regularize the biases! + R *= lambda_reg + + YTYpR = YTY + R + + byY = np.dot(b_y, Y_e) # precompute this as well + + X_new = np.zeros((m, f + 1), dtype=dtype) + + for k, s_u, i_u in iter_rows(S): + Y_u = Y_e[i_u] # exploit sparsity + b_y_u = b_y[i_u] + A = np.dot((1 - b_y_u) * s_u + 1, Y_u) + A -= byY + + YTSY = np.dot(Y_u.T, (Y_u * s_u[:, None])) + B = YTSY + YTYpR + + # Binv = np.linalg.inv(B) + # X_new[k] = np.dot(A, Binv) + X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e + + return X_new + + + +def factorize(S, num_factors, lambda_reg=1e-5, num_iterations=20, init_std=0.01, verbose=False, dtype='float32', recompute_factors=recompute_factors, *args, **kwargs): + """ + factorize a given sparse matrix using the Weighted Matrix Factorization algorithm by + Hu, Koren and Volinsky. + + S: 'surplus' confidence matrix, i.e. C - I where C is the matrix with confidence weights. + S is sparse while C is not (and the sparsity pattern of S is the same as that of + the preference matrix, so it doesn't need to be specified separately). + + num_factors: the number of factors. + + lambda_reg: the value of the regularization constant. + + num_iterations: the number of iterations to run the algorithm for. Each iteration consists + of two steps, one to recompute U given V, and one to recompute V given U. + + init_std: the standard deviation of the Gaussian with which V is initialized. + + verbose: print a bunch of stuff during training, including timing information. + + dtype: the dtype of the resulting factor matrices. Using single precision is recommended, + it speeds things up a bit. + + recompute_factors: helper function that implements the inner loop. + + returns: + U, V: factor matrices. If bias=True, the last columns of the matrices contain the biases. + """ + num_users, num_items = S.shape + + if verbose: + print "precompute transpose" + start_time = time.time() + + ST = S.T.tocsr() + + if verbose: + print " took %.3f seconds" % (time.time() - start_time) + print "run ALS algorithm" + start_time = time.time() + + U = None # no need to initialize U, it will be overwritten anyway + V = np.random.randn(num_items, num_factors).astype(dtype) * init_std + + for i in xrange(num_iterations): + if verbose: + print " iteration %d" % i + print " recompute user factors U" + + U = recompute_factors(V, S, lambda_reg, dtype, *args, **kwargs) + + if verbose: + print " time since start: %.3f seconds" % (time.time() - start_time) + print " recompute item factors V" + + V = recompute_factors(U, ST, lambda_reg, dtype, *args, **kwargs) + + if verbose: + print " time since start: %.3f seconds" % (time.time() - start_time) + + return U, V