# HG changeset patch
# User Paulo Chiliguano
# Date 1437387721 -3600
# Node ID 20589ba1908a996297ddcf60fca3cf76683b4e2f
# Parent 38f44dd7e54bd562d07084435c064f0dd79c6dec
Uploading Weight Matrix Factorization script
diff -r 38f44dd7e54b -r 20589ba1908a Code/preview_clip.py
--- a/Code/preview_clip.py Fri Jul 17 21:52:18 2015 +0100
+++ b/Code/preview_clip.py Mon Jul 20 11:22:01 2015 +0100
@@ -52,7 +52,7 @@
except:
time.sleep(22)
print([next[:-2], 'NA'])
- writer.writerow([next[:-2], 'NA', s.artist_name, s.title])
+ writer.writerow([next[:-2], 'NA', s.artist_name.encode("utf-8"), s.title.encode("utf-8")])
else:
time.sleep(22)
print([next[:-2], preview_url, s.artist_name, s.title])
diff -r 38f44dd7e54b -r 20589ba1908a Code/preview_mp3.py
--- a/Code/preview_mp3.py Fri Jul 17 21:52:18 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Spyder Editor
-
-This is a temporary script file.
-"""
-
-
-
-import oauth2 as oauth
-import csv
-import time
-import webbrowser
-
-consumer_key = '7ds28qendsk9'
-consumer_secret = 'm5nsktn3hu6x45cy'
-consumer = oauth.Consumer(consumer_key, consumer_secret)
-
-with open('C:\Users\Paulo\Documents\Queen Mary\MSc Project\Dataset\echonest.txt', 'rb') as input:
- idreader = csv.reader(input)
- for row in idreader:
- request_url = row[1]
- req = oauth.Request(method="GET", url=request_url,is_form_encoded=True)
- req['oauth_timestamp'] = oauth.Request.make_timestamp()
- req['oauth_nonce'] = oauth.Request.make_nonce()
- req['country'] = "GB"
- sig_method = oauth.SignatureMethod_HMAC_SHA1()
- req.sign_request(sig_method, consumer, token=None)
- #print req.to_url()
- new = 2 # open in a new tab, if possible
- # open a public URL, in this case, the webbrowser docs
- webbrowser.open(req.to_url(),new=new)
- time.sleep(22)
-
-
diff -r 38f44dd7e54b -r 20589ba1908a Code/wmf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/wmf.py Mon Jul 20 11:22:01 2015 +0100
@@ -0,0 +1,166 @@
+import numpy as np
+import time
+import itertools
+
+
+
+def linear_surplus_confidence_matrix(B, alpha):
+ # To construct the surplus confidence matrix, we need to operate only on the nonzero elements.
+ # This is not possible: S = alpha * B
+ S = B.copy()
+ S.data = alpha * S.data
+ return S
+
+
+def log_surplus_confidence_matrix(B, alpha, epsilon):
+ # To construct the surplus confidence matrix, we need to operate only on the nonzero elements.
+ # This is not possible: S = alpha * np.log(1 + B / epsilon)
+ S = B.copy()
+ S.data = alpha * np.log(1 + S.data / epsilon)
+ return S
+
+
+
+def iter_rows(S):
+ """
+ Helper function to iterate quickly over the data and indices of the
+ rows of the S matrix. A naive implementation using indexing
+ on S is much, much slower.
+ """
+ for i in xrange(S.shape[0]):
+ lo, hi = S.indptr[i], S.indptr[i + 1]
+ yield i, S.data[lo:hi], S.indices[lo:hi]
+
+
+def recompute_factors(Y, S, lambda_reg, dtype='float32'):
+ """
+ recompute matrix X from Y.
+ X = recompute_factors(Y, S, lambda_reg)
+ This can also be used for the reverse operation as follows:
+ Y = recompute_factors(X, ST, lambda_reg)
+
+ The comments are in terms of X being the users and Y being the items.
+ """
+ m = S.shape[0] # m = number of users
+ f = Y.shape[1] # f = number of factors
+ YTY = np.dot(Y.T, Y) # precompute this
+ YTYpI = YTY + lambda_reg * np.eye(f)
+ X_new = np.zeros((m, f), dtype=dtype)
+
+ for k, s_u, i_u in iter_rows(S):
+ Y_u = Y[i_u] # exploit sparsity
+ A = np.dot(s_u + 1, Y_u)
+ YTSY = np.dot(Y_u.T, (Y_u * s_u.reshape(-1, 1)))
+ B = YTSY + YTYpI
+
+ # Binv = np.linalg.inv(B)
+ # X_new[k] = np.dot(A, Binv)
+ X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e
+
+ return X_new
+
+
+
+def recompute_factors_bias(Y, S, lambda_reg, dtype='float32'):
+ """
+ Like recompute_factors, but the last column of X and Y is treated as
+ a bias vector.
+ """
+ m = S.shape[0] # m = number of users
+ f = Y.shape[1] - 1 # f = number of factors
+
+ b_y = Y[:, f] # vector of biases
+
+ Y_e = Y.copy()
+ Y_e[:, f] = 1 # factors with added column of ones
+
+ YTY = np.dot(Y_e.T, Y_e) # precompute this
+
+ R = np.eye(f + 1) # regularization matrix
+ R[f, f] = 0 # don't regularize the biases!
+ R *= lambda_reg
+
+ YTYpR = YTY + R
+
+ byY = np.dot(b_y, Y_e) # precompute this as well
+
+ X_new = np.zeros((m, f + 1), dtype=dtype)
+
+ for k, s_u, i_u in iter_rows(S):
+ Y_u = Y_e[i_u] # exploit sparsity
+ b_y_u = b_y[i_u]
+ A = np.dot((1 - b_y_u) * s_u + 1, Y_u)
+ A -= byY
+
+ YTSY = np.dot(Y_u.T, (Y_u * s_u[:, None]))
+ B = YTSY + YTYpR
+
+ # Binv = np.linalg.inv(B)
+ # X_new[k] = np.dot(A, Binv)
+ X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e
+
+ return X_new
+
+
+
+def factorize(S, num_factors, lambda_reg=1e-5, num_iterations=20, init_std=0.01, verbose=False, dtype='float32', recompute_factors=recompute_factors, *args, **kwargs):
+ """
+ factorize a given sparse matrix using the Weighted Matrix Factorization algorithm by
+ Hu, Koren and Volinsky.
+
+ S: 'surplus' confidence matrix, i.e. C - I where C is the matrix with confidence weights.
+ S is sparse while C is not (and the sparsity pattern of S is the same as that of
+ the preference matrix, so it doesn't need to be specified separately).
+
+ num_factors: the number of factors.
+
+ lambda_reg: the value of the regularization constant.
+
+ num_iterations: the number of iterations to run the algorithm for. Each iteration consists
+ of two steps, one to recompute U given V, and one to recompute V given U.
+
+ init_std: the standard deviation of the Gaussian with which V is initialized.
+
+ verbose: print a bunch of stuff during training, including timing information.
+
+ dtype: the dtype of the resulting factor matrices. Using single precision is recommended,
+ it speeds things up a bit.
+
+ recompute_factors: helper function that implements the inner loop.
+
+ returns:
+ U, V: factor matrices. If bias=True, the last columns of the matrices contain the biases.
+ """
+ num_users, num_items = S.shape
+
+ if verbose:
+ print "precompute transpose"
+ start_time = time.time()
+
+ ST = S.T.tocsr()
+
+ if verbose:
+ print " took %.3f seconds" % (time.time() - start_time)
+ print "run ALS algorithm"
+ start_time = time.time()
+
+ U = None # no need to initialize U, it will be overwritten anyway
+ V = np.random.randn(num_items, num_factors).astype(dtype) * init_std
+
+ for i in xrange(num_iterations):
+ if verbose:
+ print " iteration %d" % i
+ print " recompute user factors U"
+
+ U = recompute_factors(V, S, lambda_reg, dtype, *args, **kwargs)
+
+ if verbose:
+ print " time since start: %.3f seconds" % (time.time() - start_time)
+ print " recompute item factors V"
+
+ V = recompute_factors(U, ST, lambda_reg, dtype, *args, **kwargs)
+
+ if verbose:
+ print " time since start: %.3f seconds" % (time.time() - start_time)
+
+ return U, V