view Code/wmf.py @ 18:c0a08cbdfacd
First script
| author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
| --- | --- |
| date | Tue, 28 Jul 2015 20:58:57 +0100 |
| parents | 20589ba1908a |
| children | |
line source
import numpy as np
import time
import itertools


def linear_surplus_confidence_matrix(B, alpha):
    # To construct the surplus confidence matrix, we need to operate
    # only on the nonzero elements.
    # This is not possible: S = alpha * B
    S = B.copy()
    S.data = alpha * S.data
    return S


def log_surplus_confidence_matrix(B, alpha, epsilon):
    # To construct the surplus confidence matrix, we need to operate
    # only on the nonzero elements.
    # This is not possible: S = alpha * np.log(1 + B / epsilon)
    S = B.copy()
    S.data = alpha * np.log(1 + S.data / epsilon)
    return S


def iter_rows(S):
    """
    Helper function to iterate quickly over the data and indices of the
    rows of the S matrix. A naive implementation using indexing on S is
    much, much slower.
    """
    for i in xrange(S.shape[0]):
        lo, hi = S.indptr[i], S.indptr[i + 1]
        yield i, S.data[lo:hi], S.indices[lo:hi]


def recompute_factors(Y, S, lambda_reg, dtype='float32'):
    """
    recompute matrix X from Y.
    X = recompute_factors(Y, S, lambda_reg)
    This can also be used for the reverse operation as follows:
    Y = recompute_factors(X, ST, lambda_reg)

    The comments are in terms of X being the users and Y being the items.
    """
    m = S.shape[0]  # m = number of users
    f = Y.shape[1]  # f = number of factors
    YTY = np.dot(Y.T, Y)  # precompute this
    YTYpI = YTY + lambda_reg * np.eye(f)
    X_new = np.zeros((m, f), dtype=dtype)

    for k, s_u, i_u in iter_rows(S):
        Y_u = Y[i_u]  # exploit sparsity
        A = np.dot(s_u + 1, Y_u)
        YTSY = np.dot(Y_u.T, (Y_u * s_u.reshape(-1, 1)))
        B = YTSY + YTYpI

        # Binv = np.linalg.inv(B)
        # X_new[k] = np.dot(A, Binv)
        # solving instead of inverting doesn't seem to make much of a
        # difference in terms of speed, but w/e
        X_new[k] = np.linalg.solve(B.T, A.T).T

    return X_new


def recompute_factors_bias(Y, S, lambda_reg, dtype='float32'):
    """
    Like recompute_factors, but the last column of X and Y is treated
    as a bias vector.
    """
    m = S.shape[0]  # m = number of users
    f = Y.shape[1] - 1  # f = number of factors

    b_y = Y[:, f]  # vector of biases

    Y_e = Y.copy()
    Y_e[:, f] = 1  # factors with added column of ones

    YTY = np.dot(Y_e.T, Y_e)  # precompute this

    R = np.eye(f + 1)  # regularization matrix
    R[f, f] = 0  # don't regularize the biases!
    R *= lambda_reg

    YTYpR = YTY + R

    byY = np.dot(b_y, Y_e)  # precompute this as well

    X_new = np.zeros((m, f + 1), dtype=dtype)

    for k, s_u, i_u in iter_rows(S):
        Y_u = Y_e[i_u]  # exploit sparsity
        b_y_u = b_y[i_u]

        A = np.dot((1 - b_y_u) * s_u + 1, Y_u)
        A -= byY

        YTSY = np.dot(Y_u.T, (Y_u * s_u[:, None]))
        B = YTSY + YTYpR

        # Binv = np.linalg.inv(B)
        # X_new[k] = np.dot(A, Binv)
        # solving instead of inverting doesn't seem to make much of a
        # difference in terms of speed, but w/e
        X_new[k] = np.linalg.solve(B.T, A.T).T

    return X_new


def factorize(S, num_factors, lambda_reg=1e-5, num_iterations=20,
              init_std=0.01, verbose=False, dtype='float32',
              recompute_factors=recompute_factors, *args, **kwargs):
    """
    factorize a given sparse matrix using the Weighted Matrix
    Factorization algorithm by Hu, Koren and Volinsky.

    S: 'surplus' confidence matrix, i.e. C - I where C is the matrix
    with confidence weights. S is sparse while C is not (and the
    sparsity pattern of S is the same as that of the preference matrix,
    so it doesn't need to be specified separately).

    num_factors: the number of factors.

    lambda_reg: the value of the regularization constant.

    num_iterations: the number of iterations to run the algorithm for.
    Each iteration consists of two steps, one to recompute U given V,
    and one to recompute V given U.

    init_std: the standard deviation of the Gaussian with which V is
    initialized.

    verbose: print a bunch of stuff during training, including timing
    information.

    dtype: the dtype of the resulting factor matrices. Using single
    precision is recommended, it speeds things up a bit.

    recompute_factors: helper function that implements the inner loop.

    returns:
        U, V: factor matrices. If recompute_factors_bias is used, the
        last columns of the matrices contain the biases.
    """
    num_users, num_items = S.shape

    if verbose:
        print "precompute transpose"
        start_time = time.time()

    ST = S.T.tocsr()

    if verbose:
        print "  took %.3f seconds" % (time.time() - start_time)
        print "run ALS algorithm"
        start_time = time.time()

    U = None  # no need to initialize U, it will be overwritten anyway
    V = np.random.randn(num_items, num_factors).astype(dtype) * init_std

    for i in xrange(num_iterations):
        if verbose:
            print "  iteration %d" % i
            print "    recompute user factors U"

        U = recompute_factors(V, S, lambda_reg, dtype, *args, **kwargs)

        if verbose:
            print "    time since start: %.3f seconds" % (time.time() - start_time)
            print "    recompute item factors V"

        V = recompute_factors(U, ST, lambda_reg, dtype, *args, **kwargs)

        if verbose:
            print "    time since start: %.3f seconds" % (time.time() - start_time)

    return U, V
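For context, each per-row solve inside recompute_factors is the ALS update from Hu, Koren and Volinsky, x_u = (YᵀY + Y_uᵀS_uY_u + λI)⁻¹ Y_uᵀ(s_u + 1), which touches only the nonzero entries of each row of S. The sketch below shows one way the module could be driven end to end; it assumes Python 2 with SciPy installed and that this file is importable as `wmf`. The toy matrix and hyperparameter values are illustrative, not taken from the repository.

```python
import numpy as np
import scipy.sparse

import wmf  # this file, Code/wmf.py

# Hypothetical toy play-count matrix: 4 users x 5 items, in CSR format
# (iter_rows relies on the indptr/indices/data attributes of CSR).
B = scipy.sparse.csr_matrix(np.array([
    [3, 0, 0, 1, 0],
    [0, 5, 0, 0, 2],
    [1, 0, 4, 0, 0],
    [0, 2, 0, 0, 6],
], dtype='float32'))

# Surplus confidence matrix S = C - I over the nonzeros only, using the
# logarithmic weighting; alpha and epsilon here are arbitrary choices.
S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)

# Alternating least squares with illustrative hyperparameters.
U, V = wmf.factorize(S, num_factors=3, lambda_reg=1e-5,
                     num_iterations=10, verbose=True)

# Predicted preference scores are inner products of user and item factors.
scores = np.dot(U, V.T)
```

Passing recompute_factors=wmf.recompute_factors_bias to factorize would instead fit factors with per-user and per-item bias terms in the last column of U and V.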