changeset 22:ce1e76c21c12

Removed library for matrix factorization
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Tue, 11 Aug 2015 10:56:51 +0100
parents e68dbee1f6db
children 45e6f85d0ba4
files Code/wmf.py
diffstat 1 files changed, 0 insertions(+), 166 deletions(-) [+]
line wrap: on
line diff
--- a/Code/wmf.py	Tue Aug 11 10:50:36 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,166 +0,0 @@
-import numpy as np
-import time
-import itertools
-
-
-
-def linear_surplus_confidence_matrix(B, alpha):
-    # To construct the surplus confidence matrix, we need to operate only on the nonzero elements.
-    # This is not possible: S = alpha * B
-    S = B.copy()
-    S.data = alpha * S.data
-    return S
-
-
-def log_surplus_confidence_matrix(B, alpha, epsilon):
-    # To construct the surplus confidence matrix, we need to operate only on the nonzero elements.
-    # This is not possible: S = alpha * np.log(1 + B / epsilon)
-    S = B.copy()
-    S.data = alpha * np.log(1 + S.data / epsilon)
-    return S
-
-
-
-def iter_rows(S):
-    """
-    Helper function to iterate quickly over the data and indices of the
-    rows of the S matrix. A naive implementation using indexing
-    on S is much, much slower.
-    """
-    for i in xrange(S.shape[0]):
-        lo, hi = S.indptr[i], S.indptr[i + 1]
-        yield i, S.data[lo:hi], S.indices[lo:hi]
-
-
-def recompute_factors(Y, S, lambda_reg, dtype='float32'):
-    """
-    recompute matrix X from Y.
-    X = recompute_factors(Y, S, lambda_reg)
-    This can also be used for the reverse operation as follows:
-    Y = recompute_factors(X, ST, lambda_reg)
-    
-    The comments are in terms of X being the users and Y being the items.
-    """
-    m = S.shape[0] # m = number of users
-    f = Y.shape[1] # f = number of factors
-    YTY = np.dot(Y.T, Y) # precompute this
-    YTYpI = YTY + lambda_reg * np.eye(f)
-    X_new = np.zeros((m, f), dtype=dtype)
-
-    for k, s_u, i_u in iter_rows(S):
-        Y_u = Y[i_u] # exploit sparsity
-        A = np.dot(s_u + 1, Y_u)
-        YTSY = np.dot(Y_u.T, (Y_u * s_u.reshape(-1, 1)))
-        B = YTSY + YTYpI
-
-        # Binv = np.linalg.inv(B)
-        # X_new[k] = np.dot(A, Binv) 
-        X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e
-
-    return X_new
-
-
-
-def recompute_factors_bias(Y, S, lambda_reg, dtype='float32'):
-    """
-    Like recompute_factors, but the last column of X and Y is treated as
-    a bias vector.
-    """
-    m = S.shape[0] # m = number of users
-    f = Y.shape[1] - 1 # f = number of factors
-    
-    b_y = Y[:, f] # vector of biases
-
-    Y_e = Y.copy()
-    Y_e[:, f] = 1 # factors with added column of ones
-
-    YTY = np.dot(Y_e.T, Y_e) # precompute this
-
-    R = np.eye(f + 1) # regularization matrix
-    R[f, f] = 0 # don't regularize the biases!
-    R *= lambda_reg
-
-    YTYpR = YTY + R
-
-    byY = np.dot(b_y, Y_e) # precompute this as well
-
-    X_new = np.zeros((m, f + 1), dtype=dtype)
-
-    for k, s_u, i_u in iter_rows(S):
-        Y_u = Y_e[i_u] # exploit sparsity
-        b_y_u = b_y[i_u]
-        A = np.dot((1 - b_y_u) * s_u + 1, Y_u)
-        A -= byY
-
-        YTSY = np.dot(Y_u.T, (Y_u * s_u[:, None]))
-        B = YTSY + YTYpR
-
-        # Binv = np.linalg.inv(B)
-        # X_new[k] = np.dot(A, Binv) 
-        X_new[k] = np.linalg.solve(B.T, A.T).T # doesn't seem to make much of a difference in terms of speed, but w/e
-
-    return X_new
-
-
-
-def factorize(S, num_factors, lambda_reg=1e-5, num_iterations=20, init_std=0.01, verbose=False, dtype='float32', recompute_factors=recompute_factors, *args, **kwargs):
-    """
-    factorize a given sparse matrix using the Weighted Matrix Factorization algorithm by
-    Hu, Koren and Volinsky.
-
-    S: 'surplus' confidence matrix, i.e. C - I where C is the matrix with confidence weights.
-        S is sparse while C is not (and the sparsity pattern of S is the same as that of
-        the preference matrix, so it doesn't need to be specified separately).
-
-    num_factors: the number of factors.
-
-    lambda_reg: the value of the regularization constant.
-
-    num_iterations: the number of iterations to run the algorithm for. Each iteration consists
-        of two steps, one to recompute U given V, and one to recompute V given U.
-
-    init_std: the standard deviation of the Gaussian with which V is initialized.
-
-    verbose: print a bunch of stuff during training, including timing information.
-
-    dtype: the dtype of the resulting factor matrices. Using single precision is recommended,
-        it speeds things up a bit.
-
-    recompute_factors: helper function that implements the inner loop.
-
-    returns:
-        U, V: factor matrices. If bias=True, the last columns of the matrices contain the biases.
-    """
-    num_users, num_items = S.shape
-
-    if verbose:
-        print "precompute transpose"
-        start_time = time.time()
-
-    ST = S.T.tocsr()
-
-    if verbose:
-        print "  took %.3f seconds" % (time.time() - start_time)
-        print "run ALS algorithm"
-        start_time = time.time()
-
-    U = None # no need to initialize U, it will be overwritten anyway
-    V = np.random.randn(num_items, num_factors).astype(dtype) * init_std
-
-    for i in xrange(num_iterations):
-        if verbose:
-            print "  iteration %d" % i
-            print "    recompute user factors U"
-
-        U = recompute_factors(V, S, lambda_reg, dtype, *args, **kwargs)
-
-        if verbose:
-            print "    time since start: %.3f seconds" % (time.time() - start_time)
-            print "    recompute item factors V"
-
-        V = recompute_factors(U, ST, lambda_reg, dtype, *args, **kwargs)
-
-        if verbose:
-            print "    time since start: %.3f seconds" % (time.time() - start_time)
-
-    return U, V