#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
#$Id$
"""
PyMF CUR Decomposition [1]

CURSL(SVD) : Class for CUR Decomposition (uses statistical leverage based sampling)

[1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms III: Computing
a Compressed Approximate Matrix Decomposition', SIAM J. Computing 36(1), 184-206.
"""


import numpy as np
import scipy.sparse

from svd import pinv, SVD
from cmd import CMD

__all__ = ["CURSL"]

class CURSL(CMD):
    """
    CURSL(data, k=-1, rrank=0, crank=0)

    CUR/CMD Decomposition. Factorize a data matrix into three matrices s.t.
    F = | data - USV| is minimal. CURSL randomly selects rows and columns from
    data for building U and V, respectively. The importance sampling is based
    on a statistical leverage score from the top-k singular vectors (k is
    currently set to 4/5*rrank and 4/5*crank).

    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank: int, optional
        Number of rows to sample from data.
        4 (default)
    crank: int, optional
        Number of columns to sample from data.
        4 (default)
    show_progress: bool, optional
        Print some extra information
        False (default)

    Attributes
    ----------
    U,S,V : submatrices s.t. data = USV (or _C _U _R)

    Example
    -------
    >>> import numpy as np
    >>> from cursl import CURSL
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cur_mdl = CURSL(data, show_progress=False, rrank=1, crank=2)
    >>> cur_mdl.factorize()
    """

    def __init__(self, data, k=-1, rrank=0, crank=0):
        # BUG FIX: the original forwarded crank=rrank, silently discarding the
        # caller's crank value; forward crank=crank instead.
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

    def sample_probability(self):
        """Compute row/column sampling probabilities via statistical leverage.

        Returns
        -------
        (prow, pcol) : tuple of column vectors (shape (-1, 1)) with the
            sampling probability for each row / column of the data matrix;
            each vector is normalized to sum to 1.
        """
        def comp_prob(d, k):
            # Statistical leverage score from the top-c right singular
            # vectors, with c = round(k - k/5), i.e. roughly 4/5 * k.
            # Cast to int: np.round returns a float, but the SVD rank
            # parameter is a count.
            c = int(np.round(k - k / 5.0))

            svd_mdl = SVD(d, k=c)
            svd_mdl.factorize()

            if scipy.sparse.issparse(self.data):
                # element-wise square of V (sparse-safe)
                A = svd_mdl.V.multiply(svd_mdl.V)
                ## Rule 1
                pcol = np.array(A.sum(axis=0) / k)
            else:
                A = svd_mdl.V[:k, :] ** 2.0
                ## Rule 1
                pcol = A.sum(axis=0) / k

            # normalize to a proper probability distribution
            pcol /= np.sum(pcol)
            return pcol

        pcol = comp_prob(self.data, self._rrank)
        prow = comp_prob(self.data.transpose(), self._crank)

        return (prow.reshape(-1, 1), pcol.reshape(-1, 1))