#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF CUR Decomposition [1]

    CUR(SVD) : Class for CUR Decomposition

[1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms
for Matrices III: Computing a Compressed Approximate Matrix Decomposition',
SIAM J. Computing 36(1), 184-206.
"""

import numpy as np
import scipy.sparse

from svd import pinv, SVD


__all__ = ["CUR"]


class CUR(SVD):
    """
    CUR(data, k=-1, rrank=0, crank=0)

    CUR Decomposition. Factorize a data matrix into three matrices s.t.
    F = ||data - USV|| is minimal. CUR randomly selects columns and rows from
    data for building U and V, respectively.

    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank : int, optional
        Number of rows to sample from data.
        0 (default)
    crank : int, optional
        Number of columns to sample from data.
        0 (default)

    Attributes
    ----------
    U,S,V : submatrices s.t. data ~= USV

    Example
    -------
    >>> import numpy as np
    >>> from cur import CUR
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cur_mdl = CUR(data, rrank=1, crank=2)
    >>> cur_mdl.factorize()
    """

    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset
        # for faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)

    def sample(self, s, probs):
        # draw s indices by inverse transform sampling from the cumulative
        # distribution of probs
        prob_rows = np.cumsum(probs.flatten())
        temp_ind = np.zeros(s, np.int32)

        for i in range(s):
            v = np.random.rand()

            try:
                tempI = np.where(prob_rows >= v)[0]
                temp_ind[i] = tempI[0]
            except IndexError:
                # round-off can leave prob_rows[-1] slightly below 1.0;
                # fall back to the last valid index
                temp_ind[i] = len(prob_rows) - 1

        return np.sort(temp_ind)

    def sample_probability(self):
        # sampling probabilities are proportional to the squared Euclidean
        # norms of the rows/columns of data [1]
        if scipy.sparse.issparse(self.data):
            dsquare = self.data.multiply(self.data)
        else:
            dsquare = self.data[:, :]**2

        prow = np.array(dsquare.sum(axis=1), np.float64)
        pcol = np.array(dsquare.sum(axis=0), np.float64)

        prow /= prow.sum()
        pcol /= pcol.sum()

        return (prow.reshape(-1, 1), pcol.reshape(-1, 1))

    def computeUCR(self):
        # the next lines do NOT work with h5py if CUR is used -> double indices
        # in self.cid or self.rid can occur and are not supported by h5py. When
        # using h5py data, always use CMD which ignores reoccurring row/column
        # selections.
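        #
        # C holds the sampled columns and R the sampled rows, each rescaled by
        # the square root of its selection count (all ones for plain CUR, see
        # factorize()); the linking matrix U is then computed as
        # pinv(C) * data * pinv(R), following [1].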

        if scipy.sparse.issparse(self.data):
            self._C = self.data[:, self._cid] * scipy.sparse.csc_matrix(np.diag(self._ccnt**0.5))
            self._R = scipy.sparse.csc_matrix(np.diag(self._rcnt**0.5)) * self.data[self._rid, :]

            self._U = pinv(self._C, self._k) * self.data[:, :] * pinv(self._R, self._k)

        else:
            self._C = np.dot(self.data[:, self._cid].reshape((self._rows, len(self._cid))), np.diag(self._ccnt**0.5))
            self._R = np.dot(np.diag(self._rcnt**0.5), self.data[self._rid, :].reshape((len(self._rid), self._cols)))

            self._U = np.dot(np.dot(pinv(self._C, self._k), self.data[:, :]),
                             pinv(self._R, self._k))

        # set some standard (with respect to SVD) variable names
        self.U = self._C
        self.S = self._U
        self.V = self._R

    def factorize(self):
        """ Factorize s.t. CUR ~= data

        Updated Values
        --------------
        .U : updated values for C (sampled, rescaled columns).
        .S : updated values for U (linking matrix).
        .V : updated values for R (sampled, rescaled rows).
        """
        [prow, pcol] = self.sample_probability()
        self._rid = self.sample(self._rrank, prow)
        self._cid = self.sample(self._crank, pcol)

        # plain CUR does not reweight repeated selections
        self._rcnt = np.ones(len(self._rid))
        self._ccnt = np.ones(len(self._cid))

        self.computeUCR()


if __name__ == "__main__":
    import doctest
    doctest.testmod()
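
# -----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): factorize a random
# dense matrix and check the relative reconstruction error. The attribute
# names U, S, V follow the assignments made in computeUCR(); the matrix size
# and sampling ranks below are arbitrary.
#
#   import numpy as np
#   from cur import CUR
#
#   data = np.random.rand(50, 80)
#   cur_mdl = CUR(data, rrank=10, crank=10)
#   cur_mdl.factorize()
#   approx = np.dot(np.dot(cur_mdl.U, cur_mdl.S), cur_mdl.V)   # C * U * R
#   rel_err = np.linalg.norm(data - approx) / np.linalg.norm(data)
# -----------------------------------------------------------------------------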