#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF CUR Decomposition [1]

    CUR(SVD) : Class for CUR Decomposition

[1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms
for Matrices III: Computing a Compressed Approximate Matrix Decomposition',
SIAM J. Computing 36(1), 184-206.
"""

import numpy as np
import scipy.sparse

from svd import pinv, SVD


__all__ = ["CUR"]


class CUR(SVD):
    """
    CUR(data, k=-1, rrank=0, crank=0)

    CUR Decomposition. Factorize a data matrix into three matrices s.t.
    F = ||data - USV|| is minimal. CUR randomly selects columns and rows from
    data for building U and V, respectively.

    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank : int, optional
        Number of rows to sample from data.
        0 (default)
    crank : int, optional
        Number of columns to sample from data.
        0 (default)

    Attributes
    ----------
    U,S,V : submatrices s.t. data ~= USV

    Example
    -------
    >>> import numpy as np
    >>> from cur import CUR
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cur_mdl = CUR(data, rrank=1, crank=2)
    >>> cur_mdl.factorize()
    """

    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset
        # for faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)

    def sample(self, s, probs):
        # draw s indices by inverse transform sampling from the cumulative
        # distribution of probs
        prob_rows = np.cumsum(probs.flatten())
        temp_ind = np.zeros(s, np.int32)

        for i in range(s):
            v = np.random.rand()

            try:
                tempI = np.where(prob_rows >= v)[0]
                temp_ind[i] = tempI[0]
            except IndexError:
                # round-off can leave prob_rows[-1] slightly below 1.0;
                # fall back to the last valid index
                temp_ind[i] = len(prob_rows) - 1

        return np.sort(temp_ind)

    def sample_probability(self):
        # sampling probabilities are proportional to the squared Euclidean
        # norms of the rows/columns of data [1]
        if scipy.sparse.issparse(self.data):
            dsquare = self.data.multiply(self.data)
        else:
            dsquare = self.data[:, :]**2

        prow = np.array(dsquare.sum(axis=1), np.float64)
        pcol = np.array(dsquare.sum(axis=0), np.float64)

        prow /= prow.sum()
        pcol /= pcol.sum()

        return (prow.reshape(-1, 1), pcol.reshape(-1, 1))

    def computeUCR(self):
        # the next lines do NOT work with h5py if CUR is used -> double indices
        # in self.cid or self.rid can occur and are not supported by h5py. When
        # using h5py data, always use CMD which ignores reoccurring row/column
        # selections.
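        #
        # C holds the sampled columns and R the sampled rows, each rescaled by
        # the square root of its selection count (all ones for plain CUR, see
        # factorize()); the linking matrix U is then computed as
        # pinv(C) * data * pinv(R), following [1].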

        if scipy.sparse.issparse(self.data):
            self._C = self.data[:, self._cid] * scipy.sparse.csc_matrix(np.diag(self._ccnt**0.5))
            self._R = scipy.sparse.csc_matrix(np.diag(self._rcnt**0.5)) * self.data[self._rid, :]

            self._U = pinv(self._C, self._k) * self.data[:, :] * pinv(self._R, self._k)

        else:
            self._C = np.dot(self.data[:, self._cid].reshape((self._rows, len(self._cid))), np.diag(self._ccnt**0.5))
            self._R = np.dot(np.diag(self._rcnt**0.5), self.data[self._rid, :].reshape((len(self._rid), self._cols)))

            self._U = np.dot(np.dot(pinv(self._C, self._k), self.data[:, :]),
                             pinv(self._R, self._k))

        # set some standard (with respect to SVD) variable names
        self.U = self._C
        self.S = self._U
        self.V = self._R

    def factorize(self):
        """ Factorize s.t. CUR ~= data

        Updated Values
        --------------
        .U : updated values for C (sampled, rescaled columns).
        .S : updated values for U (linking matrix).
        .V : updated values for R (sampled, rescaled rows).
        """
        [prow, pcol] = self.sample_probability()
        self._rid = self.sample(self._rrank, prow)
        self._cid = self.sample(self._crank, pcol)

        # plain CUR does not reweight repeated selections
        self._rcnt = np.ones(len(self._rid))
        self._ccnt = np.ones(len(self._cid))

        self.computeUCR()


if __name__ == "__main__":
    import doctest
    doctest.testmod()
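
# -----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): factorize a random
# dense matrix and check the relative reconstruction error. The attribute
# names U, S, V follow the assignments made in computeUCR(); the matrix size
# and sampling ranks below are arbitrary.
#
#   import numpy as np
#   from cur import CUR
#
#   data = np.random.rand(50, 80)
#   cur_mdl = CUR(data, rrank=10, crank=10)
#   cur_mdl.factorize()
#   approx = np.dot(np.dot(cur_mdl.U, cur_mdl.S), cur_mdl.V)   # C * U * R
#   rel_err = np.linalg.norm(data - approx) / np.linalg.norm(data)
# -----------------------------------------------------------------------------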