#!/usr/bin/python2.6
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
#$Id$
"""
PyMF GREEDY [1]

GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1].

[1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix
Reconstruction via Greedy Approximation of SVD. ISAAC'2008.
"""


import time
import scipy.sparse
import numpy as np
from svd import *
from nmf import NMF

__all__ = ["GREEDY"]

class GREEDY(NMF):
    """
    GREEDY(data, k=-1, num_bases=4)

    Deterministic Sparse Column Based Matrix Reconstruction via Greedy
    Approximation of SVD. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | is minimal. W is iteratively selected as columns
    of data.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    k : int, optional
        Number of singular vectors used in the SVD step of the algorithm.
        num_bases (default)

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : Frobenius norm (after calling .factorize())

    Example
    -------
    Applying GREEDY to a simple data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> greedy_mdl = GREEDY(data, num_bases=2)
    >>> greedy_mdl.factorize()

    The basis vectors are now stored in greedy_mdl.W, the coefficients in
    greedy_mdl.H. To compute coefficients for an existing set of basis
    vectors simply assign W to greedy_mdl.W and call factorize() with
    compute_w set to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> greedy_mdl = GREEDY(data, num_bases=2)
    >>> greedy_mdl.W = W
    >>> greedy_mdl.factorize(compute_w=False)

    The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
    """

    def __init__(self, data, k=-1, num_bases=4):
        # call inherited method
        NMF.__init__(self, data, num_bases=num_bases)
        self._k = k
        if self._k == -1:
            self._k = num_bases

    def update_h(self):
        if scipy.sparse.issparse(self.data):
            self.H = pinv(self.W) * self.data
        else:
            self.H = np.dot(pinv(self.W), self.data)

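    # Basis selection (update_w) follows the greedy scheme of [1]: each of
    # the num_bases rounds scores every (re-normalized) column a of A by
    # ||B^T a||^2, where B holds the top-k singular vectors scaled by their
    # singular values. The highest-scoring column is appended to the
    # selection, and both B and A are deflated by subtracting the component
    # along that column, so later rounds pick complementary columns.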
    def update_w(self):
        def normalize_matrix(K):
            """ Normalize a matrix K s.t. its columns have
            unit Euclidean norm.
            """
            if scipy.sparse.issparse(K):
                L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
                s = np.where(L > 0.0)[0]
                L[s] = L[s]**-1
                KN = scipy.sparse.spdiags(L, 0, len(L), len(L), format='csc')
                K = K*KN
            else:
                L = np.sqrt((K**2).sum(axis=0))
                s = np.where(L > 0.0)[0]
                L[s] = L[s]**-1
                K = K*L
            return K

        self._t = np.zeros((self._num_bases))
        t0 = time.time()
        self.select = []

        # work on a copy of the data; its columns are re-normalized in
        # every selection round
        A = self.data.copy()

        svd_mdl = SVD(A, k=self._k)
        svd_mdl.factorize()

        if scipy.sparse.issparse(self.data):
            B = svd_mdl.U * svd_mdl.S
            B = B.tocsc()
        else:
            B = np.dot(svd_mdl.U, svd_mdl.S)
            B = B[:, :self._num_bases]

        for i in range(self._num_bases):
            A = normalize_matrix(A)

            if scipy.sparse.issparse(self.data):
                T = B.transpose() * A
                T = np.array(T.multiply(T).sum(axis=0))[0,:]

                # next selected column index
                T[self.select] = 0.0
                idx = np.argmax(T)
                Aidx = A[:, idx].copy()
                self.select.append(idx)

                # update B
                BC = Aidx.transpose() * B
                B = B - (Aidx*BC)

                # update A
                AC = Aidx.transpose() * A
                A = A - (Aidx*AC)

            else:
                T = np.dot(B.transpose(), A)
                T = np.sum(T**2.0, axis=0)

                # next selected column index
                T[self.select] = 0.0
                idx = np.argmax(T)
                self.select.append(idx)

                # update B
                BC = np.dot(B.transpose(), A[:,idx])
                B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))

                # and A
                AC = np.dot(A.transpose(), A[:,idx])
                A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))

            # log the column selected in this round
            self._logger.info('searching for next best column ...')
            self._logger.info('cur_columns: ' + str(self.select))

            self._t[i] = time.time() - t0

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]

if __name__ == "__main__":
    import doctest
    doctest.testmod()
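
    # A minimal usage sketch, not part of the original doctests: it uses a
    # small random matrix purely for illustration and shows that the basis
    # W is built from the data columns whose indices end up in .select.
    data = np.random.random((10, 30))
    greedy_mdl = GREEDY(data, num_bases=3)
    greedy_mdl.factorize()
    print 'selected columns:', greedy_mdl.select
    # W consists of exactly these data columns, kept in selection order
    print 'W matches data[:, select]:', np.allclose(greedy_mdl.W, data[:, greedy_mdl.select])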