#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Simplex Volume Maximization [1]

    SIVM: class for SiVM

[1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
Conf. on Information and Knowledge Management. ACM. 2010.
"""


import scipy.sparse
import numpy as np

from dist import *
from aa import AA

__all__ = ["SIVM"]

class SIVM(AA):
    """
    SIVM(data, num_bases=4, dist_measure='l2', init='fastmap')

    Simplex Volume Maximization. Factorize a data matrix into two matrices
    s.t. F = | data - W*H | is minimal. H is restricted to convexity. W is
    found iteratively by maximizing the volume of the resulting simplex
    (see [1]).

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    dist_measure : one of 'l2', 'l1', 'cosine', 'abs_cosine',
        'weighted_abs_cosine', 'kl'
        Default is 'l2', which maximizes the volume of the simplex. In
        contrast, 'cosine' maximizes the volume of a cone (see [1] for
        details).
    init : string (default: 'fastmap')
        'fastmap' or 'origin'. Sets the method used for finding the very
        first basis vector. 'origin' assumes the zero vector, 'fastmap'
        picks one of the two vectors that have the largest pairwise
        distance.

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : frobenius norm (after calling .factorize())

    Example
    -------
    Applying SIVM to some rather stupid data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> sivm_mdl = SIVM(data, num_bases=2)
    >>> sivm_mdl.factorize()

    The basis vectors are now stored in sivm_mdl.W, the coefficients in
    sivm_mdl.H. To compute coefficients for an existing set of basis
    vectors, simply copy W to sivm_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> sivm_mdl = SIVM(data, num_bases=2)
    >>> sivm_mdl.W = W
    >>> sivm_mdl.factorize(compute_w=False)

    The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
    """
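    # How the pieces fit together (summary comment added for clarity, not
    # from the original source): init_sivm() picks the first simplex vertex
    # ('fastmap' or 'origin'), update_w() then greedily adds one data column
    # per basis by maximizing the volume criterion of [1], and the convex
    # coefficients H are fit by the solver inherited from AA.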
    # always overwrite the default number of iterations
    # -> any other value does not make sense.
    _NITER = 1

    def __init__(self, data, num_bases=4, dist_measure='l2', init='fastmap'):

        AA.__init__(self, data, num_bases=num_bases)

        self._dist_measure = dist_measure
        self._init = init

        # assign the correct distance function
        if self._dist_measure == 'l1':
            self._distfunc = l1_distance

        elif self._dist_measure == 'l2':
            self._distfunc = l2_distance

        elif self._dist_measure == 'cosine':
            self._distfunc = cosine_distance

        elif self._dist_measure == 'abs_cosine':
            self._distfunc = abs_cosine_distance

        elif self._dist_measure == 'weighted_abs_cosine':
            self._distfunc = weighted_abs_cosine_distance

        elif self._dist_measure == 'kl':
            self._distfunc = kl_divergence

        else:
            # fail early; otherwise a missing _distfunc only surfaces as an
            # AttributeError during factorize()
            raise ValueError('unknown dist_measure: ' + str(dist_measure))

    def _distance(self, idx):
        """ compute distances of a specific data point to all other samples"""

        if scipy.sparse.issparse(self.data):
            step = self.data.shape[1]
        else:
            step = 50000

        d = np.zeros((self.data.shape[1]))
        if idx == -1:
            # set vec to origin if idx=-1
            vec = np.zeros((self.data.shape[0], 1))
            if scipy.sparse.issparse(self.data):
                vec = scipy.sparse.csc_matrix(vec)
        else:
            vec = self.data[:, idx:idx+1]

        self._logger.info('compute distance to node ' + str(idx))

        # slice data into smaller chunks to bound memory consumption
        for idx_start in range(0, self.data.shape[1], step):
            if idx_start + step > self.data.shape[1]:
                idx_end = self.data.shape[1]
            else:
                idx_end = idx_start + step

            d[idx_start:idx_end] = self._distfunc(
                self.data[:, idx_start:idx_end], vec)
            self._logger.info('completed:' +
                str(idx_end/(self.data.shape[1]/100.0)) + "%")
        return d

    def init_h(self):
        self.H = np.zeros((self._num_bases, self._num_samples))

    def init_w(self):
        self.W = np.zeros((self._data_dimension, self._num_bases))

    def init_sivm(self):
        self.select = []
        if self._init == 'fastmap':
            # Fastmap-like initialization:
            # set the starting index for fastmap initialization
            cur_p = 0

            # after 3 iterations the first "real" index is found
            for i in range(3):
                d = self._distance(cur_p)
                cur_p = np.argmax(d)

            # store the maximal distance found -> later used for "a" (-> update_w)
            self._maxd = np.max(d)
            self.select.append(cur_p)

        elif self._init == 'origin':
            # set the first vertex to the origin
            cur_p = -1
            d = self._distance(cur_p)
            self._maxd = np.max(d)
            self.select.append(cur_p)

    def update_w(self):
        """ compute new W """
        EPS = 10**-8
        self.init_sivm()

        # initialize some of the recursively updated distance measures ....
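        # What the accumulators hold after processing the first l selected
        # vertices, with d_k = log-distance of every sample to vertex k
        # (descriptive comment added for clarity; see [1] for the derivation):
        #   d_sum         : sum_k d_k
        #   d_square      : sum_k d_k**2
        #   d_i_times_d_j : sum over pairs i < j of d_i * d_j
        # distiter combines them into the volume score of [1], so adding a
        # new vertex only costs one fresh distance computation per step.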
        d_square = np.zeros((self.data.shape[1]))
        d_sum = np.zeros((self.data.shape[1]))
        d_i_times_d_j = np.zeros((self.data.shape[1]))
        distiter = np.zeros((self.data.shape[1]))
        a = np.log(self._maxd)

        for l in range(1, self._num_bases):
            d = self._distance(self.select[l-1])

            # take the log of d (usually more stable than d)
            d = np.log(d + EPS)

            d_i_times_d_j += d * d_sum
            d_sum += d
            d_square += d**2
            distiter = d_i_times_d_j + a*d_sum - (l/2.0) * d_square

            # detect the next best data point
            self.select.append(np.argmax(distiter))

            self._logger.info('cur_nodes: ' + str(self.select))

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]

    def factorize(self, show_progress=False, compute_w=True, compute_h=True,
                  compute_err=True, niter=1):
        """ Factorize s.t. WH = data

        Parameters
        ----------
        show_progress : bool
            print some extra information to stdout.
        compute_h : bool
            iteratively update values for H.
        compute_w : bool
            iteratively update values for W.
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].
        niter : int
            ignored; SIVM always runs exactly one iteration (see _NITER).

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH|.
        """

        AA.factorize(self, niter=1, show_progress=show_progress,
                     compute_w=compute_w, compute_h=compute_h,
                     compute_err=compute_err)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
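# A minimal usage sketch (illustrative only; mirrors the docstring examples,
# the random data and variable names are assumed):
#
#   import numpy as np
#   data = np.random.random((3, 100))       # 3 dimensions, 100 samples
#   sivm_mdl = SIVM(data, num_bases=4)      # default 'l2' simplex volume
#   sivm_mdl.factorize()
#   rec = np.dot(sivm_mdl.W, sivm_mdl.H)    # convex low-rank reconstruction
#   # the reconstruction error is stored in sivm_mdl.ferr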