mi@0: #!/usr/bin/python2.6
mi@0: #
mi@0: # Copyright (C) Christian Thurau, 2010. 
mi@0: # Licensed under the GNU General Public License (GPL). 
mi@0: # http://www.gnu.org/licenses/gpl.txt
mi@0: """ 
mi@0: PyMF Simplex Volume Maximization [1]
mi@0: 
mi@0:     SIVM_GSAT: class for gsat-SiVM
mi@0: 
mi@0: [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume 
mi@0: Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. 
mi@0: Conf. on Information and Knowledge Management. ACM. 2010.
mi@0: """
mi@0: 
mi@0: 
mi@0: import logging
mi@0: import numpy as np
mi@0: from dist import *
mi@0: from vol import cmdet
mi@0: from sivm import SIVM
mi@0: 
mi@0: __all__ = ["SIVM_GSAT"]
mi@0: 
mi@0: class SIVM_GSAT(SIVM):
mi@0:     """      
mi@0:     SIVM(data, num_bases=4, dist_measure='l2')
mi@0:     
mi@0:     
mi@0:     Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
mi@0:     F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
mi@0:     found by maximizing the volume of the resulting simplex (see [1]). Can be 
mi@0:     applied to data streams using the .online_update_w(vec) function which decides
mi@0:     on adding data sample "vec" to the already selected basis vectors.
mi@0:     
mi@0:     Parameters
mi@0:     ----------
mi@0:     data : array_like, shape (_data_dimension, _num_samples)
mi@0:         the input data
mi@0:     num_bases: int, optional
mi@0:         Number of bases to compute (column rank of W and row rank of H).
mi@0:         4 (default)     
mi@0:     dist_measure : one of 'l2' ,'cosine', 'l1', 'kl'
mi@0:         Standard is 'l2' which maximizes the volume of the simplex. In contrast,
mi@0:         'cosine' maximizes the volume of a cone (see [1] for details).
mi@0:      init : string (default: 'fastmap')
mi@0:         'fastmap' or 'origin'. Sets the method used for finding the very first 
mi@0:         basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of 
mi@0:         the two vectors that have the largest pairwise distance.
mi@0:     Attributes
mi@0:     ----------
mi@0:     W : "data_dimension x num_bases" matrix of basis vectors
mi@0:     H : "num bases x num_samples" matrix of coefficients
mi@0:     ferr : frobenius norm (after calling .factorize())       
mi@0:     
mi@0:     Example
mi@0:     -------
mi@0:     Applying SIVM to some rather stupid data set:
mi@0:     
mi@0:     >>> import numpy as np
mi@0:     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0:     >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
mi@0:     >>> sivm_mdl.factorize()
mi@0:     
mi@0:     The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. 
mi@0:     To compute coefficients for an existing set of basis vectors simply    copy W 
mi@0:     to sivm_mdl.W, and set compute_w to False:
mi@0:     
mi@0:     >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0:     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0:     >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
mi@0:     >>> sivm_mdl.W = W
mi@0:     >>> sivm_mdl.factorize(compute_w=False)
mi@0:     
mi@0:     The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
mi@0:     """
mi@0: 
mi@0:     def init_w(self):
mi@0:         self.select = range(self._num_bases)
mi@0:         self.W = self.data[:, self.select]
mi@0:         
mi@0:     def online_update_w(self, vec):
mi@0:         # update D if it does not exist   
mi@0:         k = self._num_bases
mi@0:         if not hasattr(self, 'D'):
mi@0:             self.D = np.zeros((k + 1, k + 1))
mi@0:             self.D[:k, :k] = pdist(self.W, self.W)
mi@0:             self.V = cmdet(self.D[:k, :k])
mi@0:             
mi@0:         tmp_d = self._distfunc(self.W, vec.reshape((-1,1)))        
mi@0:         self.D[k, :-1] = tmp_d
mi@0:         self.D[:-1, k] = tmp_d
mi@0:        
mi@0:         v = np.zeros((self._num_bases + 1))
mi@0: 
mi@0:         for i in range(self._num_bases):
mi@0:                 # compute volume for each combination... 
mi@0:                 s = np.setdiff1d(range(self._num_bases + 1), [i])
mi@0:                 v[i] = cmdet((self.D[s,:])[:,s])
mi@0: 
mi@0:         # select index that maximizes the volume 
mi@0:         v[-1] = self.V
mi@0:         s = np.argmax(v)
mi@0: 
mi@0:         if s < self._num_bases:     
mi@0:             self.W[:,s] = vec
mi@0:             self.D[:self._num_bases, :self._num_bases] = pdist(self.W, self.W)
mi@0:             
mi@0:             if not hasattr(self, '_v'):
mi@0:                 self._v = [self.V]
mi@0:             self.V = v[s]   
mi@0:             self._v.append(v[s])                
mi@0:             
mi@0:             self._logger.info('Volume increased:' + str(self.V))
mi@0:             return True, s
mi@0:             
mi@0:         return False,-1
mi@0:         
mi@0:     def update_w(self): 
mi@0:         n = np.int(np.floor(np.random.random() * self._num_samples))
mi@0:         if n not in self.select:
mi@0:             updated, s = self.online_update_w(self.data[:,n])
mi@0:             if updated:
mi@0:                 self.select[s] = n    
mi@0:                 self._logger.info('Current selection:' + str(self.select))
mi@0:                 
mi@0:   
mi@0:     def factorize(self, show_progress=False, compute_w=True, compute_h=True,
mi@0:                   compute_err=True, niter=1):
mi@0:         """ Factorize s.t. WH = data
mi@0:             
mi@0:             Parameters
mi@0:             ----------
mi@0:             show_progress : bool
mi@0:                     print some extra information to stdout.
mi@0:             niter : int
mi@0:                     number of iterations.
mi@0:             compute_h : bool
mi@0:                     iteratively update values for H.
mi@0:             compute_w : bool
mi@0:                     iteratively update values for W.
mi@0:             compute_err : bool
mi@0:                     compute Frobenius norm |data-WH| after each update and store
mi@0:                     it to .ferr[k].
mi@0:             
mi@0:             Updated Values
mi@0:             --------------
mi@0:             .W : updated values for W.
mi@0:             .H : updated values for H.
mi@0:             .ferr : Frobenius norm |data-WH|.
mi@0:         """
mi@0:         if show_progress:
mi@0:             self._logger.setLevel(logging.INFO)
mi@0:         else:
mi@0:             self._logger.setLevel(logging.ERROR)        
mi@0:         
mi@0:         # create W and H if they don't already exist
mi@0:         # -> any custom initialization to W,H should be done before
mi@0:         if not hasattr(self,'W'):
mi@0:                self.init_w()
mi@0:                
mi@0:         if not hasattr(self,'H'):
mi@0:                 self.init_h()                   
mi@0: 
mi@0:         if compute_err:
mi@0:             self.ferr = np.zeros(niter)
mi@0:              
mi@0:         for i in xrange(niter):
mi@0:             if compute_w:
mi@0:                 self.update_w()
mi@0: 
mi@0:             if compute_h:
mi@0:                 self.update_h()                                        
mi@0:              
mi@0:             if compute_err:                 
mi@0:                 self.ferr[i] = self.frobenius_norm()
mi@0:                 self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + 
mi@0:                     ' FN:' + str(self.ferr[i]))
mi@0:             else:                
mi@0:                 self._logger.info('Iteration ' + str(i+1) + '/' + str(niter))
mi@0: 
mi@0:                   
mi@0: if __name__ == "__main__":
mi@0:     import doctest  
mi@0:     doctest.testmod()