mi@0: #!/usr/bin/python
mi@0: #
mi@0: # Copyright (C) Christian Thurau, 2010. 
mi@0: # Licensed under the GNU General Public License (GPL). 
mi@0: # http://www.gnu.org/licenses/gpl.txt
mi@0: """ 
mi@0: PyMF Geometric-Map
mi@0: 
mi@0:     GMAP: Class for Geometric-Map
mi@0: """
mi@0: 
mi@0: 
mi@0: import scipy.sparse
mi@0: import numpy as np
mi@0: 
mi@0: from dist import *
mi@0: from aa import AA
mi@0: from kmeans import Kmeans
mi@0: 
mi@0: __all__ = ["GMAP"]
mi@0: 
mi@0: class GMAP(AA):
mi@0:     """      
mi@0:     GMAP(data, num_bases=4, dist_measure='l2')
mi@0:     
mi@0:     
mi@0:     Geometric-Map. Factorize a data matrix into two matrices s.t. 
mi@0:     F = | data - W*H | is minimal. G-MAP can emulate/approximate several 
mi@0:     standard methods including PCA, NMF, and AA.
mi@0:     
mi@0:     Parameters
mi@0:     ----------
mi@0:     data : array_like, shape (_data_dimension, _num_samples)
mi@0:         the input data
mi@0:     num_bases: int, optional
mi@0:         Number of bases to compute (column rank of W and row rank of H).
mi@0:         4 (default)     
mi@0:     method : one of 'pca' ,'nmf', 'aa', default is 'pca' which emulates
mi@0:         Principal Component Analysis using the geometric map method ('nmf' 
mi@0:         emulates Non-negative Matrix Factorization, 'aa' emulates Archetypal 
mi@0:         Analysis).
mi@0:     robust_map : bool, optional
mi@0:         use robust_map or the standard max-val selection 
mi@0:         [see "On FastMap and the Convex Hull of Multivariate Data: Toward
mi@0:         Fast and Robust Dimension Reduction", Ostrouchov and Samatova, PAMI
mi@0:         2005]
mi@0:     Attributes
mi@0:     ----------
mi@0:     W : "data_dimension x num_bases" matrix of basis vectors
mi@0:     H : "num bases x num_samples" matrix of coefficients
mi@0:     ferr : frobenius norm (after calling .factorize())       
mi@0:     
mi@0:     Example
mi@0:     -------
mi@0:     Applying GMAP to some rather stupid data set:
mi@0:     
mi@0:     >>> import numpy as np
mi@0:     >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0:     >>> gmap_mdl = GMAP(data, num_bases=2)
mi@0:     >>> gmap_mdl.factorize()
mi@0:     
mi@0:     The basis vectors are now stored in gmap_mdl.W, the coefficients in gmap_mdl.H. 
mi@0:     To compute coefficients for an existing set of basis vectors simply copy W 
mi@0:     to gmap_mdl.W, and set compute_w to False:
mi@0:     
mi@0:     >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0:     >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0:     >>> gmap_mdl = GMAP(data, num_bases=2)
mi@0:     >>> gmap_mdl.W = W
mi@0:     >>> gmap_mdl.factorize(compute_w=False)
mi@0:     
mi@0:     The result is a set of coefficients gmap_mdl.H, s.t. data = W * gmap_mdl.H.
mi@0:     """
mi@0:     
mi@0:     # always overwrite the default number of iterations
mi@0:     # -> any value other does not make sense.
mi@0:     _NITER = 1
mi@0: 
mi@0:     def __init__(self, data, num_bases=4, method='pca', robust_map=True):
mi@0:        
mi@0:         AA.__init__(self, data, num_bases=num_bases)
mi@0:         self.sub = []           
mi@0:         self._robust_map = robust_map
mi@0:         self._method = method
mi@0:        
mi@0:           
mi@0:     def init_h(self):
mi@0:         self.H = np.zeros((self._num_bases, self._num_samples))
mi@0:         
mi@0:     def init_w(self):
mi@0:         self.W = np.zeros((self._data_dimension, self._num_bases))
mi@0:            
mi@0:     def update_w(self): 
mi@0:         """ compute new W """     
mi@0:         
mi@0:         def select_next(iterval):
mi@0:             """ select the next best data sample using robust map
mi@0:             or simply the max iterval ... """
mi@0:         
mi@0:             if self._robust_map:                
mi@0:                 k = np.argsort(iterval)[::-1]
mi@0:                 d_sub = self.data[:,k[:self._robust_nselect]]
mi@0:                 self.sub.extend(k[:self._robust_nselect])
mi@0:                 
mi@0:                 # cluster d_sub
mi@0:                 kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster)
mi@0:                 kmeans_mdl.factorize(niter=10)
mi@0:                 
mi@0:                 # get largest cluster
mi@0:                 h = np.histogram(kmeans_mdl.assigned, range(self._robust_cluster+1))[0]               
mi@0:                 largest_cluster = np.argmax(h)
mi@0:                 sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster+1], d_sub)               
mi@0:                 sel = k[np.argmin(sel)]
mi@0:             else:
mi@0:                 sel = np.argmax(iterval)        
mi@0:                 
mi@0:             return sel        
mi@0:                 
mi@0:         EPS = 10**-8
mi@0:           
mi@0:         if scipy.sparse.issparse(self.data):
mi@0:             norm_data = np.sqrt(self.data.multiply(self.data).sum(axis=0))            
mi@0:             norm_data = np.array(norm_data).reshape((-1))
mi@0:         else:
mi@0:             norm_data = np.sqrt(np.sum(self.data**2, axis=0))
mi@0:             
mi@0: 
mi@0:         self.select = []
mi@0:         
mi@0:         if self._method == 'pca' or self._method == 'aa':    
mi@0:             iterval = norm_data.copy()
mi@0:             
mi@0:         if self._method == 'nmf':
mi@0:             iterval = np.sum(self.data, axis=0)/(np.sqrt(self.data.shape[0])*norm_data)
mi@0:             iterval = 1.0 - iterval
mi@0:         
mi@0:         self.select.append(select_next(iterval))
mi@0:        
mi@0:                 
mi@0:         for l in range(1, self._num_bases):
mi@0:             
mi@0:             if scipy.sparse.issparse(self.data):
mi@0:                 c = self.data[:, self.select[-1]:self.select[-1]+1].T * self.data
mi@0:                 c = np.array(c.todense())
mi@0:             else:                            
mi@0:                 c = np.dot(self.data[:,self.select[-1]], self.data)
mi@0:                 
mi@0:             c = c/(norm_data * norm_data[self.select[-1]])  
mi@0:             
mi@0:             if self._method == 'pca':            
mi@0:                 c = 1.0 - np.abs(c)
mi@0:                 c = c * norm_data
mi@0:                 
mi@0:             elif self._method == 'aa':
mi@0:                 c = (c*-1.0 + 1.0)/2.0  
mi@0:                 c = c * norm_data
mi@0:                 
mi@0:             elif self._method == 'nmf':
mi@0:                 c = 1.0 - np.abs(c)
mi@0: 
mi@0:             ### update the estimated volume
mi@0:             iterval = c * iterval            
mi@0:             
mi@0:             # detect the next best data point            
mi@0:             self.select.append(select_next(iterval))
mi@0:                       
mi@0:             self._logger.info('cur_nodes: ' + str(self.select))
mi@0: 
mi@0:         # sort indices, otherwise h5py won't work
mi@0:         self.W = self.data[:, np.sort(self.select)]
mi@0:             
mi@0:         # "unsort" it again to keep the correct order
mi@0:         self.W = self.W[:, np.argsort(np.argsort(self.select))]
mi@0:     
mi@0:     def factorize(self, show_progress=False, compute_w=True, compute_h=True,
mi@0:                   compute_err=True, robust_cluster=3, niter=1, robust_nselect=-1):
mi@0:         """ Factorize s.t. WH = data
mi@0:             
mi@0:             Parameters
mi@0:             ----------
mi@0:             show_progress : bool
mi@0:                     print some extra information to stdout.
mi@0:                     False, default
mi@0:             compute_h : bool
mi@0:                     iteratively update values for H.
mi@0:                     True, default
mi@0:             compute_w : bool
mi@0:                     iteratively update values for W.
mi@0:                     default, True
mi@0:             compute_err : bool
mi@0:                     compute Frobenius norm |data-WH| after each update and store
mi@0:                     it to .ferr[k].
mi@0:             robust_cluster : int, optional
mi@0:                     set the number of clusters for robust map selection.
mi@0:                     3, default 
mi@0:             robust_nselect : int, optional
mi@0:                     set the number of samples to consider for robust map
mi@0:                     selection.
mi@0:                     -1, default (automatically determine suitable number)
mi@0:             
mi@0:             Updated Values
mi@0:             --------------
mi@0:             .W : updated values for W.
mi@0:             .H : updated values for H.
mi@0:             .ferr : Frobenius norm |data-WH|.
mi@0:         """
mi@0:         self._robust_cluster = robust_cluster
mi@0:         self._robust_nselect = robust_nselect
mi@0:         
mi@0:         if self._robust_nselect == -1:
mi@0:             self._robust_nselect = np.round(np.log(self.data.shape[1])*2)        
mi@0:         
mi@0:         AA.factorize(self, niter=1, show_progress=show_progress, 
mi@0:                   compute_w=compute_w, compute_h=compute_h, 
mi@0:                   compute_err=compute_err)
mi@0:              
mi@0: if __name__ == "__main__":
mi@0:     import doctest  
mi@0:     doctest.testmod()