diff pymf/greedy.py @ 0:26838b1f560f

initial commit of a segmenter project
author mi tian
date Thu, 02 Apr 2015 18:09:27 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pymf/greedy.py	Thu Apr 02 18:09:27 2015 +0100
@@ -0,0 +1,174 @@
+#!/usr/bin/python2.6
+#
+# Copyright (C) Christian Thurau, 2010. 
+# Licensed under the GNU General Public License (GPL). 
+# http://www.gnu.org/licenses/gpl.txt
+#$Id$
+""" 
+PyMF GREEDY[1]
+
+	GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1].
+
+
+[1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix
+Reconstruction via Greedy Approximation of SVD. ISAAC'2008.
+"""
+
+
+import time
+import scipy.sparse
+import numpy as np
+from svd import *
+from nmf import NMF
+
+__all__ = ["GREEDY"]
+
+class GREEDY(NMF):
+    """ 
+    GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True)
+
+
+    Deterministic Sparse Column Based Matrix Reconstruction via Greedy 
+    Approximation of SVD. Factorize a data matrix into two matrices s.t.
+    F = | data - W*H | is minimal. W is iteratively selected as columns
+    of data.
+
+    Parameters
+    ----------
+    data : array_like, shape (_data_dimension, _num_samples)
+        the input data
+    num_bases: int, optional
+        Number of bases to compute (column rank of W and row rank of H).
+        4 (default) 
+    k   : number of singular vectors for the SVD step of the algorithm
+        num_bases (default)
+
+    Attributes
+    ----------
+    W : "data_dimension x num_bases" matrix of basis vectors
+    H : "num bases x num_samples" matrix of coefficients
+    ferr : frobenius norm (after calling .factorize())       
+
+    Example
+    -------
+    Applying GREEDY to some rather stupid data set:
+
+    >>> import numpy as np
+    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
+    >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10)
+    >>> greedy_mdl.factorize()
+
+    The basis vectors are now stored in greedy_mdl.W, the coefficients in 
+    greedy_mdl.H. To compute coefficients for an existing set of basis 
+    vectors simply  copy W to greedy_mdl.W, and set compW to False:
+
+    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
+    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
+    >>> greedy_mdl = GREEDY(data, num_bases=2)
+    >>> greedy_mdl.W = W
+    >>> greedy_mdl.factorize(compute_w=False)
+
+    The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
+    """
+
+
+    def __init__(self, data, k=-1, num_bases=4):
+        # call inherited method
+        NMF.__init__(self, data, num_bases=num_bases)
+        self._k = k
+        if self._k == -1:
+            self._k = num_bases
+
+    def update_h(self):
+        if scipy.sparse.issparse(self.data):
+            self.H = pinv(self.W) * self.data
+        else:
+            self.H = np.dot(pinv(self.W), self.data)
+        
+    def update_w(self):
+        def normalize_matrix(K):
+            """ Normalize a matrix K s.t. columns have Euclidean-norm |1|
+            """
+            if scipy.sparse.issparse(K):                
+                L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]                
+                s = np.where(L > 0.0)[0]                
+                L[s] = L[s]**-1
+                KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc')      
+                K = K*KN
+            else:
+                L = np.sqrt((K**2).sum(axis=0))               
+                s = np.where(L > 0.0)[0]            
+                L[s] = L[s]**-1
+                K = K*L                   
+            return K
+            
+        self._t = np.zeros((self._num_bases))
+        t0 = time.time()
+        self.select = []       
+            
+        # normalize data
+        A = self.data.copy()               
+
+        svd_mdl = SVD(A, k=self._k)
+        svd_mdl.factorize()
+        
+        if scipy.sparse.issparse(self.data):
+            B = svd_mdl.U * svd_mdl.S
+            B = B.tocsc()   
+        else:
+            B = np.dot(svd_mdl.U, svd_mdl.S)
+            B = B[:, :self._num_bases]            
+       
+        for i in range(self._num_bases):
+            A = normalize_matrix(A)  
+           
+            if scipy.sparse.issparse(self.data):
+                T = B.transpose() * A
+                T = np.array(T.multiply(T).sum(axis=0))[0,:]
+                                
+                # next selected column index
+                T[self.select] = 0.0
+                idx = np.argmax(T)
+                Aidx = A[:, idx].copy()
+                self.select.append(idx)
+                
+                # update B
+                BC = Aidx.transpose() * B
+                B = B - (Aidx*BC)
+                
+                # update A               
+                AC = Aidx.transpose() * A            
+                A = A - (Aidx*AC)
+
+            else:
+                T = np.dot(B.transpose(), A)
+                T = np.sum(T**2.0, axis=0)
+                
+                # next selected column index
+                T[self.select] = 0.0
+                idx = np.argmax(T)
+                self.select.append(idx)
+                
+                # update B
+                BC = np.dot(B.transpose(),A[:,idx])            
+                B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))
+                
+                # and A
+                AC = np.dot(A.transpose(),A[:,idx])
+                A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))
+
+
+            # detect the next best data point
+            self._logger.info('searching for next best column ...')
+            self._logger.info('cur_columns: ' + str(self.select))
+            self._t[i] = time.time() - t0
+
+        # sort indices, otherwise h5py won't work
+        self.W = self.data[:, np.sort(self.select)]
+
+        # "unsort" it again to keep the correct order
+        self.W = self.W[:, np.argsort(np.argsort(self.select))]       
+        
+if __name__ == "__main__":
+    import doctest  
+    doctest.testmod()