Mercurial > hg > segmentation
view pymf/greedy.py @ 19:890cfe424f4a tip
added annotations
author | mitian |
---|---|
date | Fri, 11 Dec 2015 09:47:40 +0000 |
parents | 26838b1f560f |
children |
line wrap: on
line source
#!/usr/bin/python2.6 # # Copyright (C) Christian Thurau, 2010. # Licensed under the GNU General Public License (GPL). # http://www.gnu.org/licenses/gpl.txt #$Id$ """ PyMF GREEDY[1] GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1]. [1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix Reconstruction via Greedy Approximation of SVD. ISAAC'2008. """ import time import scipy.sparse import numpy as np from svd import * from nmf import NMF __all__ = ["GREEDY"] class GREEDY(NMF): """ GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True) Deterministic Sparse Column Based Matrix Reconstruction via Greedy Approximation of SVD. Factorize a data matrix into two matrices s.t. F = | data - W*H | is minimal. W is iteratively selected as columns of data. Parameters ---------- data : array_like, shape (_data_dimension, _num_samples) the input data num_bases: int, optional Number of bases to compute (column rank of W and row rank of H). 4 (default) k : number of singular vectors for the SVD step of the algorithm num_bases (default) Attributes ---------- W : "data_dimension x num_bases" matrix of basis vectors H : "num bases x num_samples" matrix of coefficients ferr : frobenius norm (after calling .factorize()) Example ------- Applying GREEDY to some rather stupid data set: >>> import numpy as np >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10) >>> greedy_mdl.factorize() The basis vectors are now stored in greedy_mdl.W, the coefficients in greedy_mdl.H. To compute coefficients for an existing set of basis vectors simply copy W to greedy_mdl.W, and set compW to False: >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) >>> greedy_mdl = GREEDY(data, num_bases=2) >>> greedy_mdl.W = W >>> greedy_mdl.factorize(compute_w=False) The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H. """ def __init__(self, data, k=-1, num_bases=4): # call inherited method NMF.__init__(self, data, num_bases=num_bases) self._k = k if self._k == -1: self._k = num_bases def update_h(self): if scipy.sparse.issparse(self.data): self.H = pinv(self.W) * self.data else: self.H = np.dot(pinv(self.W), self.data) def update_w(self): def normalize_matrix(K): """ Normalize a matrix K s.t. columns have Euclidean-norm |1| """ if scipy.sparse.issparse(K): L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:] s = np.where(L > 0.0)[0] L[s] = L[s]**-1 KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc') K = K*KN else: L = np.sqrt((K**2).sum(axis=0)) s = np.where(L > 0.0)[0] L[s] = L[s]**-1 K = K*L return K self._t = np.zeros((self._num_bases)) t0 = time.time() self.select = [] # normalize data A = self.data.copy() svd_mdl = SVD(A, k=self._k) svd_mdl.factorize() if scipy.sparse.issparse(self.data): B = svd_mdl.U * svd_mdl.S B = B.tocsc() else: B = np.dot(svd_mdl.U, svd_mdl.S) B = B[:, :self._num_bases] for i in range(self._num_bases): A = normalize_matrix(A) if scipy.sparse.issparse(self.data): T = B.transpose() * A T = np.array(T.multiply(T).sum(axis=0))[0,:] # next selected column index T[self.select] = 0.0 idx = np.argmax(T) Aidx = A[:, idx].copy() self.select.append(idx) # update B BC = Aidx.transpose() * B B = B - (Aidx*BC) # update A AC = Aidx.transpose() * A A = A - (Aidx*AC) else: T = np.dot(B.transpose(), A) T = np.sum(T**2.0, axis=0) # next selected column index T[self.select] = 0.0 idx = np.argmax(T) self.select.append(idx) # update B BC = np.dot(B.transpose(),A[:,idx]) B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1)) # and A AC = np.dot(A.transpose(),A[:,idx]) A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1)) # detect the next best data point self._logger.info('searching for next best column ...') self._logger.info('cur_columns: ' + str(self.select)) self._t[i] = time.time() - t0 # sort indices, otherwise h5py won't work self.W = self.data[:, np.sort(self.select)] # "unsort" it again to keep the correct order self.W = self.W[:, np.argsort(np.argsort(self.select))] if __name__ == "__main__": import doctest doctest.testmod()