#!/usr/bin/python2.6
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
#$Id$
"""
PyMF GREEDY [1]

GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1].

[1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix
Reconstruction via Greedy Approximation of SVD. ISAAC'2008.
"""


import time
import scipy.sparse
import numpy as np
from svd import *
from nmf import NMF

__all__ = ["GREEDY"]

class GREEDY(NMF):
    """
    GREEDY(data, k=-1, num_bases=4)

    Deterministic Sparse Column Based Matrix Reconstruction via Greedy
    Approximation of SVD. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | is minimal. W is iteratively selected as columns
    of data.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    k : int, optional
        Number of singular vectors used in the SVD step of the algorithm.
        num_bases (default)

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : Frobenius norm (after calling .factorize())

    Example
    -------
    Applying GREEDY to a simple data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> greedy_mdl = GREEDY(data, num_bases=2)
    >>> greedy_mdl.factorize()

    The basis vectors are now stored in greedy_mdl.W, the coefficients in
    greedy_mdl.H. To compute coefficients for an existing set of basis
    vectors simply assign W to greedy_mdl.W and call factorize() with
    compute_w set to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> greedy_mdl = GREEDY(data, num_bases=2)
    >>> greedy_mdl.W = W
    >>> greedy_mdl.factorize(compute_w=False)

    The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
    """

    def __init__(self, data, k=-1, num_bases=4):
        # call inherited method
        NMF.__init__(self, data, num_bases=num_bases)
        self._k = k
        if self._k == -1:
            self._k = num_bases

    def update_h(self):
        if scipy.sparse.issparse(self.data):
            self.H = pinv(self.W) * self.data
        else:
            self.H = np.dot(pinv(self.W), self.data)

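    # Basis selection (update_w) follows the greedy scheme of [1]: each of
    # the num_bases rounds scores every (re-normalized) column a of A by
    # ||B^T a||^2, where B holds the top-k singular vectors scaled by their
    # singular values. The highest-scoring column is appended to the
    # selection, and both B and A are deflated by subtracting the component
    # along that column, so later rounds pick complementary columns.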
    def update_w(self):
        def normalize_matrix(K):
            """ Normalize a matrix K s.t. its columns have
            unit Euclidean norm.
            """
            if scipy.sparse.issparse(K):
                L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
                s = np.where(L > 0.0)[0]
                L[s] = L[s]**-1
                KN = scipy.sparse.spdiags(L, 0, len(L), len(L), format='csc')
                K = K*KN
            else:
                L = np.sqrt((K**2).sum(axis=0))
                s = np.where(L > 0.0)[0]
                L[s] = L[s]**-1
                K = K*L
            return K

        self._t = np.zeros((self._num_bases))
        t0 = time.time()
        self.select = []

        # work on a copy of the data; its columns are re-normalized in
        # every selection round
        A = self.data.copy()

        svd_mdl = SVD(A, k=self._k)
        svd_mdl.factorize()

        if scipy.sparse.issparse(self.data):
            B = svd_mdl.U * svd_mdl.S
            B = B.tocsc()
        else:
            B = np.dot(svd_mdl.U, svd_mdl.S)
            B = B[:, :self._num_bases]

        for i in range(self._num_bases):
            A = normalize_matrix(A)

            if scipy.sparse.issparse(self.data):
                T = B.transpose() * A
                T = np.array(T.multiply(T).sum(axis=0))[0,:]

                # next selected column index
                T[self.select] = 0.0
                idx = np.argmax(T)
                Aidx = A[:, idx].copy()
                self.select.append(idx)

                # update B
                BC = Aidx.transpose() * B
                B = B - (Aidx*BC)

                # update A
                AC = Aidx.transpose() * A
                A = A - (Aidx*AC)

            else:
                T = np.dot(B.transpose(), A)
                T = np.sum(T**2.0, axis=0)

                # next selected column index
                T[self.select] = 0.0
                idx = np.argmax(T)
                self.select.append(idx)

                # update B
                BC = np.dot(B.transpose(), A[:,idx])
                B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))

                # and A
                AC = np.dot(A.transpose(), A[:,idx])
                A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))

            # log the column selected in this round
            self._logger.info('searching for next best column ...')
            self._logger.info('cur_columns: ' + str(self.select))

            self._t[i] = time.time() - t0

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]

if __name__ == "__main__":
    import doctest
    doctest.testmod()
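
    # A minimal usage sketch, not part of the original doctests: it uses a
    # small random matrix purely for illustration and shows that the basis
    # W is built from the data columns whose indices end up in .select.
    data = np.random.random((10, 30))
    greedy_mdl = GREEDY(data, num_bases=3)
    greedy_mdl.factorize()
    print 'selected columns:', greedy_mdl.select
    # W consists of exactly these data columns, kept in selection order
    print 'W matches data[:, select]:', np.allclose(greedy_mdl.W, data[:, greedy_mdl.select])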