annotate pymf/greedy.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python2.6
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 #$Id$
mi@0 7 """
mi@0 8 PyMF GREEDY[1]
mi@0 9
mi@0 10 GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1].
mi@0 11
mi@0 12
mi@0 13 [1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix
mi@0 14 Reconstruction via Greedy Approximation of SVD. ISAAC'2008.
mi@0 15 """
mi@0 16
mi@0 17
mi@0 18 import time
mi@0 19 import scipy.sparse
mi@0 20 import numpy as np
mi@0 21 from svd import *
mi@0 22 from nmf import NMF
mi@0 23
mi@0 24 __all__ = ["GREEDY"]
mi@0 25
mi@0 26 class GREEDY(NMF):
mi@0 27 """
mi@0 28 GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True)
mi@0 29
mi@0 30
mi@0 31 Deterministic Sparse Column Based Matrix Reconstruction via Greedy
mi@0 32 Approximation of SVD. Factorize a data matrix into two matrices s.t.
mi@0 33 F = | data - W*H | is minimal. W is iteratively selected as columns
mi@0 34 of data.
mi@0 35
mi@0 36 Parameters
mi@0 37 ----------
mi@0 38 data : array_like, shape (_data_dimension, _num_samples)
mi@0 39 the input data
mi@0 40 num_bases: int, optional
mi@0 41 Number of bases to compute (column rank of W and row rank of H).
mi@0 42 4 (default)
mi@0 43 k : number of singular vectors for the SVD step of the algorithm
mi@0 44 num_bases (default)
mi@0 45
mi@0 46 Attributes
mi@0 47 ----------
mi@0 48 W : "data_dimension x num_bases" matrix of basis vectors
mi@0 49 H : "num bases x num_samples" matrix of coefficients
mi@0 50 ferr : frobenius norm (after calling .factorize())
mi@0 51
mi@0 52 Example
mi@0 53 -------
mi@0 54 Applying GREEDY to some rather stupid data set:
mi@0 55
mi@0 56 >>> import numpy as np
mi@0 57 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 58 >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10)
mi@0 59 >>> greedy_mdl.factorize()
mi@0 60
mi@0 61 The basis vectors are now stored in greedy_mdl.W, the coefficients in
mi@0 62 greedy_mdl.H. To compute coefficients for an existing set of basis
mi@0 63 vectors simply copy W to greedy_mdl.W, and set compW to False:
mi@0 64
mi@0 65 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0 66 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0 67 >>> greedy_mdl = GREEDY(data, num_bases=2)
mi@0 68 >>> greedy_mdl.W = W
mi@0 69 >>> greedy_mdl.factorize(compute_w=False)
mi@0 70
mi@0 71 The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
mi@0 72 """
mi@0 73
mi@0 74
mi@0 75 def __init__(self, data, k=-1, num_bases=4):
mi@0 76 # call inherited method
mi@0 77 NMF.__init__(self, data, num_bases=num_bases)
mi@0 78 self._k = k
mi@0 79 if self._k == -1:
mi@0 80 self._k = num_bases
mi@0 81
mi@0 82 def update_h(self):
mi@0 83 if scipy.sparse.issparse(self.data):
mi@0 84 self.H = pinv(self.W) * self.data
mi@0 85 else:
mi@0 86 self.H = np.dot(pinv(self.W), self.data)
mi@0 87
mi@0 88 def update_w(self):
mi@0 89 def normalize_matrix(K):
mi@0 90 """ Normalize a matrix K s.t. columns have Euclidean-norm |1|
mi@0 91 """
mi@0 92 if scipy.sparse.issparse(K):
mi@0 93 L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
mi@0 94 s = np.where(L > 0.0)[0]
mi@0 95 L[s] = L[s]**-1
mi@0 96 KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc')
mi@0 97 K = K*KN
mi@0 98 else:
mi@0 99 L = np.sqrt((K**2).sum(axis=0))
mi@0 100 s = np.where(L > 0.0)[0]
mi@0 101 L[s] = L[s]**-1
mi@0 102 K = K*L
mi@0 103 return K
mi@0 104
mi@0 105 self._t = np.zeros((self._num_bases))
mi@0 106 t0 = time.time()
mi@0 107 self.select = []
mi@0 108
mi@0 109 # normalize data
mi@0 110 A = self.data.copy()
mi@0 111
mi@0 112 svd_mdl = SVD(A, k=self._k)
mi@0 113 svd_mdl.factorize()
mi@0 114
mi@0 115 if scipy.sparse.issparse(self.data):
mi@0 116 B = svd_mdl.U * svd_mdl.S
mi@0 117 B = B.tocsc()
mi@0 118 else:
mi@0 119 B = np.dot(svd_mdl.U, svd_mdl.S)
mi@0 120 B = B[:, :self._num_bases]
mi@0 121
mi@0 122 for i in range(self._num_bases):
mi@0 123 A = normalize_matrix(A)
mi@0 124
mi@0 125 if scipy.sparse.issparse(self.data):
mi@0 126 T = B.transpose() * A
mi@0 127 T = np.array(T.multiply(T).sum(axis=0))[0,:]
mi@0 128
mi@0 129 # next selected column index
mi@0 130 T[self.select] = 0.0
mi@0 131 idx = np.argmax(T)
mi@0 132 Aidx = A[:, idx].copy()
mi@0 133 self.select.append(idx)
mi@0 134
mi@0 135 # update B
mi@0 136 BC = Aidx.transpose() * B
mi@0 137 B = B - (Aidx*BC)
mi@0 138
mi@0 139 # update A
mi@0 140 AC = Aidx.transpose() * A
mi@0 141 A = A - (Aidx*AC)
mi@0 142
mi@0 143 else:
mi@0 144 T = np.dot(B.transpose(), A)
mi@0 145 T = np.sum(T**2.0, axis=0)
mi@0 146
mi@0 147 # next selected column index
mi@0 148 T[self.select] = 0.0
mi@0 149 idx = np.argmax(T)
mi@0 150 self.select.append(idx)
mi@0 151
mi@0 152 # update B
mi@0 153 BC = np.dot(B.transpose(),A[:,idx])
mi@0 154 B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))
mi@0 155
mi@0 156 # and A
mi@0 157 AC = np.dot(A.transpose(),A[:,idx])
mi@0 158 A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))
mi@0 159
mi@0 160
mi@0 161 # detect the next best data point
mi@0 162 self._logger.info('searching for next best column ...')
mi@0 163 self._logger.info('cur_columns: ' + str(self.select))
mi@0 164 self._t[i] = time.time() - t0
mi@0 165
mi@0 166 # sort indices, otherwise h5py won't work
mi@0 167 self.W = self.data[:, np.sort(self.select)]
mi@0 168
mi@0 169 # "unsort" it again to keep the correct order
mi@0 170 self.W = self.W[:, np.argsort(np.argsort(self.select))]
mi@0 171
mi@0 172 if __name__ == "__main__":
mi@0 173 import doctest
mi@0 174 doctest.testmod()