comparison pymf/greedy.py @ 0:26838b1f560f

initial commit of a segmenter project
author mi tian
date Thu, 02 Apr 2015 18:09:27 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26838b1f560f
1 #!/usr/bin/python2.6
2 #
3 # Copyright (C) Christian Thurau, 2010.
4 # Licensed under the GNU General Public License (GPL).
5 # http://www.gnu.org/licenses/gpl.txt
6 #$Id$
7 """
8 PyMF GREEDY[1]
9
10 GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1].
11
12
13 [1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix
14 Reconstruction via Greedy Approximation of SVD. ISAAC'2008.
15 """
16
17
18 import time
19 import scipy.sparse
20 import numpy as np
21 from svd import *
22 from nmf import NMF
23
24 __all__ = ["GREEDY"]
25
26 class GREEDY(NMF):
27 """
28 GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True)
29
30
31 Deterministic Sparse Column Based Matrix Reconstruction via Greedy
32 Approximation of SVD. Factorize a data matrix into two matrices s.t.
33 F = | data - W*H | is minimal. W is iteratively selected as columns
34 of data.
35
36 Parameters
37 ----------
38 data : array_like, shape (_data_dimension, _num_samples)
39 the input data
40 num_bases: int, optional
41 Number of bases to compute (column rank of W and row rank of H).
42 4 (default)
43 k : number of singular vectors for the SVD step of the algorithm
44 num_bases (default)
45
46 Attributes
47 ----------
48 W : "data_dimension x num_bases" matrix of basis vectors
49 H : "num bases x num_samples" matrix of coefficients
50 ferr : frobenius norm (after calling .factorize())
51
52 Example
53 -------
54 Applying GREEDY to some rather stupid data set:
55
56 >>> import numpy as np
57 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
58 >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10)
59 >>> greedy_mdl.factorize()
60
61 The basis vectors are now stored in greedy_mdl.W, the coefficients in
62 greedy_mdl.H. To compute coefficients for an existing set of basis
63 vectors simply copy W to greedy_mdl.W, and set compW to False:
64
65 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
66 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
67 >>> greedy_mdl = GREEDY(data, num_bases=2)
68 >>> greedy_mdl.W = W
69 >>> greedy_mdl.factorize(compute_w=False)
70
71 The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
72 """
73
74
75 def __init__(self, data, k=-1, num_bases=4):
76 # call inherited method
77 NMF.__init__(self, data, num_bases=num_bases)
78 self._k = k
79 if self._k == -1:
80 self._k = num_bases
81
82 def update_h(self):
83 if scipy.sparse.issparse(self.data):
84 self.H = pinv(self.W) * self.data
85 else:
86 self.H = np.dot(pinv(self.W), self.data)
87
88 def update_w(self):
89 def normalize_matrix(K):
90 """ Normalize a matrix K s.t. columns have Euclidean-norm |1|
91 """
92 if scipy.sparse.issparse(K):
93 L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
94 s = np.where(L > 0.0)[0]
95 L[s] = L[s]**-1
96 KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc')
97 K = K*KN
98 else:
99 L = np.sqrt((K**2).sum(axis=0))
100 s = np.where(L > 0.0)[0]
101 L[s] = L[s]**-1
102 K = K*L
103 return K
104
105 self._t = np.zeros((self._num_bases))
106 t0 = time.time()
107 self.select = []
108
109 # normalize data
110 A = self.data.copy()
111
112 svd_mdl = SVD(A, k=self._k)
113 svd_mdl.factorize()
114
115 if scipy.sparse.issparse(self.data):
116 B = svd_mdl.U * svd_mdl.S
117 B = B.tocsc()
118 else:
119 B = np.dot(svd_mdl.U, svd_mdl.S)
120 B = B[:, :self._num_bases]
121
122 for i in range(self._num_bases):
123 A = normalize_matrix(A)
124
125 if scipy.sparse.issparse(self.data):
126 T = B.transpose() * A
127 T = np.array(T.multiply(T).sum(axis=0))[0,:]
128
129 # next selected column index
130 T[self.select] = 0.0
131 idx = np.argmax(T)
132 Aidx = A[:, idx].copy()
133 self.select.append(idx)
134
135 # update B
136 BC = Aidx.transpose() * B
137 B = B - (Aidx*BC)
138
139 # update A
140 AC = Aidx.transpose() * A
141 A = A - (Aidx*AC)
142
143 else:
144 T = np.dot(B.transpose(), A)
145 T = np.sum(T**2.0, axis=0)
146
147 # next selected column index
148 T[self.select] = 0.0
149 idx = np.argmax(T)
150 self.select.append(idx)
151
152 # update B
153 BC = np.dot(B.transpose(),A[:,idx])
154 B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))
155
156 # and A
157 AC = np.dot(A.transpose(),A[:,idx])
158 A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))
159
160
161 # detect the next best data point
162 self._logger.info('searching for next best column ...')
163 self._logger.info('cur_columns: ' + str(self.select))
164 self._t[i] = time.time() - t0
165
166 # sort indices, otherwise h5py won't work
167 self.W = self.data[:, np.sort(self.select)]
168
169 # "unsort" it again to keep the correct order
170 self.W = self.W[:, np.argsort(np.argsort(self.select))]
171
172 if __name__ == "__main__":
173 import doctest
174 doctest.testmod()