mi@0
|
1 #!/usr/bin/python2.6
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 #$Id$
|
mi@0
|
7 """
|
mi@0
|
8 PyMF GREEDY[1]
|
mi@0
|
9
|
mi@0
|
10 GREEDY: class for a deterministic SVD based greedy matrix reconstruction [1].
|
mi@0
|
11
|
mi@0
|
12
|
mi@0
|
13 [1] Ali Civril, Malik Magdon-Ismail. Deterministic Sparse Column Based Matrix
|
mi@0
|
14 Reconstruction via Greedy Approximation of SVD. ISAAC'2008.
|
mi@0
|
15 """
|
mi@0
|
16
|
mi@0
|
17
|
mi@0
|
18 import time
|
mi@0
|
19 import scipy.sparse
|
mi@0
|
20 import numpy as np
|
mi@0
|
21 from svd import *
|
mi@0
|
22 from nmf import NMF
|
mi@0
|
23
|
mi@0
|
24 __all__ = ["GREEDY"]
|
mi@0
|
25
|
mi@0
|
26 class GREEDY(NMF):
|
mi@0
|
27 """
|
mi@0
|
28 GREEDYVOL(data, num_bases=4, niter=100, show_progress=True, compW=True)
|
mi@0
|
29
|
mi@0
|
30
|
mi@0
|
31 Deterministic Sparse Column Based Matrix Reconstruction via Greedy
|
mi@0
|
32 Approximation of SVD. Factorize a data matrix into two matrices s.t.
|
mi@0
|
33 F = | data - W*H | is minimal. W is iteratively selected as columns
|
mi@0
|
34 of data.
|
mi@0
|
35
|
mi@0
|
36 Parameters
|
mi@0
|
37 ----------
|
mi@0
|
38 data : array_like, shape (_data_dimension, _num_samples)
|
mi@0
|
39 the input data
|
mi@0
|
40 num_bases: int, optional
|
mi@0
|
41 Number of bases to compute (column rank of W and row rank of H).
|
mi@0
|
42 4 (default)
|
mi@0
|
43 k : number of singular vectors for the SVD step of the algorithm
|
mi@0
|
44 num_bases (default)
|
mi@0
|
45
|
mi@0
|
46 Attributes
|
mi@0
|
47 ----------
|
mi@0
|
48 W : "data_dimension x num_bases" matrix of basis vectors
|
mi@0
|
49 H : "num bases x num_samples" matrix of coefficients
|
mi@0
|
50 ferr : frobenius norm (after calling .factorize())
|
mi@0
|
51
|
mi@0
|
52 Example
|
mi@0
|
53 -------
|
mi@0
|
54 Applying GREEDY to some rather stupid data set:
|
mi@0
|
55
|
mi@0
|
56 >>> import numpy as np
|
mi@0
|
57 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
58 >>> greedy_mdl = GREEDY(data, num_bases=2, niter=10)
|
mi@0
|
59 >>> greedy_mdl.factorize()
|
mi@0
|
60
|
mi@0
|
61 The basis vectors are now stored in greedy_mdl.W, the coefficients in
|
mi@0
|
62 greedy_mdl.H. To compute coefficients for an existing set of basis
|
mi@0
|
63 vectors simply copy W to greedy_mdl.W, and set compW to False:
|
mi@0
|
64
|
mi@0
|
65 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
|
mi@0
|
66 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
|
mi@0
|
67 >>> greedy_mdl = GREEDY(data, num_bases=2)
|
mi@0
|
68 >>> greedy_mdl.W = W
|
mi@0
|
69 >>> greedy_mdl.factorize(compute_w=False)
|
mi@0
|
70
|
mi@0
|
71 The result is a set of coefficients greedy_mdl.H, s.t. data = W * greedy_mdl.H.
|
mi@0
|
72 """
|
mi@0
|
73
|
mi@0
|
74
|
mi@0
|
75 def __init__(self, data, k=-1, num_bases=4):
|
mi@0
|
76 # call inherited method
|
mi@0
|
77 NMF.__init__(self, data, num_bases=num_bases)
|
mi@0
|
78 self._k = k
|
mi@0
|
79 if self._k == -1:
|
mi@0
|
80 self._k = num_bases
|
mi@0
|
81
|
mi@0
|
82 def update_h(self):
|
mi@0
|
83 if scipy.sparse.issparse(self.data):
|
mi@0
|
84 self.H = pinv(self.W) * self.data
|
mi@0
|
85 else:
|
mi@0
|
86 self.H = np.dot(pinv(self.W), self.data)
|
mi@0
|
87
|
mi@0
|
88 def update_w(self):
|
mi@0
|
89 def normalize_matrix(K):
|
mi@0
|
90 """ Normalize a matrix K s.t. columns have Euclidean-norm |1|
|
mi@0
|
91 """
|
mi@0
|
92 if scipy.sparse.issparse(K):
|
mi@0
|
93 L = np.sqrt(np.array(K.multiply(K).sum(axis=0)))[0,:]
|
mi@0
|
94 s = np.where(L > 0.0)[0]
|
mi@0
|
95 L[s] = L[s]**-1
|
mi@0
|
96 KN = scipy.sparse.spdiags(L,0,len(L),len(L),format='csc')
|
mi@0
|
97 K = K*KN
|
mi@0
|
98 else:
|
mi@0
|
99 L = np.sqrt((K**2).sum(axis=0))
|
mi@0
|
100 s = np.where(L > 0.0)[0]
|
mi@0
|
101 L[s] = L[s]**-1
|
mi@0
|
102 K = K*L
|
mi@0
|
103 return K
|
mi@0
|
104
|
mi@0
|
105 self._t = np.zeros((self._num_bases))
|
mi@0
|
106 t0 = time.time()
|
mi@0
|
107 self.select = []
|
mi@0
|
108
|
mi@0
|
109 # normalize data
|
mi@0
|
110 A = self.data.copy()
|
mi@0
|
111
|
mi@0
|
112 svd_mdl = SVD(A, k=self._k)
|
mi@0
|
113 svd_mdl.factorize()
|
mi@0
|
114
|
mi@0
|
115 if scipy.sparse.issparse(self.data):
|
mi@0
|
116 B = svd_mdl.U * svd_mdl.S
|
mi@0
|
117 B = B.tocsc()
|
mi@0
|
118 else:
|
mi@0
|
119 B = np.dot(svd_mdl.U, svd_mdl.S)
|
mi@0
|
120 B = B[:, :self._num_bases]
|
mi@0
|
121
|
mi@0
|
122 for i in range(self._num_bases):
|
mi@0
|
123 A = normalize_matrix(A)
|
mi@0
|
124
|
mi@0
|
125 if scipy.sparse.issparse(self.data):
|
mi@0
|
126 T = B.transpose() * A
|
mi@0
|
127 T = np.array(T.multiply(T).sum(axis=0))[0,:]
|
mi@0
|
128
|
mi@0
|
129 # next selected column index
|
mi@0
|
130 T[self.select] = 0.0
|
mi@0
|
131 idx = np.argmax(T)
|
mi@0
|
132 Aidx = A[:, idx].copy()
|
mi@0
|
133 self.select.append(idx)
|
mi@0
|
134
|
mi@0
|
135 # update B
|
mi@0
|
136 BC = Aidx.transpose() * B
|
mi@0
|
137 B = B - (Aidx*BC)
|
mi@0
|
138
|
mi@0
|
139 # update A
|
mi@0
|
140 AC = Aidx.transpose() * A
|
mi@0
|
141 A = A - (Aidx*AC)
|
mi@0
|
142
|
mi@0
|
143 else:
|
mi@0
|
144 T = np.dot(B.transpose(), A)
|
mi@0
|
145 T = np.sum(T**2.0, axis=0)
|
mi@0
|
146
|
mi@0
|
147 # next selected column index
|
mi@0
|
148 T[self.select] = 0.0
|
mi@0
|
149 idx = np.argmax(T)
|
mi@0
|
150 self.select.append(idx)
|
mi@0
|
151
|
mi@0
|
152 # update B
|
mi@0
|
153 BC = np.dot(B.transpose(),A[:,idx])
|
mi@0
|
154 B -= np.dot(A[:,idx].reshape(-1,1), BC.reshape(1,-1))
|
mi@0
|
155
|
mi@0
|
156 # and A
|
mi@0
|
157 AC = np.dot(A.transpose(),A[:,idx])
|
mi@0
|
158 A -= np.dot(A[:,idx].reshape(-1,1), AC.reshape(1,-1))
|
mi@0
|
159
|
mi@0
|
160
|
mi@0
|
161 # detect the next best data point
|
mi@0
|
162 self._logger.info('searching for next best column ...')
|
mi@0
|
163 self._logger.info('cur_columns: ' + str(self.select))
|
mi@0
|
164 self._t[i] = time.time() - t0
|
mi@0
|
165
|
mi@0
|
166 # sort indices, otherwise h5py won't work
|
mi@0
|
167 self.W = self.data[:, np.sort(self.select)]
|
mi@0
|
168
|
mi@0
|
169 # "unsort" it again to keep the correct order
|
mi@0
|
170 self.W = self.W[:, np.argsort(np.argsort(self.select))]
|
mi@0
|
171
|
mi@0
|
172 if __name__ == "__main__":
|
mi@0
|
173 import doctest
|
mi@0
|
174 doctest.testmod()
|