mi@0
|
1 #!/usr/bin/python
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Geometric-Map
|
mi@0
|
8
|
mi@0
|
9 GMAP: Class for Geometric-Map
|
mi@0
|
10 """
|
mi@0
|
11
|
mi@0
|
12
|
mi@0
|
13 import scipy.sparse
|
mi@0
|
14 import numpy as np
|
mi@0
|
15
|
mi@0
|
16 from dist import *
|
mi@0
|
17 from aa import AA
|
mi@0
|
18 from kmeans import Kmeans
|
mi@0
|
19
|
mi@0
|
20 __all__ = ["GMAP"]
|
mi@0
|
21
|
mi@0
|
class GMAP(AA):
    """
    GMAP(data, num_bases=4, method='pca', robust_map=True)

    Geometric-Map. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | is minimal. G-MAP can emulate/approximate several
    standard methods including PCA, NMF, and AA.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases: int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    method : one of 'pca' ,'nmf', 'aa', default is 'pca' which emulates
        Principal Component Analysis using the geometric map method ('nmf'
        emulates Non-negative Matrix Factorization, 'aa' emulates Archetypal
        Analysis).
    robust_map : bool, optional
        use robust_map or the standard max-val selection
        [see "On FastMap and the Convex Hull of Multivariate Data: Toward
        Fast and Robust Dimension Reduction", Ostrouchov and Samatova, PAMI
        2005]

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num bases x num_samples" matrix of coefficients
    ferr : frobenius norm (after calling .factorize())

    Example
    -------
    Applying GMAP to some rather stupid data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> gmap_mdl = GMAP(data, num_bases=2)
    >>> gmap_mdl.factorize()

    The basis vectors are now stored in gmap_mdl.W, the coefficients in gmap_mdl.H.
    To compute coefficients for an existing set of basis vectors simply copy W
    to gmap_mdl.W, and set compute_w to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> gmap_mdl = GMAP(data, num_bases=2)
    >>> gmap_mdl.W = W
    >>> gmap_mdl.factorize(compute_w=False)

    The result is a set of coefficients gmap_mdl.H, s.t. data = W * gmap_mdl.H.
    """

    # Always overwrite the default number of iterations: the greedy
    # basis selection is a single pass, so any other value does not
    # make sense.
    _NITER = 1

    def __init__(self, data, num_bases=4, method='pca', robust_map=True):
        AA.__init__(self, data, num_bases=num_bases)
        # indices of candidate samples inspected during robust selection
        self.sub = []
        self._robust_map = robust_map
        self._method = method

    def init_h(self):
        """ Initialize H to zeros; values are filled in by the AA update. """
        self.H = np.zeros((self._num_bases, self._num_samples))

    def init_w(self):
        """ Initialize W to zeros; columns are assigned in update_w(). """
        self.W = np.zeros((self._data_dimension, self._num_bases))

    def update_w(self):
        """ Compute new W by greedily selecting num_bases data samples. """

        def select_next(iterval):
            """ Select the next best data sample: with robust map, cluster
            the top-scoring candidates and take the sample closest to the
            largest cluster's center; otherwise simply take the argmax of
            iterval. """
            if self._robust_map:
                # candidate indices, best score first
                k = np.argsort(iterval)[::-1]
                d_sub = self.data[:, k[:self._robust_nselect]]
                self.sub.extend(k[:self._robust_nselect])

                # cluster d_sub
                kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster)
                kmeans_mdl.factorize(niter=10)

                # get largest cluster and pick the candidate closest to
                # its center
                h = np.histogram(kmeans_mdl.assigned,
                                 range(self._robust_cluster + 1))[0]
                largest_cluster = np.argmax(h)
                sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster + 1],
                            d_sub)
                sel = k[np.argmin(sel)]
            else:
                sel = np.argmax(iterval)

            return sel

        # per-sample l2 norms (sparse and dense paths)
        if scipy.sparse.issparse(self.data):
            norm_data = np.sqrt(self.data.multiply(self.data).sum(axis=0))
            norm_data = np.array(norm_data).reshape((-1))
        else:
            norm_data = np.sqrt(np.sum(self.data**2, axis=0))

        self.select = []

        # initial score: sample norm for pca/aa, or (for nmf) one minus the
        # normalized column sum
        if self._method == 'pca' or self._method == 'aa':
            iterval = norm_data.copy()

        if self._method == 'nmf':
            iterval = np.sum(self.data, axis=0) / (np.sqrt(self.data.shape[0]) * norm_data)
            iterval = 1.0 - iterval

        self.select.append(select_next(iterval))

        for l in range(1, self._num_bases):
            # cosine similarity of all samples to the last selected one
            if scipy.sparse.issparse(self.data):
                c = self.data[:, self.select[-1]:self.select[-1] + 1].T * self.data
                c = np.array(c.todense())
            else:
                c = np.dot(self.data[:, self.select[-1]], self.data)

            c = c / (norm_data * norm_data[self.select[-1]])

            if self._method == 'pca':
                c = 1.0 - np.abs(c)
                c = c * norm_data

            elif self._method == 'aa':
                c = (c * -1.0 + 1.0) / 2.0
                c = c * norm_data

            elif self._method == 'nmf':
                c = 1.0 - np.abs(c)

            ### update the estimated volume
            iterval = c * iterval

            # detect the next best data point
            self.select.append(select_next(iterval))

            self._logger.info('cur_nodes: ' + str(self.select))

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]

    def factorize(self, show_progress=False, compute_w=True, compute_h=True,
                  compute_err=True, robust_cluster=3, niter=1,
                  robust_nselect=-1):
        """ Factorize s.t. WH = data

        Parameters
        ----------
        show_progress : bool
            print some extra information to stdout.
            False, default
        compute_h : bool
            iteratively update values for H.
            True, default
        compute_w : bool
            iteratively update values for W.
            default, True
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].
        robust_cluster : int, optional
            set the number of clusters for robust map selection.
            3, default
        niter : int, optional
            accepted for interface compatibility but ignored; the greedy
            selection is a single pass (see _NITER).
        robust_nselect : int, optional
            set the number of samples to consider for robust map
            selection.
            -1, default (automatically determine suitable number)

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH|.
        """
        self._robust_cluster = robust_cluster
        self._robust_nselect = robust_nselect

        if self._robust_nselect == -1:
            # np.round returns a float; cast to int so the value can be
            # used as a slice bound in select_next() (float indices raise
            # TypeError on current Python/NumPy versions).
            self._robust_nselect = int(np.round(np.log(self.data.shape[1]) * 2))

        AA.factorize(self, niter=1, show_progress=show_progress,
                     compute_w=compute_w, compute_h=compute_h,
                     compute_err=compute_err)
|
mi@0
|
216
|
mi@0
|
if __name__ == "__main__":
    # Exercise the usage examples embedded in the docstrings above.
    import doctest

    doctest.testmod()
|