#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Matrix sampling methods

SUB: apply one of the matrix factorization methods of PyMF
to sampled data to compute W, then compute H on the full data.

Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL).
"""


import numpy as np
import random
#from itertools import combinations
from chnmf import combinations

import dist
from chnmf import quickhull
from nmf import NMF
from pca import PCA
from kmeans import Kmeans
from laesa import LAESA
from sivm import SIVM

__all__ = ["SUB"]

class SUB(NMF):
    """
    SUB(data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2,
    num_bases=3, niterH=1, compute_h=True, compute_w=True, sstrategy='rand')

    Evaluate a matrix factorization method "mfmethod" for a certain sampling
    strategy "sstrategy". This is particularly useful for very large datasets.

    Parameters
    ----------
    data : array_like
        the input data ("data_dimension x num_samples")
    mfmethod : PyMF factorization class
        the factorization method applied to the sampled data, e.g. NMF
    nsub : int, optional
        number of sampled data points (default: 20)
    show_progress : bool, optional
        print progress information (default: True)
    mapW : bool, optional
        map the computed basis vectors W back to the nearest data points
        of the full data set (default: False)
    base_sel : int, optional
        number of PCA dimensions used by the 'hull' sampling strategy
        (default: 2)
    num_bases : int, optional
        number of basis vectors W (default: 3)
    niterH : int, optional
        number of iterations for computing H (default: 1)
    compute_h, compute_w : bool, optional
        compute the coefficients H and/or the basis W (default: True)
    sstrategy : string, optional
        sampling strategy, one of 'rand', 'cur', 'kmeans', 'hull',
        'laesa', 'sivm' (default: 'rand')

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : frobenius norm of the reconstruction error (after factorize())
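
    Example
    -------
    A minimal usage sketch (assuming a random data matrix and NMF as the
    wrapped factorization method; the parameter values are illustrative):

        data = np.random.random((10, 100))
        sub_mdl = SUB(data, NMF, nsub=10, num_bases=2, sstrategy='rand')
        sub_mdl.factorize()
        # sub_mdl.W and sub_mdl.H now hold the factorization of the full data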
    """

    def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2,
                 num_bases=3, niterH=1, compute_h=True, compute_w=True, sstrategy='rand'):
        NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h, show_progress=show_progress, compute_w=compute_w)

        self._niterH = niterH
        self._nsub = nsub
        self.data = data
        self._mfmethod = mfmethod
        self._mapW = mapW
        self._sstrategy = sstrategy
        self._base_sel = base_sel

        # assign the correct sampling function
        if self._sstrategy == 'cur':
            self._subfunc = self.curselect

        elif self._sstrategy == 'kmeans':
            self._subfunc = self.kmeansselect

        elif self._sstrategy == 'hull':
            self._subfunc = self.hullselect

        elif self._sstrategy == 'laesa':
            self._subfunc = self.laesaselect

        elif self._sstrategy == 'sivm':
            self._subfunc = self.sivmselect

        else:
            self._subfunc = self.randselect

    def hullselect(self):
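        """ select indices of data points that lie on the convex hull of
        pairwise 2D projections of the PCA coefficients
        (sampling strategy 'hull') """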

        def selectHullPoints(data, n=20):
            """ select data points for pairwise projections of the first n
            dimensions """

            # iterate over all projections and select data points
            idx = np.array([])

            # iterate over some pairwise combinations of dimensions
            for i in combinations(range(n), 2):

                # sample convex hull points in 2D projection
                convex_hull_d = quickhull(data[i, :].T)

                # get indices for convex hull data points
                idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T))
                idx = np.unique(idx)

            return np.int32(idx)


        # determine convex hull data points only if the total
        # amount of available data is >50
        #if self.data.shape[1] > 50:
        pcamodel = PCA(self.data, show_progress=self._show_progress)
        pcamodel.factorize()

        idx = selectHullPoints(pcamodel.H, n=self._base_sel)

        # set the number of subsampled data points
        self._nsub = len(idx)

        return idx

    def kmeansselect(self):
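        """ cluster the data into _nsub clusters via k-means and return the
        indices of the data points closest to the cluster centres
        (sampling strategy 'kmeans') """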
        kmeans_mdl = Kmeans(self.data, num_bases=self._nsub)
        kmeans_mdl.initialization()
        kmeans_mdl.factorize()

        # pick data samples closest to the centres
        idx = dist.vq(kmeans_mdl.data, kmeans_mdl.W)
        return idx

    def curselect(self):
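        """ sample _nsub column indices with probability proportional to the
        squared Euclidean norm of each column, as in CUR-style column
        sampling (sampling strategy 'cur') """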
        def sample_probability():
            dsquare = self.data[:,:]**2

            pcol = np.array(dsquare.sum(axis=0))
            pcol /= pcol.sum()

            return pcol.reshape(-1,1)

        probs = sample_probability()
        prob_cols = np.cumsum(probs.flatten())
        temp_ind = np.zeros(self._nsub, np.int32)

        for i in range(self._nsub):
            tempI = np.where(prob_cols >= np.random.rand())[0]
            temp_ind[i] = tempI[0]

        return np.sort(temp_ind)

    def sivmselect(self):
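        """ select _nsub data points using SIVM (simplex volume maximization)
        with the cosine distance measure (sampling strategy 'sivm') """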
        sivmmdl = SIVM(self.data, num_bases=self._nsub, compute_w=True, compute_h=False, dist_measure='cosine')

        sivmmdl.initialization()
        sivmmdl.factorize()
        idx = sivmmdl.select
        return idx

    def laesaselect(self):
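        """ select _nsub data points using LAESA with the cosine distance
        measure (sampling strategy 'laesa') """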
        laesamdl = LAESA(self.data, num_bases=self._nsub, compute_w=True, compute_h=False, dist_measure='cosine')
        laesamdl.initialization()
        laesamdl.factorize()
        idx = laesamdl.select
        return idx

    def randselect(self):
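        """ uniformly sample _nsub column indices without replacement
        (sampling strategy 'rand', the default) """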
        idx = random.sample(xrange(self._num_samples), self._nsub)
        return np.sort(np.int32(idx))

    def update_w(self):
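        """ compute W by factorizing a subsample of the data (selected by the
        chosen sampling strategy) with the wrapped factorization method, then
        set up a model on the full data with this W fixed (or, if mapW is set,
        with W mapped back to the nearest data points). """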

        idx = self._subfunc()
        idx = np.sort(np.int32(idx))

        mdl_small = self._mfmethod(self.data[:, idx],
                                   num_bases=self._num_bases,
                                   show_progress=self._show_progress,
                                   compute_w=True)

        # initialize W, H, and beta
        mdl_small.initialization()

        # determine W
        mdl_small.factorize()

        self.mdl = self._mfmethod(self.data[:, :],
                                  num_bases=self._num_bases,
                                  show_progress=self._show_progress,
                                  compute_w=False)

        self.mdl.initialization()

        if self._mapW:
            # compute pairwise distances
            #distance = vq(self.data, self.W)
            _Wmapped_index = dist.vq(self.mdl.data, mdl_small.W)

            # do not directly assign, i.e. Wdist = self.data[:,sel],
            # as sel might be unsorted (in non-ascending order)
            # -> sorting sel would screw up the matching to W if
            # self.data is stored as an hdf5 table (see h5py)
            for i, s in enumerate(_Wmapped_index):
                self.mdl.W[:,i] = self.mdl.data[:,s]
        else:
            self.mdl.W = np.copy(mdl_small.W)

    def update_h(self):
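        """ compute H for the full data set by factorizing with the basis W
        held fixed (the full-data model was created with compute_w=False). """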
        self.mdl.factorize()

    def factorize(self):
        """ Do the factorization s.t. data = dot(W,H): compute the basis W
        from a subsample of the data selected by the chosen sampling strategy,
        then compute the coefficients H on the full data.
        """
        # compute the basis vectors from the subsampled data
        self.update_w()

        # for CHNMF it is sometimes useful to only compute
        # the basis vectors
        if self._compute_h:
            self.update_h()

        self.W = self.mdl.W
        self.H = self.mdl.H

        self.ferr = np.zeros(1)
        self.ferr[0] = self.mdl.frobenius_norm()
        self._print_cur_status(' Fro:' + str(self.ferr[0]))

if __name__ == "__main__":
    import doctest
    doctest.testmod()