#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Simplex Volume Maximization [1]

SIVM: class for SiVM

[1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
Conf. on Information and Knowledge Management. ACM. 2010.
"""

import scipy.sparse
import numpy as np

from dist import *
from aa import AA

__all__ = ["SIVM"]

class SIVM(AA):
    """
    SIVM(data, num_bases=4, dist_measure='l2')

    Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
    found by maximizing the volume of the resulting simplex (see [1]).

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    dist_measure : one of 'l2', 'cosine', 'abs_cosine', 'weighted_abs_cosine',
        'l1', 'kl'
        Default is 'l2', which maximizes the volume of the simplex. In
        contrast, 'cosine' maximizes the volume of a cone (see [1] for
        details).
    init : string (default: 'fastmap')
        'fastmap' or 'origin'. Sets the method used for finding the very first
        basis vector. 'origin' assumes the zero vector, 'fastmap' picks one of
        the two vectors that have the largest pairwise distance.

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : Frobenius norm (after calling .factorize())

    Example
    -------
    Applying SIVM to some rather stupid data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> sivm_mdl = SIVM(data, num_bases=2)
    >>> sivm_mdl.factorize()

    The basis vectors are now stored in sivm_mdl.W, the coefficients in
    sivm_mdl.H. To compute coefficients for an existing set of basis vectors,
    simply copy W to sivm_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> sivm_mdl = SIVM(data, num_bases=2)
    >>> sivm_mdl.W = W
    >>> sivm_mdl.factorize(compute_w=False)

    The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
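
    To maximize the volume of a cone instead of a simplex (see [1]), switch
    the distance measure; a minimal sketch, reusing the data from above:

    >>> sivm_mdl = SIVM(data, num_bases=2, dist_measure='cosine')
    >>> sivm_mdl.factorize()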
    """

    # always overwrite the default number of iterations
    # -> any other value does not make sense.
    _NITER = 1

    def __init__(self, data, num_bases=4, dist_measure='l2', init='fastmap'):
        AA.__init__(self, data, num_bases=num_bases)

        self._dist_measure = dist_measure
        self._init = init

        # assign the correct distance function
        if self._dist_measure == 'l1':
            self._distfunc = l1_distance
        elif self._dist_measure == 'l2':
            self._distfunc = l2_distance
        elif self._dist_measure == 'cosine':
            self._distfunc = cosine_distance
        elif self._dist_measure == 'abs_cosine':
            self._distfunc = abs_cosine_distance
        elif self._dist_measure == 'weighted_abs_cosine':
            self._distfunc = weighted_abs_cosine_distance
        elif self._dist_measure == 'kl':
            self._distfunc = kl_divergence
        else:
            # fail early instead of raising an AttributeError later on
            raise ValueError("unknown dist_measure '%s'" % self._dist_measure)

    def _distance(self, idx):
        """ compute distances of a specific data point to all other samples"""
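        # for dense input, distances are computed in chunks of 50000 columns
        # (keeps the intermediate arrays small); sparse input is processed in
        # a single pass over all columns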
        if scipy.sparse.issparse(self.data):
            step = self.data.shape[1]
        else:
            step = 50000

        d = np.zeros((self.data.shape[1]))
        if idx == -1:
            # set vec to origin if idx=-1
            vec = np.zeros((self.data.shape[0], 1))
            if scipy.sparse.issparse(self.data):
                vec = scipy.sparse.csc_matrix(vec)
        else:
            vec = self.data[:, idx:idx+1]

        self._logger.info('compute distance to node ' + str(idx))

        # slice data into smaller chunks
        for idx_start in range(0, self.data.shape[1], step):
            if idx_start + step > self.data.shape[1]:
                idx_end = self.data.shape[1]
            else:
                idx_end = idx_start + step

            d[idx_start:idx_end] = self._distfunc(
                self.data[:, idx_start:idx_end], vec)
            self._logger.info('completed:' +
                              str(idx_end/(self.data.shape[1]/100.0)) + "%")
        return d

    def init_h(self):
        self.H = np.zeros((self._num_bases, self._num_samples))

    def init_w(self):
        self.W = np.zeros((self._data_dimension, self._num_bases))

    def init_sivm(self):
        self.select = []
        if self._init == 'fastmap':
            # Fastmap-like initialization
            # set the starting index for fastmap initialization
            cur_p = 0

            # after 3 iterations the first "real" index is found
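            # (each iteration jumps to the sample farthest from the current
            # one, so cur_p approximates one endpoint of the most distant
            # pair of samples)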
            for i in range(3):
                d = self._distance(cur_p)
                cur_p = np.argmax(d)

            # store maximal found distance -> later used for "a" (->update_w)
            self._maxd = np.max(d)
            self.select.append(cur_p)

        elif self._init == 'origin':
            # set first vertex to origin
            cur_p = -1
            d = self._distance(cur_p)
            self._maxd = np.max(d)
            self.select.append(cur_p)

    def update_w(self):
        """ compute new W """
        EPS = 10**-8
        self.init_sivm()

        # initialize some of the recursively updated distance measures
        d_square = np.zeros((self.data.shape[1]))
        d_sum = np.zeros((self.data.shape[1]))
        d_i_times_d_j = np.zeros((self.data.shape[1]))
        distiter = np.zeros((self.data.shape[1]))
        a = np.log(self._maxd)
        a_inc = a.copy()
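
        # each new vertex maximizes the (log-domain) simplex volume criterion
        # of [1]: with dl = log-distances of all samples to the vertices
        # selected so far and a = log(max distance), the score
        #   sum_{j<k} dl_j*dl_k + a*sum_j dl_j - (l/2.0)*sum_j dl_j**2
        # is accumulated incrementally in d_i_times_d_j, d_sum and d_square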
        for l in range(1, self._num_bases):
            d = self._distance(self.select[l-1])

            # take the log of d (usually more stable than d)
            d = np.log(d + EPS)

            d_i_times_d_j += d * d_sum
            d_sum += d
            d_square += d**2
            distiter = d_i_times_d_j + a*d_sum - (l/2.0) * d_square

            # detect the next best data point
            self.select.append(np.argmax(distiter))

            self._logger.info('cur_nodes: ' + str(self.select))

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]
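        # (np.argsort(np.argsort(x)) yields the rank of each entry of x,
        # which maps the sorted columns back to their selection order)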

    def factorize(self, show_progress=False, compute_w=True, compute_h=True,
                  compute_err=True, niter=1):
        """ Factorize s.t. WH = data

        Parameters
        ----------
        show_progress : bool
            print some extra information to stdout.
        compute_h : bool
            iteratively update values for H.
        compute_w : bool
            iteratively update values for W.
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].
        niter : int
            ignored; SIVM always uses a single iteration (see _NITER).

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH|.
        """
        AA.factorize(self, niter=1, show_progress=show_progress,
                     compute_w=compute_w, compute_h=compute_h,
                     compute_err=compute_err)


if __name__ == "__main__":
    import doctest
    doctest.testmod()