mi@0
|
1 #!/usr/bin/python2.6
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Simplex Volume Maximization [1]
|
mi@0
|
8
|
mi@0
|
9 SIVM_GSAT: class for gsat-SiVM
|
mi@0
|
10
|
mi@0
|
11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
|
mi@0
|
12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
|
mi@0
|
13 Conf. on Information and Knowledge Management. ACM. 2010.
|
mi@0
|
14 """
|
mi@0
|
15
|
mi@0
|
16
|
mi@0
|
17 import logging
|
mi@0
|
18 import numpy as np
|
mi@0
|
19 from dist import *
|
mi@0
|
20 from vol import cmdet
|
mi@0
|
21 from sivm import SIVM
|
mi@0
|
22
|
mi@0
|
23 __all__ = ["SIVM_GSAT"]
|
mi@0
|
24
|
mi@0
|
25 class SIVM_GSAT(SIVM):
|
mi@0
|
26 """
|
mi@0
|
27 SIVM(data, num_bases=4, dist_measure='l2')
|
mi@0
|
28
|
mi@0
|
29
|
mi@0
|
30 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
|
mi@0
|
31 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
|
mi@0
|
32 found by maximizing the volume of the resulting simplex (see [1]). Can be
|
mi@0
|
33 applied to data streams using the .online_update_w(vec) function which decides
|
mi@0
|
34 on adding data sample "vec" to the already selected basis vectors.
|
mi@0
|
35
|
mi@0
|
36 Parameters
|
mi@0
|
37 ----------
|
mi@0
|
38 data : array_like, shape (_data_dimension, _num_samples)
|
mi@0
|
39 the input data
|
mi@0
|
40 num_bases: int, optional
|
mi@0
|
41 Number of bases to compute (column rank of W and row rank of H).
|
mi@0
|
42 4 (default)
|
mi@0
|
43 dist_measure : one of 'l2' ,'cosine', 'l1', 'kl'
|
mi@0
|
44 Standard is 'l2' which maximizes the volume of the simplex. In contrast,
|
mi@0
|
45 'cosine' maximizes the volume of a cone (see [1] for details).
|
mi@0
|
46 init : string (default: 'fastmap')
|
mi@0
|
47 'fastmap' or 'origin'. Sets the method used for finding the very first
|
mi@0
|
48 basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
|
mi@0
|
49 the two vectors that have the largest pairwise distance.
|
mi@0
|
50 Attributes
|
mi@0
|
51 ----------
|
mi@0
|
52 W : "data_dimension x num_bases" matrix of basis vectors
|
mi@0
|
53 H : "num bases x num_samples" matrix of coefficients
|
mi@0
|
54 ferr : frobenius norm (after calling .factorize())
|
mi@0
|
55
|
mi@0
|
56 Example
|
mi@0
|
57 -------
|
mi@0
|
58 Applying SIVM to some rather stupid data set:
|
mi@0
|
59
|
mi@0
|
60 >>> import numpy as np
|
mi@0
|
61 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
62 >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
|
mi@0
|
63 >>> sivm_mdl.factorize()
|
mi@0
|
64
|
mi@0
|
65 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
|
mi@0
|
66 To compute coefficients for an existing set of basis vectors simply copy W
|
mi@0
|
67 to sivm_mdl.W, and set compute_w to False:
|
mi@0
|
68
|
mi@0
|
69 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
|
mi@0
|
70 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
|
mi@0
|
71 >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
|
mi@0
|
72 >>> sivm_mdl.W = W
|
mi@0
|
73 >>> sivm_mdl.factorize(compute_w=False)
|
mi@0
|
74
|
mi@0
|
75 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
|
mi@0
|
76 """
|
mi@0
|
77
|
mi@0
|
78 def init_w(self):
|
mi@0
|
79 self.select = range(self._num_bases)
|
mi@0
|
80 self.W = self.data[:, self.select]
|
mi@0
|
81
|
mi@0
|
82 def online_update_w(self, vec):
|
mi@0
|
83 # update D if it does not exist
|
mi@0
|
84 k = self._num_bases
|
mi@0
|
85 if not hasattr(self, 'D'):
|
mi@0
|
86 self.D = np.zeros((k + 1, k + 1))
|
mi@0
|
87 self.D[:k, :k] = pdist(self.W, self.W)
|
mi@0
|
88 self.V = cmdet(self.D[:k, :k])
|
mi@0
|
89
|
mi@0
|
90 tmp_d = self._distfunc(self.W, vec.reshape((-1,1)))
|
mi@0
|
91 self.D[k, :-1] = tmp_d
|
mi@0
|
92 self.D[:-1, k] = tmp_d
|
mi@0
|
93
|
mi@0
|
94 v = np.zeros((self._num_bases + 1))
|
mi@0
|
95
|
mi@0
|
96 for i in range(self._num_bases):
|
mi@0
|
97 # compute volume for each combination...
|
mi@0
|
98 s = np.setdiff1d(range(self._num_bases + 1), [i])
|
mi@0
|
99 v[i] = cmdet((self.D[s,:])[:,s])
|
mi@0
|
100
|
mi@0
|
101 # select index that maximizes the volume
|
mi@0
|
102 v[-1] = self.V
|
mi@0
|
103 s = np.argmax(v)
|
mi@0
|
104
|
mi@0
|
105 if s < self._num_bases:
|
mi@0
|
106 self.W[:,s] = vec
|
mi@0
|
107 self.D[:self._num_bases, :self._num_bases] = pdist(self.W, self.W)
|
mi@0
|
108
|
mi@0
|
109 if not hasattr(self, '_v'):
|
mi@0
|
110 self._v = [self.V]
|
mi@0
|
111 self.V = v[s]
|
mi@0
|
112 self._v.append(v[s])
|
mi@0
|
113
|
mi@0
|
114 self._logger.info('Volume increased:' + str(self.V))
|
mi@0
|
115 return True, s
|
mi@0
|
116
|
mi@0
|
117 return False,-1
|
mi@0
|
118
|
mi@0
|
119 def update_w(self):
|
mi@0
|
120 n = np.int(np.floor(np.random.random() * self._num_samples))
|
mi@0
|
121 if n not in self.select:
|
mi@0
|
122 updated, s = self.online_update_w(self.data[:,n])
|
mi@0
|
123 if updated:
|
mi@0
|
124 self.select[s] = n
|
mi@0
|
125 self._logger.info('Current selection:' + str(self.select))
|
mi@0
|
126
|
mi@0
|
127
|
mi@0
|
128 def factorize(self, show_progress=False, compute_w=True, compute_h=True,
|
mi@0
|
129 compute_err=True, niter=1):
|
mi@0
|
130 """ Factorize s.t. WH = data
|
mi@0
|
131
|
mi@0
|
132 Parameters
|
mi@0
|
133 ----------
|
mi@0
|
134 show_progress : bool
|
mi@0
|
135 print some extra information to stdout.
|
mi@0
|
136 niter : int
|
mi@0
|
137 number of iterations.
|
mi@0
|
138 compute_h : bool
|
mi@0
|
139 iteratively update values for H.
|
mi@0
|
140 compute_w : bool
|
mi@0
|
141 iteratively update values for W.
|
mi@0
|
142 compute_err : bool
|
mi@0
|
143 compute Frobenius norm |data-WH| after each update and store
|
mi@0
|
144 it to .ferr[k].
|
mi@0
|
145
|
mi@0
|
146 Updated Values
|
mi@0
|
147 --------------
|
mi@0
|
148 .W : updated values for W.
|
mi@0
|
149 .H : updated values for H.
|
mi@0
|
150 .ferr : Frobenius norm |data-WH|.
|
mi@0
|
151 """
|
mi@0
|
152 if show_progress:
|
mi@0
|
153 self._logger.setLevel(logging.INFO)
|
mi@0
|
154 else:
|
mi@0
|
155 self._logger.setLevel(logging.ERROR)
|
mi@0
|
156
|
mi@0
|
157 # create W and H if they don't already exist
|
mi@0
|
158 # -> any custom initialization to W,H should be done before
|
mi@0
|
159 if not hasattr(self,'W'):
|
mi@0
|
160 self.init_w()
|
mi@0
|
161
|
mi@0
|
162 if not hasattr(self,'H'):
|
mi@0
|
163 self.init_h()
|
mi@0
|
164
|
mi@0
|
165 if compute_err:
|
mi@0
|
166 self.ferr = np.zeros(niter)
|
mi@0
|
167
|
mi@0
|
168 for i in xrange(niter):
|
mi@0
|
169 if compute_w:
|
mi@0
|
170 self.update_w()
|
mi@0
|
171
|
mi@0
|
172 if compute_h:
|
mi@0
|
173 self.update_h()
|
mi@0
|
174
|
mi@0
|
175 if compute_err:
|
mi@0
|
176 self.ferr[i] = self.frobenius_norm()
|
mi@0
|
177 self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) +
|
mi@0
|
178 ' FN:' + str(self.ferr[i]))
|
mi@0
|
179 else:
|
mi@0
|
180 self._logger.info('Iteration ' + str(i+1) + '/' + str(niter))
|
mi@0
|
181
|
mi@0
|
182
|
mi@0
|
183 if __name__ == "__main__":
|
mi@0
|
184 import doctest
|
mi@0
|
185 doctest.testmod()
|