mi@0
|
1 #!/usr/bin/python2.6
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Simplex Volume Maximization [1]
|
mi@0
|
8
|
mi@0
|
9 SIVM_SEARCH: class for search-SiVM
|
mi@0
|
10
|
mi@0
|
11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
|
mi@0
|
12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
|
mi@0
|
13 Conf. on Information and Knowledge Management. ACM. 2010.
|
mi@0
|
14 """
|
mi@0
|
15
|
mi@0
|
16
|
mi@0
|
17 import scipy.sparse
|
mi@0
|
18 import numpy as np
|
mi@0
|
19 from scipy import inf
|
mi@0
|
20 try:
|
mi@0
|
21 from scipy.misc.common import factorial
|
mi@0
|
22 except:
|
mi@0
|
23 from scipy.misc import factorial
|
mi@0
|
24
|
mi@0
|
25 from dist import *
|
mi@0
|
26 from vol import *
|
mi@0
|
27 from sivm import SIVM
|
mi@0
|
28
|
mi@0
|
29 __all__ = ["SIVM_SEARCH"]
|
mi@0
|
30
|
mi@0
|
31 class SIVM_SEARCH(SIVM):
|
mi@0
|
32 """
|
mi@0
|
33 SIVM_SEARCH(data, num_bases=4, dist_measure='l2')
|
mi@0
|
34
|
mi@0
|
35
|
mi@0
|
36 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
|
mi@0
|
37 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
|
mi@0
|
38 found by maximizing the volume of the resulting simplex (see [1]). A solution
|
mi@0
|
39 is found by employing a simple A-star like search strategy.
|
mi@0
|
40
|
mi@0
|
41 Parameters
|
mi@0
|
42 ----------
|
mi@0
|
43 data : array_like, shape (_data_dimension, _num_samples)
|
mi@0
|
44 the input data
|
mi@0
|
45 num_bases: int, optional
|
mi@0
|
46 Number of bases to compute (column rank of W and row rank of H).
|
mi@0
|
47 4 (default)
|
mi@0
|
48 dist_measure : one of 'l2' ,'cosine', 'l1', 'kl'
|
mi@0
|
49 Standard is 'l2' which maximizes the volume of the simplex. In contrast,
|
mi@0
|
50 'cosine' maximizes the volume of a cone (see [1] for details).
|
mi@0
|
51 init : string (default: 'fastmap')
|
mi@0
|
52 'fastmap' or 'origin'. Sets the method used for finding the very first
|
mi@0
|
53 basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
|
mi@0
|
54 the two vectors that have the largest pairwise distance.
|
mi@0
|
55 Attributes
|
mi@0
|
56 ----------
|
mi@0
|
57 W : "data_dimension x num_bases" matrix of basis vectors
|
mi@0
|
58 H : "num bases x num_samples" matrix of coefficients
|
mi@0
|
59 ferr : frobenius norm (after calling .factorize())
|
mi@0
|
60
|
mi@0
|
61 Example
|
mi@0
|
62 -------
|
mi@0
|
63 Applying SIVM to some rather stupid data set:
|
mi@0
|
64
|
mi@0
|
65 >>> import numpy as np
|
mi@0
|
66 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
67 >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2)
|
mi@0
|
68 >>> sivm_mdl.factorize()
|
mi@0
|
69
|
mi@0
|
70 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
|
mi@0
|
71 To compute coefficients for an existing set of basis vectors simply copy W
|
mi@0
|
72 to sivm_mdl.W, and set compute_w to False:
|
mi@0
|
73
|
mi@0
|
74 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
|
mi@0
|
75 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
|
mi@0
|
76 >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2)
|
mi@0
|
77 >>> sivm_mdl.W = W
|
mi@0
|
78 >>> sivm_mdl.factorize(compute_w=False)
|
mi@0
|
79
|
mi@0
|
80 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
|
mi@0
|
81 """
|
mi@0
|
82
|
mi@0
|
83 def update_w(self):
|
mi@0
|
84 def h(sel,D,k):
|
mi@0
|
85 # compute the volume for a selection of sel columns
|
mi@0
|
86 # and a k-1 simplex (-> k columns have to be selected)
|
mi@0
|
87 mv = np.max(D)
|
mi@0
|
88
|
mi@0
|
89 # fill the remaining distance by the maximal overall found distance
|
mi@0
|
90 d = np.zeros((k,k)) + mv
|
mi@0
|
91 for i in range(k):
|
mi@0
|
92 d[i,i] = 0.0
|
mi@0
|
93
|
mi@0
|
94 for idx_i,i in enumerate(sel):
|
mi@0
|
95 for idx_j,j in enumerate(sel):
|
mi@0
|
96 d[idx_i,idx_j] = D[i,j]
|
mi@0
|
97
|
mi@0
|
98 return d
|
mi@0
|
99
|
mi@0
|
100 # compute distance matrix -> required for the volume
|
mi@0
|
101 D = pdist(self.data, self.data)
|
mi@0
|
102 Openset = {}
|
mi@0
|
103
|
mi@0
|
104 for i in range(self._num_samples):
|
mi@0
|
105 # compute volume for temp selection
|
mi@0
|
106 d = h([i],D,self._num_bases)
|
mi@0
|
107 Vtmp = cmdet(d)
|
mi@0
|
108 Openset[tuple([i])] = Vtmp
|
mi@0
|
109
|
mi@0
|
110 Closedset = {}
|
mi@0
|
111 finished = False
|
mi@0
|
112 self._v = []
|
mi@0
|
113 self.init_sivm()
|
mi@0
|
114 next_sel = np.array([self.select[0]])
|
mi@0
|
115 iter = 0
|
mi@0
|
116
|
mi@0
|
117 while not finished:
|
mi@0
|
118 # add the current selection to closedset
|
mi@0
|
119 Closedset[(tuple(next_sel))] = []
|
mi@0
|
120
|
mi@0
|
121 for i in range(D.shape[0]):
|
mi@0
|
122 # create a temp selection
|
mi@0
|
123 tmp_sel = np.array(next_sel).flatten()
|
mi@0
|
124 tmp_sel = np.concatenate((tmp_sel, [i]),axis=0)
|
mi@0
|
125 tmp_sel = np.unique(tmp_sel)
|
mi@0
|
126 tmp_sel = list(tmp_sel)
|
mi@0
|
127 hkey = tuple(tmp_sel)
|
mi@0
|
128
|
mi@0
|
129 if len(tmp_sel) > len(next_sel) and (
|
mi@0
|
130 not Closedset.has_key(hkey)) and (
|
mi@0
|
131 not Openset.has_key(hkey)):
|
mi@0
|
132
|
mi@0
|
133 # compute volume for temp selection
|
mi@0
|
134 d = h(tmp_sel, D, self._num_bases)
|
mi@0
|
135 Vtmp = cmdet(d)
|
mi@0
|
136
|
mi@0
|
137 # add to openset
|
mi@0
|
138 Openset[hkey] = Vtmp
|
mi@0
|
139
|
mi@0
|
140 # get next best tuple
|
mi@0
|
141 vmax = 0.0
|
mi@0
|
142 for (k,v) in Openset.iteritems():
|
mi@0
|
143 if v > vmax:
|
mi@0
|
144 next_sel = k
|
mi@0
|
145 vmax = v
|
mi@0
|
146
|
mi@0
|
147 self._logger.info('Iter:' + str(iter))
|
mi@0
|
148 self._logger.info('Current selection:' + str(next_sel))
|
mi@0
|
149 self._logger.info('Current volume:' + str(vmax))
|
mi@0
|
150 self._v.append(vmax)
|
mi@0
|
151
|
mi@0
|
152 # remove next_sel from openset
|
mi@0
|
153 Openset.pop(next_sel)
|
mi@0
|
154
|
mi@0
|
155 if len(list(next_sel)) == self._num_bases:
|
mi@0
|
156 finished = True
|
mi@0
|
157 iter += 1
|
mi@0
|
158
|
mi@0
|
159 # update some values ...
|
mi@0
|
160 self.select = list(next_sel)
|
mi@0
|
161 self.W = self.data[:, self.select]
|
mi@0
|
162
|
mi@0
|
163 if __name__ == "__main__":
|
mi@0
|
164 import doctest
|
mi@0
|
165 doctest.testmod()
|