mi@0
|
1 #!/usr/bin/python2.6
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Simplex Volume Maximization [1]
|
mi@0
|
8
|
mi@0
|
9 SIVM_SGREEDY: class for greedy-search SiVM
|
mi@0
|
10
|
mi@0
|
11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
|
mi@0
|
12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
|
mi@0
|
13 Conf. on Information and Knowledge Management. ACM. 2010.
|
mi@0
|
14 """
|
mi@0
|
15
|
mi@0
|
16
|
mi@0
|
17 import numpy as np
|
mi@0
|
18 import time
|
mi@0
|
19
|
mi@0
|
20 from dist import *
|
mi@0
|
21 from vol import *
|
mi@0
|
22 from sivm_search import SIVM_SEARCH
|
mi@0
|
23
|
mi@0
|
24 __all__ = ["SIVM_SGREEDY"]
|
mi@0
|
25
|
mi@0
|
26 class SIVM_SGREEDY(SIVM_SEARCH):
|
mi@0
|
27 """
|
mi@0
|
28 SIVM(data, num_bases=4, niter=100, show_progress=True, compW=True)
|
mi@0
|
29
|
mi@0
|
30
|
mi@0
|
31 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
|
mi@0
|
32 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
|
mi@0
|
33 found by maximizing the volume of the resulting simplex (see [1]). A solution
|
mi@0
|
34 is found by employing a simple greedy max-vol strategy.
|
mi@0
|
35
|
mi@0
|
36 Parameters
|
mi@0
|
37 ----------
|
mi@0
|
38 data : array_like
|
mi@0
|
39 the input data
|
mi@0
|
40 num_bases: int, optional
|
mi@0
|
41 Number of bases to compute (column rank of W and row rank of H).
|
mi@0
|
42 4 (default)
|
mi@0
|
43 niter: int, optional
|
mi@0
|
44 Number of iterations of the alternating optimization.
|
mi@0
|
45 100 (default)
|
mi@0
|
46 show_progress: bool, optional
|
mi@0
|
47 Print some extra information
|
mi@0
|
48 False (default)
|
mi@0
|
49 compW: bool, optional
|
mi@0
|
50 Compute W (True) or only H (False). Useful for using basis vectors
|
mi@0
|
51 from another convexity constrained matrix factorization function
|
mi@0
|
52 (e.g. svmnmf) (if set to "True" niter can be set to "1")
|
mi@0
|
53 compH: bool, optional
|
mi@0
|
54 Compute H (True) or only H (False). Useful for using precomputed
|
mi@0
|
55 basis vectors.
|
mi@0
|
56 dist_measure: string, optional
|
mi@0
|
57 The distance measure for finding the next best candidate that
|
mi@0
|
58 maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2']
|
mi@0
|
59 'l2' (default)
|
mi@0
|
60 optimize_lower_bound: bool, optional
|
mi@0
|
61 Use the alternative selection criterion that optimizes the lower
|
mi@0
|
62 bound (see [1])
|
mi@0
|
63 False (default)
|
mi@0
|
64
|
mi@0
|
65 Attributes
|
mi@0
|
66 ----------
|
mi@0
|
67 W : "data_dimension x num_bases" matrix of basis vectors
|
mi@0
|
68 H : "num bases x num_samples" matrix of coefficients
|
mi@0
|
69
|
mi@0
|
70 ferr : frobenius norm (after applying .factoriz())
|
mi@0
|
71
|
mi@0
|
72 Example
|
mi@0
|
73 -------
|
mi@0
|
74 Applying SIVM to some rather stupid data set:
|
mi@0
|
75
|
mi@0
|
76 >>> import numpy as np
|
mi@0
|
77 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
78 >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=10)
|
mi@0
|
79 >>> sivm_mdl.initialization()
|
mi@0
|
80 >>> sivm_mdl.factorize()
|
mi@0
|
81
|
mi@0
|
82 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
|
mi@0
|
83 To compute coefficients for an existing set of basis vectors simply copy W
|
mi@0
|
84 to sivm_mdl.W, and set compW to False:
|
mi@0
|
85
|
mi@0
|
86 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
|
mi@0
|
87 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
|
mi@0
|
88 >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=1, compW=False)
|
mi@0
|
89 >>> sivm_mdl.initialization()
|
mi@0
|
90 >>> sivm_mdl.W = W
|
mi@0
|
91 >>> sivm_mdl.factorize()
|
mi@0
|
92
|
mi@0
|
93 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
|
mi@0
|
94 """
|
mi@0
|
95
|
mi@0
|
96 def update_w(self):
|
mi@0
|
97 # compute distance matrix -> requiresd for the volume
|
mi@0
|
98 self.init_sivm()
|
mi@0
|
99 next_sel = list([self.select[0]])
|
mi@0
|
100 self.select = []
|
mi@0
|
101
|
mi@0
|
102 self._v = []
|
mi@0
|
103 self._t = []
|
mi@0
|
104 stime = time.time()
|
mi@0
|
105
|
mi@0
|
106 for iter in range(self._num_bases-1):
|
mi@0
|
107 # add new selections to openset
|
mi@0
|
108 next_sel = list(np.sort(next_sel))
|
mi@0
|
109 D = pdist(self.data[:, next_sel], self.data[:, next_sel])
|
mi@0
|
110 V = np.zeros(self.data.shape[1])
|
mi@0
|
111 d = np.zeros((D.shape[0]+1,D.shape[1]+1))
|
mi@0
|
112 d[:D.shape[0], :D.shape[1]] = D[:,:]
|
mi@0
|
113
|
mi@0
|
114 for i in range(self.data.shape[1]):
|
mi@0
|
115 # create a temp selection
|
mi@0
|
116 dtmp = l2_distance(self.data[:,next_sel], self.data[:,i:i+1])
|
mi@0
|
117 d[:-1,-1] = dtmp
|
mi@0
|
118 d[-1,:-1] = dtmp
|
mi@0
|
119 # compute volume for temp selection
|
mi@0
|
120 V[i] = cmdet(d)
|
mi@0
|
121
|
mi@0
|
122 next_index = np.argmax(V)
|
mi@0
|
123 next_sel.append(next_index)
|
mi@0
|
124 self._v.append(np.max(V))
|
mi@0
|
125
|
mi@0
|
126 self._logger.info('Iter:' + str(iter))
|
mi@0
|
127 self._logger.info('Current selection:' + str(next_sel))
|
mi@0
|
128 self._logger.info('Current volume:' + str(self._v[-1]))
|
mi@0
|
129 self._t.append(time.time() - stime)
|
mi@0
|
130
|
mi@0
|
131 # update some values ...
|
mi@0
|
132 self.select = list(next_sel)
|
mi@0
|
133 self.W = self.data[:, self.select]
|
mi@0
|
134
|
mi@0
|
135
|
mi@0
|
136
|
mi@0
|
137 if __name__ == "__main__":
|
mi@0
|
138 import doctest
|
mi@0
|
139 doctest.testmod()
|