mi@0
|
1 #!/usr/bin/python
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Simplex Volume Maximization for CUR [1]
|
mi@0
|
8
|
mi@0
|
9 SIVMCUR: class for SiVM-CUR
|
mi@0
|
10
|
mi@0
|
11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
|
mi@0
|
12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
|
mi@0
|
13 Conf. on Information and Knowledge Management. ACM. 2010.
|
mi@0
|
14 """
|
mi@0
|
15
|
mi@0
|
16
|
mi@0
|
17 import numpy as np
|
mi@0
|
18 import scipy
|
mi@0
|
19 from sivm import SIVM
|
mi@0
|
20 from cur import CUR
|
mi@0
|
21
|
mi@0
|
22 __all__ = ["SIVM_CUR"]
|
mi@0
|
23
|
mi@0
|
24 class SIVM_CUR(CUR):
|
mi@0
|
25 '''
|
mi@0
|
26 SIVM_CUR(data, num_bases=4, dist_measure='l2')
|
mi@0
|
27
|
mi@0
|
28 Simplex Volume based CUR Decomposition. Factorize a data matrix into three
|
mi@0
|
29 matrices s.t. F = | data - USV| is minimal. Unlike CUR, SIVMCUR selects the
|
mi@0
|
30 rows and columns using SIVM, i.e. it tries to maximize the volume of the
|
mi@0
|
31 enclosed simplex.
|
mi@0
|
32
|
mi@0
|
33 Parameters
|
mi@0
|
34 ----------
|
mi@0
|
35 data : array_like [data_dimension x num_samples]
|
mi@0
|
36 the input data
|
mi@0
|
37 rrank: int, optional
|
mi@0
|
38 Number of rows to sample from data.
|
mi@0
|
39 4 (default)crank
|
mi@0
|
40 crank: int, optional
|
mi@0
|
41 Number of columns to sample from data.
|
mi@0
|
42 4 (default)
|
mi@0
|
43 dist_measure: string, optional
|
mi@0
|
44 The distance measure for finding the next best candidate that
|
mi@0
|
45 maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2']
|
mi@0
|
46 'l2' (default)
|
mi@0
|
47
|
mi@0
|
48 Attributes
|
mi@0
|
49 ----------
|
mi@0
|
50 U,S,V : submatrices s.t. data = USV
|
mi@0
|
51
|
mi@0
|
52 Example
|
mi@0
|
53 -------
|
mi@0
|
54 >>> import numpy as np
|
mi@0
|
55 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
56 >>> sivmcur_mdl = SIVM_CUR(data, show_progress=False, rrank=1, crank=2)
|
mi@0
|
57 >>> sivmcur_mdl.factorize()
|
mi@0
|
58 '''
|
mi@0
|
59
|
mi@0
|
60 def __init__(self, data, k=-1, rrank=0, crank=0, dist_measure='l2', init='origin'):
|
mi@0
|
61 CUR.__init__(self, data, k=k, rrank=rrank, crank=rrank)
|
mi@0
|
62 self._dist_measure = dist_measure
|
mi@0
|
63 self.init = init
|
mi@0
|
64
|
mi@0
|
65 def sample(self, A, c):
|
mi@0
|
66 # for optimizing the volume of the submatrix, set init to 'origin' (otherwise the volume of
|
mi@0
|
67 # the ordinary simplex would be optimized)
|
mi@0
|
68 sivm_mdl = SIVM(A, num_bases=c, dist_measure=self._dist_measure,
|
mi@0
|
69 init=self.init)
|
mi@0
|
70 sivm_mdl.factorize(show_progress=False, compute_w=True, niter=1,
|
mi@0
|
71 compute_h=False, compute_err=False)
|
mi@0
|
72
|
mi@0
|
73 return sivm_mdl.select
|
mi@0
|
74
|
mi@0
|
75
|
mi@0
|
76 def factorize(self):
|
mi@0
|
77 """ Factorize s.t. CUR = data
|
mi@0
|
78
|
mi@0
|
79 Updated Values
|
mi@0
|
80 --------------
|
mi@0
|
81 .C : updated values for C.
|
mi@0
|
82 .U : updated values for U.
|
mi@0
|
83 .R : updated values for R.
|
mi@0
|
84 """
|
mi@0
|
85 # sample row and column indices that maximize the volume of the submatrix
|
mi@0
|
86 self._rid = self.sample(self.data.transpose(), self._rrank)
|
mi@0
|
87 self._cid = self.sample(self.data, self._crank)
|
mi@0
|
88
|
mi@0
|
89 self._rcnt = np.ones(len(self._rid))
|
mi@0
|
90 self._ccnt = np.ones(len(self._cid))
|
mi@0
|
91
|
mi@0
|
92 self.computeUCR()
|
mi@0
|
93
|
mi@0
|
94
|
mi@0
|
95 if __name__ == "__main__":
|
mi@0
|
96 import doctest
|
mi@0
|
97 doctest.testmod()
|