mi@0
|
1 #!/usr/bin/python
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Compact Matrix Decomposition [1]
|
mi@0
|
8
|
mi@0
|
9 CMD(CUR): Class for Compact Matrix Decomposition
|
mi@0
|
10
|
mi@0
|
11 [1] Sun, J., Xie, Y., Zhang, H. and Faloutsos, C. (2007), Less is More: Compact Matrix Decomposition for Large
|
mi@0
|
12 Sparse Graphs, in Proc. SIAM Int. Conf. on Data Mining.
|
mi@0
|
13 """
|
mi@0
|
14
|
mi@0
|
15
|
mi@0
|
16 import numpy as np
|
mi@0
|
17 from cur import CUR
|
mi@0
|
18
|
mi@0
|
19 __all__ = ["CMD"]
|
mi@0
|
20
|
mi@0
|
21 class CMD(CUR):
|
mi@0
|
22 """
|
mi@0
|
23 CMD(data, rrank=0, crank=0)
|
mi@0
|
24
|
mi@0
|
25
|
mi@0
|
26 Compact Matrix Decomposition. Factorize a data matrix into three matrices s.t.
|
mi@0
|
27 F = | data - USV| is minimal. CMD randomly selects rows and columns from
|
mi@0
|
28 data for building U and V, respectively.
|
mi@0
|
29
|
mi@0
|
30 Parameters
|
mi@0
|
31 ----------
|
mi@0
|
32 data : array_like [data_dimension x num_samples]
|
mi@0
|
33 the input data
|
mi@0
|
34 rrank: int, optional
|
mi@0
|
35 Number of rows to sample from data. Double entries are eliminiated s.t.
|
mi@0
|
36 the resulting rank might be lower.
|
mi@0
|
37 4 (default)
|
mi@0
|
38 crank: int, optional
|
mi@0
|
39 Number of columns to sample from data. Double entries are eliminiated s.t.
|
mi@0
|
40 the resulting rank might be lower.
|
mi@0
|
41 4 (default)
|
mi@0
|
42
|
mi@0
|
43 Attributes
|
mi@0
|
44 ----------
|
mi@0
|
45 U,S,V : submatrices s.t. data = USV
|
mi@0
|
46
|
mi@0
|
47 Example
|
mi@0
|
48 -------
|
mi@0
|
49 >>> import numpy as np
|
mi@0
|
50 >>> from cmd import CMD
|
mi@0
|
51 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
52 >>> cmd_mdl = CMD(data, show_progress=False, rrank=1, crank=2)
|
mi@0
|
53 >>> cmd_mdl.factorize()
|
mi@0
|
54 """
|
mi@0
|
55
|
mi@0
|
56 def _cmdinit(self):
|
mi@0
|
57 nrids = np.unique(self._rid)
|
mi@0
|
58 ncids = np.unique(self._cid)
|
mi@0
|
59
|
mi@0
|
60 self._rcnt = np.zeros(len(nrids))
|
mi@0
|
61 self._ccnt = np.zeros(len(ncids))
|
mi@0
|
62
|
mi@0
|
63 for i,idx in enumerate(nrids):
|
mi@0
|
64 self._rcnt[i] = len(np.where(self._rid == idx)[0])
|
mi@0
|
65
|
mi@0
|
66 for i,idx in enumerate(ncids):
|
mi@0
|
67 self._ccnt[i] = len(np.where(self._cid == idx)[0])
|
mi@0
|
68
|
mi@0
|
69 self._rid = np.int32(list(nrids))
|
mi@0
|
70 self._cid = np.int32(list(ncids))
|
mi@0
|
71
|
mi@0
|
72 def factorize(self):
|
mi@0
|
73 """ Factorize s.t. CUR = data
|
mi@0
|
74
|
mi@0
|
75 Updated Values
|
mi@0
|
76 --------------
|
mi@0
|
77 .C : updated values for C.
|
mi@0
|
78 .U : updated values for U.
|
mi@0
|
79 .R : updated values for R.
|
mi@0
|
80 """
|
mi@0
|
81
|
mi@0
|
82 [prow, pcol] = self.sample_probability()
|
mi@0
|
83
|
mi@0
|
84 self._rid = self.sample(self._rrank, prow)
|
mi@0
|
85 self._cid = self.sample(self._crank, pcol)
|
mi@0
|
86
|
mi@0
|
87 self._cmdinit()
|
mi@0
|
88
|
mi@0
|
89 self.computeUCR()
|
mi@0
|
90
|
mi@0
|
91
|
mi@0
|
92 if __name__ == "__main__":
|
mi@0
|
93 import doctest
|
mi@0
|
94 doctest.testmod()
|