#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
#$Id$
"""
PyMF CUR Decomposition [1]

CURSL(SVD) : Class for CUR Decomposition (uses statistical leverage based sampling)

[1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms for Matrices III:
Computing a Compressed Approximate Matrix Decomposition', SIAM J. Computing 36(1), 184-206.
"""


import numpy as np
import scipy.sparse

from svd import pinv, SVD
from cmd import CMD

__all__ = ["CURSL"]

class CURSL(CMD):
    """
    CURSL(data, k=-1, rrank=0, crank=0)

    CUR/CMD Decomposition. Factorize a data matrix into three matrices s.t.
    F = | data - USV | is minimal. CURSL randomly selects rows and columns from
    data for building U and V, respectively. The importance sampling is based
    on statistical leverage scores computed from the top-k singular vectors (k is
    currently set to 4/5*rrank and 4/5*crank).

    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank: int, optional
        Number of rows to sample from data.
        0 (default)
    crank: int, optional
        Number of columns to sample from data.
        0 (default)

    Attributes
    ----------
    U,S,V : submatrices s.t. data = USV (or _C _U _R)

    Example
    -------
    >>> import numpy as np
    >>> from cursl import CURSL
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cur_mdl = CURSL(data, rrank=1, crank=2)
    >>> cur_mdl.factorize()
    """

    def __init__(self, data, k=-1, rrank=0, crank=0):
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

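    # sample_probability() supplies the importance-sampling distributions used
    # when rows and columns are drawn from the data: probabilities proportional
    # to the squared entries of the singular vectors of a truncated SVD
    # ("Rule 1" statistical leverage scores), normalized to sum to one.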
    def sample_probability(self):
        def comp_prob(d, k):
            # compute statistical leverage scores from a rank-c SVD,
            # where c is roughly 4/5 of the requested rank k
            c = int(np.round(k - k/5.0))

            svd_mdl = SVD(d, k=c)
            svd_mdl.factorize()

            if scipy.sparse.issparse(self.data):
                A = svd_mdl.V.multiply(svd_mdl.V)
                ## Rule 1
                pcol = np.array(A.sum(axis=0)/k)
            else:
                A = svd_mdl.V[:k,:]**2.0
                ## Rule 1
                pcol = A.sum(axis=0)/k

            #c = k * np.log(k/ (self._eps**2.0))
            #pcol = c * pcol.reshape((-1,1))
            pcol /= np.sum(pcol)
            return pcol

        pcol = comp_prob(self.data, self._rrank)
        prow = comp_prob(self.data.transpose(), self._crank)

        return (prow.reshape(-1,1), pcol.reshape(-1,1))
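

if __name__ == "__main__":
    # Minimal usage sketch based on the docstring example above. It assumes the
    # sibling pymf modules (svd, cmd) are importable and that factorize()
    # populates U, S, V as documented in the class docstring.
    data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    cur_mdl = CURSL(data, rrank=1, crank=2)
    cur_mdl.factorize()
    print("U:", cur_mdl.U.shape, "S:", cur_mdl.S.shape, "V:", cur_mdl.V.shape)

    # Plain-NumPy sketch of the "Rule 1" column leverage scores described in the
    # class docstring, using numpy.linalg.svd instead of the pymf SVD class:
    # square the top-k right singular vectors, average over the k vectors,
    # then normalize to obtain a sampling distribution over the columns.
    k = 2
    _, _, Vt = np.linalg.svd(data, full_matrices=False)
    scores = (Vt[:k, :] ** 2).sum(axis=0) / k
    scores /= scores.sum()
    print("column leverage scores:", scores)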