mi@0
|
1 #!/usr/bin/python
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF CUR Decomposition [1]
|
mi@0
|
8
|
mi@0
|
9 CUR(SVD) : Class for CUR Decomposition
|
mi@0
|
10
|
mi@0
|
11 [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms III: Computing
|
mi@0
|
12 a Compressed Approximate Matrix Decomposition', SIAM J. Computing 36(1), 184-206.
|
mi@0
|
13 """
|
mi@0
|
14
|
mi@0
|
15
|
mi@0
|
16 import numpy as np
|
mi@0
|
17 import scipy.sparse
|
mi@0
|
18
|
mi@0
|
19 from svd import pinv, SVD
|
mi@0
|
20
|
mi@0
|
21
|
mi@0
|
22 __all__ = ["CUR"]
|
mi@0
|
23
|
mi@0
|
class CUR(SVD):
    """
    CUR(data, k=-1, rrank=0, crank=0)

    CUR Decomposition. Factorize a data matrix into three matrices s.t.
    F = | data - USV| is minimal. CUR randomly selects rows and columns from
    data for building U and V, respectively.

    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank: int, optional
        Number of rows to sample from data.
        4 (default)
    crank: int, optional
        Number of columns to sample from data.
        4 (default)
    show_progress: bool, optional
        Print some extra information
        False (default)

    Attributes
    ----------
    U,S,V : submatrices s.t. data = USV

    Example
    -------
    >>> import numpy as np
    >>> from cur import CUR
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cur_mdl = CUR(data, show_progress=False, rrank=1, crank=2)
    >>> cur_mdl.factorize()
    """

    def __init__(self, data, k=-1, rrank=0, crank=0):
        # FIX: the original passed crank=rrank here, silently discarding the
        # caller-supplied column rank.
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset
        # for faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)

    def sample(self, s, probs):
        """ Draw s indices (with replacement) according to the probability
        distribution probs.

        Parameters
        ----------
        s : int
            number of indices to draw.
        probs : array_like
            sampling probabilities; flattened, assumed to sum to 1.

        Returns
        -------
        Sorted numpy array of the s sampled indices.
        """
        prob_rows = np.cumsum(probs.flatten())
        temp_ind = np.zeros(s, np.int32)

        for i in range(s):
            v = np.random.rand()

            tempI = np.where(prob_rows >= v)[0]
            if len(tempI) > 0:
                temp_ind[i] = tempI[0]
            else:
                # v exceeded the last cumulative value (possible through
                # floating point round-off when probs sums to slightly < 1).
                # FIX: the original stored len(prob_rows) via a bare except,
                # an out-of-range index; clamp to the last valid index.
                temp_ind[i] = len(prob_rows) - 1

        return np.sort(temp_ind)

    def sample_probability(self):
        """ Compute row/column sampling probabilities proportional to the
        squared Euclidean norms of the rows/columns of self.data.

        Returns
        -------
        (prow, pcol) : tuple of column vectors (shape (rows,1) and (cols,1)),
            each normalized to sum to 1.
        """
        if scipy.sparse.issparse(self.data):
            dsquare = self.data.multiply(self.data)
        else:
            dsquare = self.data[:,:]**2

        prow = np.array(dsquare.sum(axis=1), np.float64)
        pcol = np.array(dsquare.sum(axis=0), np.float64)

        prow /= prow.sum()
        pcol /= pcol.sum()

        return (prow.reshape(-1,1), pcol.reshape(-1,1))

    def computeUCR(self):
        """ Build C (sampled columns), R (sampled rows) and the linking
        matrix U = pinv(C) * data * pinv(R) from the index sets self._cid /
        self._rid, scaled by self._ccnt / self._rcnt ** 0.5.

        NOTE: the next lines do NOT work with h5py if CUR is used -> double
        indices in self.cid or self.rid can occur and are not supported by
        h5py. When using h5py data, always use CMD which ignores reoccurring
        row/column selections.
        """
        # FIX: written as **0.5 instead of **(1/2) -- under Python 2 integer
        # division 1/2 == 0, making the original scaling a silent no-op.
        # (Here _ccnt/_rcnt are all ones, so results are unchanged; subclasses
        # overriding the counts get the intended square-root scaling.)
        if scipy.sparse.issparse(self.data):
            self._C = self.data[:, self._cid] * scipy.sparse.csc_matrix(np.diag(self._ccnt**0.5))
            self._R = scipy.sparse.csc_matrix(np.diag(self._rcnt**0.5)) * self.data[self._rid,:]

            self._U = pinv(self._C, self._k) * self.data[:,:] * pinv(self._R, self._k)

        else:
            self._C = np.dot(self.data[:, self._cid].reshape((self._rows, len(self._cid))),
                             np.diag(self._ccnt**0.5))
            self._R = np.dot(np.diag(self._rcnt**0.5),
                             self.data[self._rid,:].reshape((len(self._rid), self._cols)))

            self._U = np.dot(np.dot(pinv(self._C, self._k), self.data[:,:]),
                             pinv(self._R, self._k))

        # set some standard (with respect to SVD) variable names
        self.U = self._C
        self.S = self._U
        self.V = self._R

    def factorize(self):
        """ Factorize s.t. CUR = data

        Updated Values
        --------------
        .C : updated values for C.
        .U : updated values for U.
        .R : updated values for R.
        """
        [prow, pcol] = self.sample_probability()
        self._rid = self.sample(self._rrank, prow)
        self._cid = self.sample(self._crank, pcol)

        # each sampled index is used exactly once here (counts of 1);
        # subclasses may collapse duplicates and adjust these counts.
        self._rcnt = np.ones(len(self._rid))
        self._ccnt = np.ones(len(self._cid))

        self.computeUCR()
|
mi@0
|
139
|
mi@0
|
140
|
mi@0
|
if __name__ == "__main__":
    # Execute the doctest examples embedded in this module's docstrings.
    from doctest import testmod
    testmod()
|