annotate pymf/cmd.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Compact Matrix Decomposition [1]
mi@0 8
mi@0 9 CMD(CUR): Class for Compact Matrix Decomposition
mi@0 10
mi@0 11 [1] Sun, J., Xie, Y., Zhang, H. and Faloutsos, C. (2007), Less is More: Compact Matrix Decomposition for Large
mi@0 12 Sparse Graphs, in Proc. SIAM Int. Conf. on Data Mining.
mi@0 13 """
mi@0 14
mi@0 15
mi@0 16 import numpy as np
mi@0 17 from cur import CUR
mi@0 18
mi@0 19 __all__ = ["CMD"]
mi@0 20
mi@0 21 class CMD(CUR):
mi@0 22 """
mi@0 23 CMD(data, rrank=0, crank=0)
mi@0 24
mi@0 25
mi@0 26 Compact Matrix Decomposition. Factorize a data matrix into three matrices s.t.
mi@0 27 F = | data - USV| is minimal. CMD randomly selects rows and columns from
mi@0 28 data for building U and V, respectively.
mi@0 29
mi@0 30 Parameters
mi@0 31 ----------
mi@0 32 data : array_like [data_dimension x num_samples]
mi@0 33 the input data
mi@0 34 rrank: int, optional
mi@0 35 Number of rows to sample from data. Double entries are eliminiated s.t.
mi@0 36 the resulting rank might be lower.
mi@0 37 4 (default)
mi@0 38 crank: int, optional
mi@0 39 Number of columns to sample from data. Double entries are eliminiated s.t.
mi@0 40 the resulting rank might be lower.
mi@0 41 4 (default)
mi@0 42
mi@0 43 Attributes
mi@0 44 ----------
mi@0 45 U,S,V : submatrices s.t. data = USV
mi@0 46
mi@0 47 Example
mi@0 48 -------
mi@0 49 >>> import numpy as np
mi@0 50 >>> from cmd import CMD
mi@0 51 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 52 >>> cmd_mdl = CMD(data, show_progress=False, rrank=1, crank=2)
mi@0 53 >>> cmd_mdl.factorize()
mi@0 54 """
mi@0 55
mi@0 56 def _cmdinit(self):
mi@0 57 nrids = np.unique(self._rid)
mi@0 58 ncids = np.unique(self._cid)
mi@0 59
mi@0 60 self._rcnt = np.zeros(len(nrids))
mi@0 61 self._ccnt = np.zeros(len(ncids))
mi@0 62
mi@0 63 for i,idx in enumerate(nrids):
mi@0 64 self._rcnt[i] = len(np.where(self._rid == idx)[0])
mi@0 65
mi@0 66 for i,idx in enumerate(ncids):
mi@0 67 self._ccnt[i] = len(np.where(self._cid == idx)[0])
mi@0 68
mi@0 69 self._rid = np.int32(list(nrids))
mi@0 70 self._cid = np.int32(list(ncids))
mi@0 71
mi@0 72 def factorize(self):
mi@0 73 """ Factorize s.t. CUR = data
mi@0 74
mi@0 75 Updated Values
mi@0 76 --------------
mi@0 77 .C : updated values for C.
mi@0 78 .U : updated values for U.
mi@0 79 .R : updated values for R.
mi@0 80 """
mi@0 81
mi@0 82 [prow, pcol] = self.sample_probability()
mi@0 83
mi@0 84 self._rid = self.sample(self._rrank, prow)
mi@0 85 self._cid = self.sample(self._crank, pcol)
mi@0 86
mi@0 87 self._cmdinit()
mi@0 88
mi@0 89 self.computeUCR()
mi@0 90
mi@0 91
mi@0 92 if __name__ == "__main__":
mi@0 93 import doctest
mi@0 94 doctest.testmod()