annotate pymf/sivm_cur.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Simplex Volume Maximization for CUR [1]
mi@0 8
mi@0 9 SIVMCUR: class for SiVM-CUR
mi@0 10
mi@0 11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
mi@0 12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
mi@0 13 Conf. on Information and Knowledge Management. ACM. 2010.
mi@0 14 """
mi@0 15
mi@0 16
mi@0 17 import numpy as np
mi@0 18 import scipy
mi@0 19 from sivm import SIVM
mi@0 20 from cur import CUR
mi@0 21
mi@0 22 __all__ = ["SIVM_CUR"]
mi@0 23
mi@0 24 class SIVM_CUR(CUR):
mi@0 25 '''
mi@0 26 SIVM_CUR(data, num_bases=4, dist_measure='l2')
mi@0 27
mi@0 28 Simplex Volume based CUR Decomposition. Factorize a data matrix into three
mi@0 29 matrices s.t. F = | data - USV| is minimal. Unlike CUR, SIVMCUR selects the
mi@0 30 rows and columns using SIVM, i.e. it tries to maximize the volume of the
mi@0 31 enclosed simplex.
mi@0 32
mi@0 33 Parameters
mi@0 34 ----------
mi@0 35 data : array_like [data_dimension x num_samples]
mi@0 36 the input data
mi@0 37 rrank: int, optional
mi@0 38 Number of rows to sample from data.
mi@0 39 4 (default)crank
mi@0 40 crank: int, optional
mi@0 41 Number of columns to sample from data.
mi@0 42 4 (default)
mi@0 43 dist_measure: string, optional
mi@0 44 The distance measure for finding the next best candidate that
mi@0 45 maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2']
mi@0 46 'l2' (default)
mi@0 47
mi@0 48 Attributes
mi@0 49 ----------
mi@0 50 U,S,V : submatrices s.t. data = USV
mi@0 51
mi@0 52 Example
mi@0 53 -------
mi@0 54 >>> import numpy as np
mi@0 55 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 56 >>> sivmcur_mdl = SIVM_CUR(data, show_progress=False, rrank=1, crank=2)
mi@0 57 >>> sivmcur_mdl.factorize()
mi@0 58 '''
mi@0 59
mi@0 60 def __init__(self, data, k=-1, rrank=0, crank=0, dist_measure='l2', init='origin'):
mi@0 61 CUR.__init__(self, data, k=k, rrank=rrank, crank=rrank)
mi@0 62 self._dist_measure = dist_measure
mi@0 63 self.init = init
mi@0 64
mi@0 65 def sample(self, A, c):
mi@0 66 # for optimizing the volume of the submatrix, set init to 'origin' (otherwise the volume of
mi@0 67 # the ordinary simplex would be optimized)
mi@0 68 sivm_mdl = SIVM(A, num_bases=c, dist_measure=self._dist_measure,
mi@0 69 init=self.init)
mi@0 70 sivm_mdl.factorize(show_progress=False, compute_w=True, niter=1,
mi@0 71 compute_h=False, compute_err=False)
mi@0 72
mi@0 73 return sivm_mdl.select
mi@0 74
mi@0 75
mi@0 76 def factorize(self):
mi@0 77 """ Factorize s.t. CUR = data
mi@0 78
mi@0 79 Updated Values
mi@0 80 --------------
mi@0 81 .C : updated values for C.
mi@0 82 .U : updated values for U.
mi@0 83 .R : updated values for R.
mi@0 84 """
mi@0 85 # sample row and column indices that maximize the volume of the submatrix
mi@0 86 self._rid = self.sample(self.data.transpose(), self._rrank)
mi@0 87 self._cid = self.sample(self.data, self._crank)
mi@0 88
mi@0 89 self._rcnt = np.ones(len(self._rid))
mi@0 90 self._ccnt = np.ones(len(self._cid))
mi@0 91
mi@0 92 self.computeUCR()
mi@0 93
mi@0 94
mi@0 95 if __name__ == "__main__":
mi@0 96 import doctest
mi@0 97 doctest.testmod()