mi@0: #!/usr/bin/python mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Simplex Volume Maximization for CUR [1] mi@0: mi@0: SIVMCUR: class for SiVM-CUR mi@0: mi@0: [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume mi@0: Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. mi@0: Conf. on Information and Knowledge Management. ACM. 2010. mi@0: """ mi@0: mi@0: mi@0: import numpy as np mi@0: import scipy mi@0: from sivm import SIVM mi@0: from cur import CUR mi@0: mi@0: __all__ = ["SIVM_CUR"] mi@0: mi@0: class SIVM_CUR(CUR): mi@0: ''' mi@0: SIVM_CUR(data, num_bases=4, dist_measure='l2') mi@0: mi@0: Simplex Volume based CUR Decomposition. Factorize a data matrix into three mi@0: matrices s.t. F = | data - USV| is minimal. Unlike CUR, SIVMCUR selects the mi@0: rows and columns using SIVM, i.e. it tries to maximize the volume of the mi@0: enclosed simplex. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like [data_dimension x num_samples] mi@0: the input data mi@0: rrank: int, optional mi@0: Number of rows to sample from data. mi@0: 4 (default)crank mi@0: crank: int, optional mi@0: Number of columns to sample from data. mi@0: 4 (default) mi@0: dist_measure: string, optional mi@0: The distance measure for finding the next best candidate that mi@0: maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2'] mi@0: 'l2' (default) mi@0: mi@0: Attributes mi@0: ---------- mi@0: U,S,V : submatrices s.t. data = USV mi@0: mi@0: Example mi@0: ------- mi@0: >>> import numpy as np mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> sivmcur_mdl = SIVM_CUR(data, show_progress=False, rrank=1, crank=2) mi@0: >>> sivmcur_mdl.factorize() mi@0: ''' mi@0: mi@0: def __init__(self, data, k=-1, rrank=0, crank=0, dist_measure='l2', init='origin'): mi@0: CUR.__init__(self, data, k=k, rrank=rrank, crank=rrank) mi@0: self._dist_measure = dist_measure mi@0: self.init = init mi@0: mi@0: def sample(self, A, c): mi@0: # for optimizing the volume of the submatrix, set init to 'origin' (otherwise the volume of mi@0: # the ordinary simplex would be optimized) mi@0: sivm_mdl = SIVM(A, num_bases=c, dist_measure=self._dist_measure, mi@0: init=self.init) mi@0: sivm_mdl.factorize(show_progress=False, compute_w=True, niter=1, mi@0: compute_h=False, compute_err=False) mi@0: mi@0: return sivm_mdl.select mi@0: mi@0: mi@0: def factorize(self): mi@0: """ Factorize s.t. CUR = data mi@0: mi@0: Updated Values mi@0: -------------- mi@0: .C : updated values for C. mi@0: .U : updated values for U. mi@0: .R : updated values for R. mi@0: """ mi@0: # sample row and column indices that maximize the volume of the submatrix mi@0: self._rid = self.sample(self.data.transpose(), self._rrank) mi@0: self._cid = self.sample(self.data, self._crank) mi@0: mi@0: self._rcnt = np.ones(len(self._rid)) mi@0: self._ccnt = np.ones(len(self._cid)) mi@0: mi@0: self.computeUCR() mi@0: mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()