mi@0: #!/usr/bin/python mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Compact Matrix Decomposition [1] mi@0: mi@0: CMD(CUR): Class for Compact Matrix Decomposition mi@0: mi@0: [1] Sun, J., Xie, Y., Zhang, H. and Faloutsos, C. (2007), Less is More: Compact Matrix Decomposition for Large mi@0: Sparse Graphs, in Proc. SIAM Int. Conf. on Data Mining. mi@0: """ mi@0: mi@0: mi@0: import numpy as np mi@0: from cur import CUR mi@0: mi@0: __all__ = ["CMD"] mi@0: mi@0: class CMD(CUR): mi@0: """ mi@0: CMD(data, rrank=0, crank=0) mi@0: mi@0: mi@0: Compact Matrix Decomposition. Factorize a data matrix into three matrices s.t. mi@0: F = | data - USV| is minimal. CMD randomly selects rows and columns from mi@0: data for building U and V, respectively. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like [data_dimension x num_samples] mi@0: the input data mi@0: rrank: int, optional mi@0: Number of rows to sample from data. Double entries are eliminiated s.t. mi@0: the resulting rank might be lower. mi@0: 4 (default) mi@0: crank: int, optional mi@0: Number of columns to sample from data. Double entries are eliminiated s.t. mi@0: the resulting rank might be lower. mi@0: 4 (default) mi@0: mi@0: Attributes mi@0: ---------- mi@0: U,S,V : submatrices s.t. data = USV mi@0: mi@0: Example mi@0: ------- mi@0: >>> import numpy as np mi@0: >>> from cmd import CMD mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> cmd_mdl = CMD(data, show_progress=False, rrank=1, crank=2) mi@0: >>> cmd_mdl.factorize() mi@0: """ mi@0: mi@0: def _cmdinit(self): mi@0: nrids = np.unique(self._rid) mi@0: ncids = np.unique(self._cid) mi@0: mi@0: self._rcnt = np.zeros(len(nrids)) mi@0: self._ccnt = np.zeros(len(ncids)) mi@0: mi@0: for i,idx in enumerate(nrids): mi@0: self._rcnt[i] = len(np.where(self._rid == idx)[0]) mi@0: mi@0: for i,idx in enumerate(ncids): mi@0: self._ccnt[i] = len(np.where(self._cid == idx)[0]) mi@0: mi@0: self._rid = np.int32(list(nrids)) mi@0: self._cid = np.int32(list(ncids)) mi@0: mi@0: def factorize(self): mi@0: """ Factorize s.t. CUR = data mi@0: mi@0: Updated Values mi@0: -------------- mi@0: .C : updated values for C. mi@0: .U : updated values for U. mi@0: .R : updated values for R. mi@0: """ mi@0: mi@0: [prow, pcol] = self.sample_probability() mi@0: mi@0: self._rid = self.sample(self._rrank, prow) mi@0: self._cid = self.sample(self._crank, pcol) mi@0: mi@0: self._cmdinit() mi@0: mi@0: self.computeUCR() mi@0: mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()