annotate pymf/cur.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF CUR Decomposition [1]
mi@0 8
mi@0 9 CUR(SVD) : Class for CUR Decomposition
mi@0 10
mi@0 11 [1] Drineas, P., Kannan, R. and Mahoney, M. (2006), 'Fast Monte Carlo Algorithms for Matrices III: Computing
mi@0 12 a Compressed Approximate Matrix Decomposition', SIAM J. Computing 36(1), 184-206.
mi@0 13 """
mi@0 14
mi@0 15
mi@0 16 import numpy as np
mi@0 17 import scipy.sparse
mi@0 18
mi@0 19 from svd import pinv, SVD
mi@0 20
mi@0 21
mi@0 22 __all__ = ["CUR"]
mi@0 23
class CUR(SVD):
    """
    CUR(data, k=-1, rrank=0, crank=0)

    CUR Decomposition. Factorize a data matrix into three matrices s.t.
    F = | data - USV| is minimal. CUR randomly selects rows and columns from
    data for building U and V, respectively.

    Parameters
    ----------
    data : array_like [data_dimension x num_samples]
        the input data
    rrank : int, optional
        Number of rows to sample from data.
        4 (default)
    crank : int, optional
        Number of columns to sample from data.
        4 (default)
    show_progress : bool, optional
        Print some extra information
        False (default)

    Attributes
    ----------
    U,S,V : submatrices s.t. data = USV

    Example
    -------
    >>> import numpy as np
    >>> from cur import CUR
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cur_mdl = CUR(data, show_progress=False, rrank=1, crank=2)
    >>> cur_mdl.factorize()
    """

    def __init__(self, data, k=-1, rrank=0, crank=0):
        # BUG FIX: the original forwarded crank=rrank, silently discarding
        # the caller's crank and sampling rrank columns instead.
        SVD.__init__(self, data, k=k, rrank=rrank, crank=crank)

        # select all data samples for computing the error:
        # note that this might take very long, adjust self._rset and self._cset
        # for faster computations.
        self._rset = range(self._rows)
        self._cset = range(self._cols)

    def sample(self, s, probs):
        """Draw s indices (with replacement) according to the probability
        vector probs via inverse-CDF sampling; return them sorted ascending.

        Parameters
        ----------
        s : int
            number of indices to draw.
        probs : array_like
            (column) vector of selection probabilities; flattened and
            accumulated into a CDF, so it is assumed to sum to ~1.

        Returns
        -------
        numpy array of s sorted int32 indices in [0, len(probs)-1].
        """
        prob_rows = np.cumsum(probs.flatten())
        temp_ind = np.zeros(s, np.int32)

        for i in range(s):
            v = np.random.rand()
            # first CDF entry that reaches the random draw
            hits = np.where(prob_rows >= v)[0]
            if len(hits) > 0:
                temp_ind[i] = hits[0]
            else:
                # v exceeded the last cumsum entry (floating-point round-off
                # in the CDF). BUG FIX: the original bare `except:` stored
                # len(prob_rows), an out-of-range index that would crash any
                # subsequent data[..., idx] lookup; clamp to the last index.
                temp_ind[i] = len(prob_rows) - 1

        return np.sort(temp_ind)

    def sample_probability(self):
        """Return (prow, pcol): row/column sampling probabilities
        proportional to the squared Euclidean norms of the rows/columns of
        self.data, each reshaped to a column vector summing to 1.
        """
        if scipy.sparse.issparse(self.data):
            # element-wise square, keeping sparsity
            dsquare = self.data.multiply(self.data)
        else:
            dsquare = self.data[:, :] ** 2

        prow = np.array(dsquare.sum(axis=1), np.float64)
        pcol = np.array(dsquare.sum(axis=0), np.float64)

        # normalize to probability distributions
        prow /= prow.sum()
        pcol /= pcol.sum()

        return (prow.reshape(-1, 1), pcol.reshape(-1, 1))

    def computeUCR(self):
        """Build C (sampled columns), R (sampled rows) and U s.t.
        data ~= C * U * R, then expose them under the SVD-style names
        U/S/V (U=C, S=U, V=R).
        """
        # the next lines do NOT work with h5py if CUR is used -> double indices in self.cid or self.rid
        # can occur and are not supported by h5py. When using h5py data, always use CMD which ignores
        # reoccurring row/column selections.

        # BUG FIX: the scaling exponent is written 0.5 instead of (1/2),
        # which is integer division (== 0) under Python 2 and silently
        # disabled the sqrt count-scaling. For plain CUR the counts are all
        # ones, so this is a no-op, but subclasses that accumulate duplicate
        # selections rely on the sqrt weighting.
        if scipy.sparse.issparse(self.data):
            self._C = self.data[:, self._cid] * scipy.sparse.csc_matrix(np.diag(self._ccnt ** 0.5))
            self._R = scipy.sparse.csc_matrix(np.diag(self._rcnt ** 0.5)) * self.data[self._rid, :]

            self._U = pinv(self._C, self._k) * self.data[:, :] * pinv(self._R, self._k)
        else:
            self._C = np.dot(self.data[:, self._cid].reshape((self._rows, len(self._cid))),
                             np.diag(self._ccnt ** 0.5))
            self._R = np.dot(np.diag(self._rcnt ** 0.5),
                             self.data[self._rid, :].reshape((len(self._rid), self._cols)))

            self._U = np.dot(np.dot(pinv(self._C, self._k), self.data[:, :]),
                             pinv(self._R, self._k))

        # set some standard (with respect to SVD) variable names
        self.U = self._C
        self.S = self._U
        self.V = self._R

    def factorize(self):
        """ Factorize s.t. CUR = data

        Updated Values
        --------------
        .C : updated values for C.
        .U : updated values for U.
        .R : updated values for R.
        """
        [prow, pcol] = self.sample_probability()
        self._rid = self.sample(self._rrank, prow)
        self._cid = self.sample(self._crank, pcol)

        # plain CUR keeps every sampled row/column exactly once
        self._rcnt = np.ones(len(self._rid))
        self._ccnt = np.ones(len(self._cid))

        self.computeUCR()
mi@0 139
mi@0 140
if __name__ == "__main__":
    # Script entry point: execute the examples embedded in the docstrings.
    import doctest

    doctest.testmod()