annotate pymf/sivm_gsat.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python2.6
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Simplex Volume Maximization [1]
mi@0 8
mi@0 9 SIVM_GSAT: class for gsat-SiVM
mi@0 10
mi@0 11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
mi@0 12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
mi@0 13 Conf. on Information and Knowledge Management. ACM. 2010.
mi@0 14 """
mi@0 15
mi@0 16
mi@0 17 import logging
mi@0 18 import numpy as np
mi@0 19 from dist import *
mi@0 20 from vol import cmdet
mi@0 21 from sivm import SIVM
mi@0 22
mi@0 23 __all__ = ["SIVM_GSAT"]
mi@0 24
mi@0 25 class SIVM_GSAT(SIVM):
mi@0 26 """
mi@0 27 SIVM(data, num_bases=4, dist_measure='l2')
mi@0 28
mi@0 29
mi@0 30 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
mi@0 31 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
mi@0 32 found by maximizing the volume of the resulting simplex (see [1]). Can be
mi@0 33 applied to data streams using the .online_update_w(vec) function which decides
mi@0 34 on adding data sample "vec" to the already selected basis vectors.
mi@0 35
mi@0 36 Parameters
mi@0 37 ----------
mi@0 38 data : array_like, shape (_data_dimension, _num_samples)
mi@0 39 the input data
mi@0 40 num_bases: int, optional
mi@0 41 Number of bases to compute (column rank of W and row rank of H).
mi@0 42 4 (default)
mi@0 43 dist_measure : one of 'l2' ,'cosine', 'l1', 'kl'
mi@0 44 Standard is 'l2' which maximizes the volume of the simplex. In contrast,
mi@0 45 'cosine' maximizes the volume of a cone (see [1] for details).
mi@0 46 init : string (default: 'fastmap')
mi@0 47 'fastmap' or 'origin'. Sets the method used for finding the very first
mi@0 48 basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
mi@0 49 the two vectors that have the largest pairwise distance.
mi@0 50 Attributes
mi@0 51 ----------
mi@0 52 W : "data_dimension x num_bases" matrix of basis vectors
mi@0 53 H : "num bases x num_samples" matrix of coefficients
mi@0 54 ferr : frobenius norm (after calling .factorize())
mi@0 55
mi@0 56 Example
mi@0 57 -------
mi@0 58 Applying SIVM to some rather stupid data set:
mi@0 59
mi@0 60 >>> import numpy as np
mi@0 61 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 62 >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
mi@0 63 >>> sivm_mdl.factorize()
mi@0 64
mi@0 65 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
mi@0 66 To compute coefficients for an existing set of basis vectors simply copy W
mi@0 67 to sivm_mdl.W, and set compute_w to False:
mi@0 68
mi@0 69 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0 70 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0 71 >>> sivm_mdl = SIVM_GSAT(data, num_bases=2)
mi@0 72 >>> sivm_mdl.W = W
mi@0 73 >>> sivm_mdl.factorize(compute_w=False)
mi@0 74
mi@0 75 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
mi@0 76 """
mi@0 77
mi@0 78 def init_w(self):
mi@0 79 self.select = range(self._num_bases)
mi@0 80 self.W = self.data[:, self.select]
mi@0 81
mi@0 82 def online_update_w(self, vec):
mi@0 83 # update D if it does not exist
mi@0 84 k = self._num_bases
mi@0 85 if not hasattr(self, 'D'):
mi@0 86 self.D = np.zeros((k + 1, k + 1))
mi@0 87 self.D[:k, :k] = pdist(self.W, self.W)
mi@0 88 self.V = cmdet(self.D[:k, :k])
mi@0 89
mi@0 90 tmp_d = self._distfunc(self.W, vec.reshape((-1,1)))
mi@0 91 self.D[k, :-1] = tmp_d
mi@0 92 self.D[:-1, k] = tmp_d
mi@0 93
mi@0 94 v = np.zeros((self._num_bases + 1))
mi@0 95
mi@0 96 for i in range(self._num_bases):
mi@0 97 # compute volume for each combination...
mi@0 98 s = np.setdiff1d(range(self._num_bases + 1), [i])
mi@0 99 v[i] = cmdet((self.D[s,:])[:,s])
mi@0 100
mi@0 101 # select index that maximizes the volume
mi@0 102 v[-1] = self.V
mi@0 103 s = np.argmax(v)
mi@0 104
mi@0 105 if s < self._num_bases:
mi@0 106 self.W[:,s] = vec
mi@0 107 self.D[:self._num_bases, :self._num_bases] = pdist(self.W, self.W)
mi@0 108
mi@0 109 if not hasattr(self, '_v'):
mi@0 110 self._v = [self.V]
mi@0 111 self.V = v[s]
mi@0 112 self._v.append(v[s])
mi@0 113
mi@0 114 self._logger.info('Volume increased:' + str(self.V))
mi@0 115 return True, s
mi@0 116
mi@0 117 return False,-1
mi@0 118
mi@0 119 def update_w(self):
mi@0 120 n = np.int(np.floor(np.random.random() * self._num_samples))
mi@0 121 if n not in self.select:
mi@0 122 updated, s = self.online_update_w(self.data[:,n])
mi@0 123 if updated:
mi@0 124 self.select[s] = n
mi@0 125 self._logger.info('Current selection:' + str(self.select))
mi@0 126
mi@0 127
mi@0 128 def factorize(self, show_progress=False, compute_w=True, compute_h=True,
mi@0 129 compute_err=True, niter=1):
mi@0 130 """ Factorize s.t. WH = data
mi@0 131
mi@0 132 Parameters
mi@0 133 ----------
mi@0 134 show_progress : bool
mi@0 135 print some extra information to stdout.
mi@0 136 niter : int
mi@0 137 number of iterations.
mi@0 138 compute_h : bool
mi@0 139 iteratively update values for H.
mi@0 140 compute_w : bool
mi@0 141 iteratively update values for W.
mi@0 142 compute_err : bool
mi@0 143 compute Frobenius norm |data-WH| after each update and store
mi@0 144 it to .ferr[k].
mi@0 145
mi@0 146 Updated Values
mi@0 147 --------------
mi@0 148 .W : updated values for W.
mi@0 149 .H : updated values for H.
mi@0 150 .ferr : Frobenius norm |data-WH|.
mi@0 151 """
mi@0 152 if show_progress:
mi@0 153 self._logger.setLevel(logging.INFO)
mi@0 154 else:
mi@0 155 self._logger.setLevel(logging.ERROR)
mi@0 156
mi@0 157 # create W and H if they don't already exist
mi@0 158 # -> any custom initialization to W,H should be done before
mi@0 159 if not hasattr(self,'W'):
mi@0 160 self.init_w()
mi@0 161
mi@0 162 if not hasattr(self,'H'):
mi@0 163 self.init_h()
mi@0 164
mi@0 165 if compute_err:
mi@0 166 self.ferr = np.zeros(niter)
mi@0 167
mi@0 168 for i in xrange(niter):
mi@0 169 if compute_w:
mi@0 170 self.update_w()
mi@0 171
mi@0 172 if compute_h:
mi@0 173 self.update_h()
mi@0 174
mi@0 175 if compute_err:
mi@0 176 self.ferr[i] = self.frobenius_norm()
mi@0 177 self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) +
mi@0 178 ' FN:' + str(self.ferr[i]))
mi@0 179 else:
mi@0 180 self._logger.info('Iteration ' + str(i+1) + '/' + str(niter))
mi@0 181
mi@0 182
mi@0 183 if __name__ == "__main__":
mi@0 184 import doctest
mi@0 185 doctest.testmod()