annotate pymf/sivm.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Simplex Volume Maximization [1]
mi@0 8
mi@0 9 SIVM: class for SiVM
mi@0 10
mi@0 11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
mi@0 12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
mi@0 13 Conf. on Information and Knowledge Management. ACM. 2010.
mi@0 14 """
mi@0 15
mi@0 16
mi@0 17 import scipy.sparse
mi@0 18 import numpy as np
mi@0 19
mi@0 20 from dist import *
mi@0 21 from aa import AA
mi@0 22
mi@0 23 __all__ = ["SIVM"]
mi@0 24
mi@0 25 class SIVM(AA):
mi@0 26 """
mi@0 27 SIVM(data, num_bases=4, dist_measure='l2')
mi@0 28
mi@0 29
mi@0 30 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
mi@0 31 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
mi@0 32 found by maximizing the volume of the resulting simplex (see [1]).
mi@0 33
mi@0 34 Parameters
mi@0 35 ----------
mi@0 36 data : array_like, shape (_data_dimension, _num_samples)
mi@0 37 the input data
mi@0 38 num_bases: int, optional
mi@0 39 Number of bases to compute (column rank of W and row rank of H).
mi@0 40 4 (default)
mi@0 41 dist_measure : one of 'l2' ,'cosine', 'l1', 'kl'
mi@0 42 Standard is 'l2' which maximizes the volume of the simplex. In contrast,
mi@0 43 'cosine' maximizes the volume of a cone (see [1] for details).
mi@0 44 init : string (default: 'fastmap')
mi@0 45 'fastmap' or 'origin'. Sets the method used for finding the very first
mi@0 46 basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
mi@0 47 the two vectors that have the largest pairwise distance.
mi@0 48 Attributes
mi@0 49 ----------
mi@0 50 W : "data_dimension x num_bases" matrix of basis vectors
mi@0 51 H : "num bases x num_samples" matrix of coefficients
mi@0 52 ferr : frobenius norm (after calling .factorize())
mi@0 53
mi@0 54 Example
mi@0 55 -------
mi@0 56 Applying SIVM to some rather stupid data set:
mi@0 57
mi@0 58 >>> import numpy as np
mi@0 59 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 60 >>> sivm_mdl = SIVM(data, num_bases=2)
mi@0 61 >>> sivm_mdl.factorize()
mi@0 62
mi@0 63 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
mi@0 64 To compute coefficients for an existing set of basis vectors simply copy W
mi@0 65 to sivm_mdl.W, and set compute_w to False:
mi@0 66
mi@0 67 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0 68 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0 69 >>> sivm_mdl = SIVM(data, num_bases=2)
mi@0 70 >>> sivm_mdl.W = W
mi@0 71 >>> sivm_mdl.factorize(compute_w=False)
mi@0 72
mi@0 73 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
mi@0 74 """
mi@0 75
mi@0 76 # always overwrite the default number of iterations
mi@0 77 # -> any value other does not make sense.
mi@0 78 _NITER = 1
mi@0 79
mi@0 80 def __init__(self, data, num_bases=4, dist_measure='l2', init='fastmap'):
mi@0 81
mi@0 82 AA.__init__(self, data, num_bases=num_bases)
mi@0 83
mi@0 84 self._dist_measure = dist_measure
mi@0 85 self._init = init
mi@0 86
mi@0 87 # assign the correct distance function
mi@0 88 if self._dist_measure == 'l1':
mi@0 89 self._distfunc = l1_distance
mi@0 90
mi@0 91 elif self._dist_measure == 'l2':
mi@0 92 self._distfunc = l2_distance
mi@0 93
mi@0 94 elif self._dist_measure == 'cosine':
mi@0 95 self._distfunc = cosine_distance
mi@0 96
mi@0 97 elif self._dist_measure == 'abs_cosine':
mi@0 98 self._distfunc = abs_cosine_distance
mi@0 99
mi@0 100 elif self._dist_measure == 'weighted_abs_cosine':
mi@0 101 self._distfunc = weighted_abs_cosine_distance
mi@0 102
mi@0 103 elif self._dist_measure == 'kl':
mi@0 104 self._distfunc = kl_divergence
mi@0 105
mi@0 106
mi@0 107 def _distance(self, idx):
mi@0 108 """ compute distances of a specific data point to all other samples"""
mi@0 109
mi@0 110 if scipy.sparse.issparse(self.data):
mi@0 111 step = self.data.shape[1]
mi@0 112 else:
mi@0 113 step = 50000
mi@0 114
mi@0 115 d = np.zeros((self.data.shape[1]))
mi@0 116 if idx == -1:
mi@0 117 # set vec to origin if idx=-1
mi@0 118 vec = np.zeros((self.data.shape[0], 1))
mi@0 119 if scipy.sparse.issparse(self.data):
mi@0 120 vec = scipy.sparse.csc_matrix(vec)
mi@0 121 else:
mi@0 122 vec = self.data[:, idx:idx+1]
mi@0 123
mi@0 124 self._logger.info('compute distance to node ' + str(idx))
mi@0 125
mi@0 126 # slice data into smaller chunks
mi@0 127 for idx_start in range(0, self.data.shape[1], step):
mi@0 128 if idx_start + step > self.data.shape[1]:
mi@0 129 idx_end = self.data.shape[1]
mi@0 130 else:
mi@0 131 idx_end = idx_start + step
mi@0 132
mi@0 133 d[idx_start:idx_end] = self._distfunc(
mi@0 134 self.data[:,idx_start:idx_end], vec)
mi@0 135 self._logger.info('completed:' +
mi@0 136 str(idx_end/(self.data.shape[1]/100.0)) + "%")
mi@0 137 return d
mi@0 138
mi@0 139 def init_h(self):
mi@0 140 self.H = np.zeros((self._num_bases, self._num_samples))
mi@0 141
mi@0 142 def init_w(self):
mi@0 143 self.W = np.zeros((self._data_dimension, self._num_bases))
mi@0 144
mi@0 145 def init_sivm(self):
mi@0 146 self.select = []
mi@0 147 if self._init == 'fastmap':
mi@0 148 # Fastmap like initialization
mi@0 149 # set the starting index for fastmap initialization
mi@0 150 cur_p = 0
mi@0 151
mi@0 152 # after 3 iterations the first "real" index is found
mi@0 153 for i in range(3):
mi@0 154 d = self._distance(cur_p)
mi@0 155 cur_p = np.argmax(d)
mi@0 156
mi@0 157 # store maximal found distance -> later used for "a" (->update_w)
mi@0 158 self._maxd = np.max(d)
mi@0 159 self.select.append(cur_p)
mi@0 160
mi@0 161 elif self._init == 'origin':
mi@0 162 # set first vertex to origin
mi@0 163 cur_p = -1
mi@0 164 d = self._distance(cur_p)
mi@0 165 self._maxd = np.max(d)
mi@0 166 self.select.append(cur_p)
mi@0 167
mi@0 168 def update_w(self):
mi@0 169 """ compute new W """
mi@0 170 EPS = 10**-8
mi@0 171 self.init_sivm()
mi@0 172
mi@0 173 # initialize some of the recursively updated distance measures ....
mi@0 174 d_square = np.zeros((self.data.shape[1]))
mi@0 175 d_sum = np.zeros((self.data.shape[1]))
mi@0 176 d_i_times_d_j = np.zeros((self.data.shape[1]))
mi@0 177 distiter = np.zeros((self.data.shape[1]))
mi@0 178 a = np.log(self._maxd)
mi@0 179 a_inc = a.copy()
mi@0 180
mi@0 181 for l in range(1, self._num_bases):
mi@0 182 d = self._distance(self.select[l-1])
mi@0 183
mi@0 184 # take the log of d (sually more stable that d)
mi@0 185 d = np.log(d + EPS)
mi@0 186
mi@0 187 d_i_times_d_j += d * d_sum
mi@0 188 d_sum += d
mi@0 189 d_square += d**2
mi@0 190 distiter = d_i_times_d_j + a*d_sum - (l/2.0) * d_square
mi@0 191
mi@0 192 # detect the next best data point
mi@0 193 self.select.append(np.argmax(distiter))
mi@0 194
mi@0 195 self._logger.info('cur_nodes: ' + str(self.select))
mi@0 196
mi@0 197 # sort indices, otherwise h5py won't work
mi@0 198 self.W = self.data[:, np.sort(self.select)]
mi@0 199
mi@0 200 # "unsort" it again to keep the correct order
mi@0 201 self.W = self.W[:, np.argsort(np.argsort(self.select))]
mi@0 202
mi@0 203 def factorize(self, show_progress=False, compute_w=True, compute_h=True,
mi@0 204 compute_err=True, niter=1):
mi@0 205 """ Factorize s.t. WH = data
mi@0 206
mi@0 207 Parameters
mi@0 208 ----------
mi@0 209 show_progress : bool
mi@0 210 print some extra information to stdout.
mi@0 211 compute_h : bool
mi@0 212 iteratively update values for H.
mi@0 213 compute_w : bool
mi@0 214 iteratively update values for W.
mi@0 215 compute_err : bool
mi@0 216 compute Frobenius norm |data-WH| after each update and store
mi@0 217 it to .ferr[k].
mi@0 218
mi@0 219 Updated Values
mi@0 220 --------------
mi@0 221 .W : updated values for W.
mi@0 222 .H : updated values for H.
mi@0 223 .ferr : Frobenius norm |data-WH|.
mi@0 224 """
mi@0 225
mi@0 226 AA.factorize(self, niter=1, show_progress=show_progress,
mi@0 227 compute_w=compute_w, compute_h=compute_h,
mi@0 228 compute_err=compute_err)
mi@0 229
mi@0 230 if __name__ == "__main__":
mi@0 231 import doctest
mi@0 232 doctest.testmod()