annotate pymf/sivm_search.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python2.6
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Simplex Volume Maximization [1]
mi@0 8
mi@0 9 SIVM_SEARCH: class for search-SiVM
mi@0 10
mi@0 11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
mi@0 12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
mi@0 13 Conf. on Information and Knowledge Management. ACM. 2010.
mi@0 14 """
mi@0 15
mi@0 16
mi@0 17 import scipy.sparse
mi@0 18 import numpy as np
mi@0 19 from scipy import inf
mi@0 20 try:
mi@0 21 from scipy.misc.common import factorial
mi@0 22 except:
mi@0 23 from scipy.misc import factorial
mi@0 24
mi@0 25 from dist import *
mi@0 26 from vol import *
mi@0 27 from sivm import SIVM
mi@0 28
mi@0 29 __all__ = ["SIVM_SEARCH"]
mi@0 30
mi@0 31 class SIVM_SEARCH(SIVM):
mi@0 32 """
mi@0 33 SIVM_SEARCH(data, num_bases=4, dist_measure='l2')
mi@0 34
mi@0 35
mi@0 36 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
mi@0 37 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
mi@0 38 found by maximizing the volume of the resulting simplex (see [1]). A solution
mi@0 39 is found by employing a simple A-star like search strategy.
mi@0 40
mi@0 41 Parameters
mi@0 42 ----------
mi@0 43 data : array_like, shape (_data_dimension, _num_samples)
mi@0 44 the input data
mi@0 45 num_bases: int, optional
mi@0 46 Number of bases to compute (column rank of W and row rank of H).
mi@0 47 4 (default)
mi@0 48 dist_measure : one of 'l2' ,'cosine', 'l1', 'kl'
mi@0 49 Standard is 'l2' which maximizes the volume of the simplex. In contrast,
mi@0 50 'cosine' maximizes the volume of a cone (see [1] for details).
mi@0 51 init : string (default: 'fastmap')
mi@0 52 'fastmap' or 'origin'. Sets the method used for finding the very first
mi@0 53 basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of
mi@0 54 the two vectors that have the largest pairwise distance.
mi@0 55 Attributes
mi@0 56 ----------
mi@0 57 W : "data_dimension x num_bases" matrix of basis vectors
mi@0 58 H : "num bases x num_samples" matrix of coefficients
mi@0 59 ferr : frobenius norm (after calling .factorize())
mi@0 60
mi@0 61 Example
mi@0 62 -------
mi@0 63 Applying SIVM to some rather stupid data set:
mi@0 64
mi@0 65 >>> import numpy as np
mi@0 66 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 67 >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2)
mi@0 68 >>> sivm_mdl.factorize()
mi@0 69
mi@0 70 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
mi@0 71 To compute coefficients for an existing set of basis vectors simply copy W
mi@0 72 to sivm_mdl.W, and set compute_w to False:
mi@0 73
mi@0 74 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0 75 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0 76 >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2)
mi@0 77 >>> sivm_mdl.W = W
mi@0 78 >>> sivm_mdl.factorize(compute_w=False)
mi@0 79
mi@0 80 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
mi@0 81 """
mi@0 82
mi@0 83 def update_w(self):
mi@0 84 def h(sel,D,k):
mi@0 85 # compute the volume for a selection of sel columns
mi@0 86 # and a k-1 simplex (-> k columns have to be selected)
mi@0 87 mv = np.max(D)
mi@0 88
mi@0 89 # fill the remaining distance by the maximal overall found distance
mi@0 90 d = np.zeros((k,k)) + mv
mi@0 91 for i in range(k):
mi@0 92 d[i,i] = 0.0
mi@0 93
mi@0 94 for idx_i,i in enumerate(sel):
mi@0 95 for idx_j,j in enumerate(sel):
mi@0 96 d[idx_i,idx_j] = D[i,j]
mi@0 97
mi@0 98 return d
mi@0 99
mi@0 100 # compute distance matrix -> required for the volume
mi@0 101 D = pdist(self.data, self.data)
mi@0 102 Openset = {}
mi@0 103
mi@0 104 for i in range(self._num_samples):
mi@0 105 # compute volume for temp selection
mi@0 106 d = h([i],D,self._num_bases)
mi@0 107 Vtmp = cmdet(d)
mi@0 108 Openset[tuple([i])] = Vtmp
mi@0 109
mi@0 110 Closedset = {}
mi@0 111 finished = False
mi@0 112 self._v = []
mi@0 113 self.init_sivm()
mi@0 114 next_sel = np.array([self.select[0]])
mi@0 115 iter = 0
mi@0 116
mi@0 117 while not finished:
mi@0 118 # add the current selection to closedset
mi@0 119 Closedset[(tuple(next_sel))] = []
mi@0 120
mi@0 121 for i in range(D.shape[0]):
mi@0 122 # create a temp selection
mi@0 123 tmp_sel = np.array(next_sel).flatten()
mi@0 124 tmp_sel = np.concatenate((tmp_sel, [i]),axis=0)
mi@0 125 tmp_sel = np.unique(tmp_sel)
mi@0 126 tmp_sel = list(tmp_sel)
mi@0 127 hkey = tuple(tmp_sel)
mi@0 128
mi@0 129 if len(tmp_sel) > len(next_sel) and (
mi@0 130 not Closedset.has_key(hkey)) and (
mi@0 131 not Openset.has_key(hkey)):
mi@0 132
mi@0 133 # compute volume for temp selection
mi@0 134 d = h(tmp_sel, D, self._num_bases)
mi@0 135 Vtmp = cmdet(d)
mi@0 136
mi@0 137 # add to openset
mi@0 138 Openset[hkey] = Vtmp
mi@0 139
mi@0 140 # get next best tuple
mi@0 141 vmax = 0.0
mi@0 142 for (k,v) in Openset.iteritems():
mi@0 143 if v > vmax:
mi@0 144 next_sel = k
mi@0 145 vmax = v
mi@0 146
mi@0 147 self._logger.info('Iter:' + str(iter))
mi@0 148 self._logger.info('Current selection:' + str(next_sel))
mi@0 149 self._logger.info('Current volume:' + str(vmax))
mi@0 150 self._v.append(vmax)
mi@0 151
mi@0 152 # remove next_sel from openset
mi@0 153 Openset.pop(next_sel)
mi@0 154
mi@0 155 if len(list(next_sel)) == self._num_bases:
mi@0 156 finished = True
mi@0 157 iter += 1
mi@0 158
mi@0 159 # update some values ...
mi@0 160 self.select = list(next_sel)
mi@0 161 self.W = self.data[:, self.select]
mi@0 162
mi@0 163 if __name__ == "__main__":
mi@0 164 import doctest
mi@0 165 doctest.testmod()