annotate pymf/sivm_sgreedy.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python2.6
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Simplex Volume Maximization [1]
mi@0 8
mi@0 9 SIVM_SGREEDY: class for greedy-search SiVM
mi@0 10
mi@0 11 [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
mi@0 12 Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
mi@0 13 Conf. on Information and Knowledge Management. ACM. 2010.
mi@0 14 """
mi@0 15
mi@0 16
mi@0 17 import numpy as np
mi@0 18 import time
mi@0 19
mi@0 20 from dist import *
mi@0 21 from vol import *
mi@0 22 from sivm_search import SIVM_SEARCH
mi@0 23
mi@0 24 __all__ = ["SIVM_SGREEDY"]
mi@0 25
mi@0 26 class SIVM_SGREEDY(SIVM_SEARCH):
mi@0 27 """
mi@0 28 SIVM(data, num_bases=4, niter=100, show_progress=True, compW=True)
mi@0 29
mi@0 30
mi@0 31 Simplex Volume Maximization. Factorize a data matrix into two matrices s.t.
mi@0 32 F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively
mi@0 33 found by maximizing the volume of the resulting simplex (see [1]). A solution
mi@0 34 is found by employing a simple greedy max-vol strategy.
mi@0 35
mi@0 36 Parameters
mi@0 37 ----------
mi@0 38 data : array_like
mi@0 39 the input data
mi@0 40 num_bases: int, optional
mi@0 41 Number of bases to compute (column rank of W and row rank of H).
mi@0 42 4 (default)
mi@0 43 niter: int, optional
mi@0 44 Number of iterations of the alternating optimization.
mi@0 45 100 (default)
mi@0 46 show_progress: bool, optional
mi@0 47 Print some extra information
mi@0 48 False (default)
mi@0 49 compW: bool, optional
mi@0 50 Compute W (True) or only H (False). Useful for using basis vectors
mi@0 51 from another convexity constrained matrix factorization function
mi@0 52 (e.g. svmnmf) (if set to "True" niter can be set to "1")
mi@0 53 compH: bool, optional
mi@0 54 Compute H (True) or only H (False). Useful for using precomputed
mi@0 55 basis vectors.
mi@0 56 dist_measure: string, optional
mi@0 57 The distance measure for finding the next best candidate that
mi@0 58 maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2']
mi@0 59 'l2' (default)
mi@0 60 optimize_lower_bound: bool, optional
mi@0 61 Use the alternative selection criterion that optimizes the lower
mi@0 62 bound (see [1])
mi@0 63 False (default)
mi@0 64
mi@0 65 Attributes
mi@0 66 ----------
mi@0 67 W : "data_dimension x num_bases" matrix of basis vectors
mi@0 68 H : "num bases x num_samples" matrix of coefficients
mi@0 69
mi@0 70 ferr : frobenius norm (after applying .factoriz())
mi@0 71
mi@0 72 Example
mi@0 73 -------
mi@0 74 Applying SIVM to some rather stupid data set:
mi@0 75
mi@0 76 >>> import numpy as np
mi@0 77 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 78 >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=10)
mi@0 79 >>> sivm_mdl.initialization()
mi@0 80 >>> sivm_mdl.factorize()
mi@0 81
mi@0 82 The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H.
mi@0 83 To compute coefficients for an existing set of basis vectors simply copy W
mi@0 84 to sivm_mdl.W, and set compW to False:
mi@0 85
mi@0 86 >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
mi@0 87 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0 88 >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=1, compW=False)
mi@0 89 >>> sivm_mdl.initialization()
mi@0 90 >>> sivm_mdl.W = W
mi@0 91 >>> sivm_mdl.factorize()
mi@0 92
mi@0 93 The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
mi@0 94 """
mi@0 95
mi@0 96 def update_w(self):
mi@0 97 # compute distance matrix -> requiresd for the volume
mi@0 98 self.init_sivm()
mi@0 99 next_sel = list([self.select[0]])
mi@0 100 self.select = []
mi@0 101
mi@0 102 self._v = []
mi@0 103 self._t = []
mi@0 104 stime = time.time()
mi@0 105
mi@0 106 for iter in range(self._num_bases-1):
mi@0 107 # add new selections to openset
mi@0 108 next_sel = list(np.sort(next_sel))
mi@0 109 D = pdist(self.data[:, next_sel], self.data[:, next_sel])
mi@0 110 V = np.zeros(self.data.shape[1])
mi@0 111 d = np.zeros((D.shape[0]+1,D.shape[1]+1))
mi@0 112 d[:D.shape[0], :D.shape[1]] = D[:,:]
mi@0 113
mi@0 114 for i in range(self.data.shape[1]):
mi@0 115 # create a temp selection
mi@0 116 dtmp = l2_distance(self.data[:,next_sel], self.data[:,i:i+1])
mi@0 117 d[:-1,-1] = dtmp
mi@0 118 d[-1,:-1] = dtmp
mi@0 119 # compute volume for temp selection
mi@0 120 V[i] = cmdet(d)
mi@0 121
mi@0 122 next_index = np.argmax(V)
mi@0 123 next_sel.append(next_index)
mi@0 124 self._v.append(np.max(V))
mi@0 125
mi@0 126 self._logger.info('Iter:' + str(iter))
mi@0 127 self._logger.info('Current selection:' + str(next_sel))
mi@0 128 self._logger.info('Current volume:' + str(self._v[-1]))
mi@0 129 self._t.append(time.time() - stime)
mi@0 130
mi@0 131 # update some values ...
mi@0 132 self.select = list(next_sel)
mi@0 133 self.W = self.data[:, self.select]
mi@0 134
mi@0 135
mi@0 136
mi@0 137 if __name__ == "__main__":
mi@0 138 import doctest
mi@0 139 doctest.testmod()