mi@0: #!/usr/bin/python2.6 mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Simplex Volume Maximization [1] mi@0: mi@0: SIVM_SGREEDY: class for greedy-search SiVM mi@0: mi@0: [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume mi@0: Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. mi@0: Conf. on Information and Knowledge Management. ACM. 2010. mi@0: """ mi@0: mi@0: mi@0: import numpy as np mi@0: import time mi@0: mi@0: from dist import * mi@0: from vol import * mi@0: from sivm_search import SIVM_SEARCH mi@0: mi@0: __all__ = ["SIVM_SGREEDY"] mi@0: mi@0: class SIVM_SGREEDY(SIVM_SEARCH): mi@0: """ mi@0: SIVM(data, num_bases=4, niter=100, show_progress=True, compW=True) mi@0: mi@0: mi@0: Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. mi@0: F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively mi@0: found by maximizing the volume of the resulting simplex (see [1]). A solution mi@0: is found by employing a simple greedy max-vol strategy. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like mi@0: the input data mi@0: num_bases: int, optional mi@0: Number of bases to compute (column rank of W and row rank of H). mi@0: 4 (default) mi@0: niter: int, optional mi@0: Number of iterations of the alternating optimization. mi@0: 100 (default) mi@0: show_progress: bool, optional mi@0: Print some extra information mi@0: False (default) mi@0: compW: bool, optional mi@0: Compute W (True) or only H (False). Useful for using basis vectors mi@0: from another convexity constrained matrix factorization function mi@0: (e.g. svmnmf) (if set to "True" niter can be set to "1") mi@0: compH: bool, optional mi@0: Compute H (True) or only H (False). Useful for using precomputed mi@0: basis vectors. mi@0: dist_measure: string, optional mi@0: The distance measure for finding the next best candidate that mi@0: maximizes the simplex volume ['l2','l1','cosine','sparse_graph_l2'] mi@0: 'l2' (default) mi@0: optimize_lower_bound: bool, optional mi@0: Use the alternative selection criterion that optimizes the lower mi@0: bound (see [1]) mi@0: False (default) mi@0: mi@0: Attributes mi@0: ---------- mi@0: W : "data_dimension x num_bases" matrix of basis vectors mi@0: H : "num bases x num_samples" matrix of coefficients mi@0: mi@0: ferr : frobenius norm (after applying .factoriz()) mi@0: mi@0: Example mi@0: ------- mi@0: Applying SIVM to some rather stupid data set: mi@0: mi@0: >>> import numpy as np mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=10) mi@0: >>> sivm_mdl.initialization() mi@0: >>> sivm_mdl.factorize() mi@0: mi@0: The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. mi@0: To compute coefficients for an existing set of basis vectors simply copy W mi@0: to sivm_mdl.W, and set compW to False: mi@0: mi@0: >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) mi@0: >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) mi@0: >>> sivm_mdl = SIVM_SGREEDY(data, num_bases=2, niter=1, compW=False) mi@0: >>> sivm_mdl.initialization() mi@0: >>> sivm_mdl.W = W mi@0: >>> sivm_mdl.factorize() mi@0: mi@0: The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. mi@0: """ mi@0: mi@0: def update_w(self): mi@0: # compute distance matrix -> requiresd for the volume mi@0: self.init_sivm() mi@0: next_sel = list([self.select[0]]) mi@0: self.select = [] mi@0: mi@0: self._v = [] mi@0: self._t = [] mi@0: stime = time.time() mi@0: mi@0: for iter in range(self._num_bases-1): mi@0: # add new selections to openset mi@0: next_sel = list(np.sort(next_sel)) mi@0: D = pdist(self.data[:, next_sel], self.data[:, next_sel]) mi@0: V = np.zeros(self.data.shape[1]) mi@0: d = np.zeros((D.shape[0]+1,D.shape[1]+1)) mi@0: d[:D.shape[0], :D.shape[1]] = D[:,:] mi@0: mi@0: for i in range(self.data.shape[1]): mi@0: # create a temp selection mi@0: dtmp = l2_distance(self.data[:,next_sel], self.data[:,i:i+1]) mi@0: d[:-1,-1] = dtmp mi@0: d[-1,:-1] = dtmp mi@0: # compute volume for temp selection mi@0: V[i] = cmdet(d) mi@0: mi@0: next_index = np.argmax(V) mi@0: next_sel.append(next_index) mi@0: self._v.append(np.max(V)) mi@0: mi@0: self._logger.info('Iter:' + str(iter)) mi@0: self._logger.info('Current selection:' + str(next_sel)) mi@0: self._logger.info('Current volume:' + str(self._v[-1])) mi@0: self._t.append(time.time() - stime) mi@0: mi@0: # update some values ... mi@0: self.select = list(next_sel) mi@0: self.W = self.data[:, self.select] mi@0: mi@0: mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()