mi@0: #!/usr/bin/python2.6 mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Simplex Volume Maximization [1] mi@0: mi@0: SIVM_SEARCH: class for search-SiVM mi@0: mi@0: [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume mi@0: Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. mi@0: Conf. on Information and Knowledge Management. ACM. 2010. mi@0: """ mi@0: mi@0: mi@0: import scipy.sparse mi@0: import numpy as np mi@0: from scipy import inf mi@0: try: mi@0: from scipy.misc.common import factorial mi@0: except: mi@0: from scipy.misc import factorial mi@0: mi@0: from dist import * mi@0: from vol import * mi@0: from sivm import SIVM mi@0: mi@0: __all__ = ["SIVM_SEARCH"] mi@0: mi@0: class SIVM_SEARCH(SIVM): mi@0: """ mi@0: SIVM_SEARCH(data, num_bases=4, dist_measure='l2') mi@0: mi@0: mi@0: Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. mi@0: F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively mi@0: found by maximizing the volume of the resulting simplex (see [1]). A solution mi@0: is found by employing a simple A-star like search strategy. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like, shape (_data_dimension, _num_samples) mi@0: the input data mi@0: num_bases: int, optional mi@0: Number of bases to compute (column rank of W and row rank of H). mi@0: 4 (default) mi@0: dist_measure : one of 'l2' ,'cosine', 'l1', 'kl' mi@0: Standard is 'l2' which maximizes the volume of the simplex. In contrast, mi@0: 'cosine' maximizes the volume of a cone (see [1] for details). mi@0: init : string (default: 'fastmap') mi@0: 'fastmap' or 'origin'. Sets the method used for finding the very first mi@0: basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of mi@0: the two vectors that have the largest pairwise distance. mi@0: Attributes mi@0: ---------- mi@0: W : "data_dimension x num_bases" matrix of basis vectors mi@0: H : "num bases x num_samples" matrix of coefficients mi@0: ferr : frobenius norm (after calling .factorize()) mi@0: mi@0: Example mi@0: ------- mi@0: Applying SIVM to some rather stupid data set: mi@0: mi@0: >>> import numpy as np mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2) mi@0: >>> sivm_mdl.factorize() mi@0: mi@0: The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. mi@0: To compute coefficients for an existing set of basis vectors simply copy W mi@0: to sivm_mdl.W, and set compute_w to False: mi@0: mi@0: >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) mi@0: >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) mi@0: >>> sivm_mdl = SIVM_SEARCH(data, num_bases=2) mi@0: >>> sivm_mdl.W = W mi@0: >>> sivm_mdl.factorize(compute_w=False) mi@0: mi@0: The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. mi@0: """ mi@0: mi@0: def update_w(self): mi@0: def h(sel,D,k): mi@0: # compute the volume for a selection of sel columns mi@0: # and a k-1 simplex (-> k columns have to be selected) mi@0: mv = np.max(D) mi@0: mi@0: # fill the remaining distance by the maximal overall found distance mi@0: d = np.zeros((k,k)) + mv mi@0: for i in range(k): mi@0: d[i,i] = 0.0 mi@0: mi@0: for idx_i,i in enumerate(sel): mi@0: for idx_j,j in enumerate(sel): mi@0: d[idx_i,idx_j] = D[i,j] mi@0: mi@0: return d mi@0: mi@0: # compute distance matrix -> required for the volume mi@0: D = pdist(self.data, self.data) mi@0: Openset = {} mi@0: mi@0: for i in range(self._num_samples): mi@0: # compute volume for temp selection mi@0: d = h([i],D,self._num_bases) mi@0: Vtmp = cmdet(d) mi@0: Openset[tuple([i])] = Vtmp mi@0: mi@0: Closedset = {} mi@0: finished = False mi@0: self._v = [] mi@0: self.init_sivm() mi@0: next_sel = np.array([self.select[0]]) mi@0: iter = 0 mi@0: mi@0: while not finished: mi@0: # add the current selection to closedset mi@0: Closedset[(tuple(next_sel))] = [] mi@0: mi@0: for i in range(D.shape[0]): mi@0: # create a temp selection mi@0: tmp_sel = np.array(next_sel).flatten() mi@0: tmp_sel = np.concatenate((tmp_sel, [i]),axis=0) mi@0: tmp_sel = np.unique(tmp_sel) mi@0: tmp_sel = list(tmp_sel) mi@0: hkey = tuple(tmp_sel) mi@0: mi@0: if len(tmp_sel) > len(next_sel) and ( mi@0: not Closedset.has_key(hkey)) and ( mi@0: not Openset.has_key(hkey)): mi@0: mi@0: # compute volume for temp selection mi@0: d = h(tmp_sel, D, self._num_bases) mi@0: Vtmp = cmdet(d) mi@0: mi@0: # add to openset mi@0: Openset[hkey] = Vtmp mi@0: mi@0: # get next best tuple mi@0: vmax = 0.0 mi@0: for (k,v) in Openset.iteritems(): mi@0: if v > vmax: mi@0: next_sel = k mi@0: vmax = v mi@0: mi@0: self._logger.info('Iter:' + str(iter)) mi@0: self._logger.info('Current selection:' + str(next_sel)) mi@0: self._logger.info('Current volume:' + str(vmax)) mi@0: self._v.append(vmax) mi@0: mi@0: # remove next_sel from openset mi@0: Openset.pop(next_sel) mi@0: mi@0: if len(list(next_sel)) == self._num_bases: mi@0: finished = True mi@0: iter += 1 mi@0: mi@0: # update some values ... mi@0: self.select = list(next_sel) mi@0: self.W = self.data[:, self.select] mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()