mi@0: #!/usr/bin/python2.6 mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Simplex Volume Maximization [1] mi@0: mi@0: SIVM_GSAT: class for gsat-SiVM mi@0: mi@0: [1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume mi@0: Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int. mi@0: Conf. on Information and Knowledge Management. ACM. 2010. mi@0: """ mi@0: mi@0: mi@0: import logging mi@0: import numpy as np mi@0: from dist import * mi@0: from vol import cmdet mi@0: from sivm import SIVM mi@0: mi@0: __all__ = ["SIVM_GSAT"] mi@0: mi@0: class SIVM_GSAT(SIVM): mi@0: """ mi@0: SIVM(data, num_bases=4, dist_measure='l2') mi@0: mi@0: mi@0: Simplex Volume Maximization. Factorize a data matrix into two matrices s.t. mi@0: F = | data - W*H | is minimal. H is restricted to convexity. W is iteratively mi@0: found by maximizing the volume of the resulting simplex (see [1]). Can be mi@0: applied to data streams using the .online_update_w(vec) function which decides mi@0: on adding data sample "vec" to the already selected basis vectors. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like, shape (_data_dimension, _num_samples) mi@0: the input data mi@0: num_bases: int, optional mi@0: Number of bases to compute (column rank of W and row rank of H). mi@0: 4 (default) mi@0: dist_measure : one of 'l2' ,'cosine', 'l1', 'kl' mi@0: Standard is 'l2' which maximizes the volume of the simplex. In contrast, mi@0: 'cosine' maximizes the volume of a cone (see [1] for details). mi@0: init : string (default: 'fastmap') mi@0: 'fastmap' or 'origin'. Sets the method used for finding the very first mi@0: basis vector. 'Origin' assumes the zero vector, 'Fastmap' picks one of mi@0: the two vectors that have the largest pairwise distance. mi@0: Attributes mi@0: ---------- mi@0: W : "data_dimension x num_bases" matrix of basis vectors mi@0: H : "num bases x num_samples" matrix of coefficients mi@0: ferr : frobenius norm (after calling .factorize()) mi@0: mi@0: Example mi@0: ------- mi@0: Applying SIVM to some rather stupid data set: mi@0: mi@0: >>> import numpy as np mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> sivm_mdl = SIVM_GSAT(data, num_bases=2) mi@0: >>> sivm_mdl.factorize() mi@0: mi@0: The basis vectors are now stored in sivm_mdl.W, the coefficients in sivm_mdl.H. mi@0: To compute coefficients for an existing set of basis vectors simply copy W mi@0: to sivm_mdl.W, and set compute_w to False: mi@0: mi@0: >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) mi@0: >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) mi@0: >>> sivm_mdl = SIVM_GSAT(data, num_bases=2) mi@0: >>> sivm_mdl.W = W mi@0: >>> sivm_mdl.factorize(compute_w=False) mi@0: mi@0: The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H. mi@0: """ mi@0: mi@0: def init_w(self): mi@0: self.select = range(self._num_bases) mi@0: self.W = self.data[:, self.select] mi@0: mi@0: def online_update_w(self, vec): mi@0: # update D if it does not exist mi@0: k = self._num_bases mi@0: if not hasattr(self, 'D'): mi@0: self.D = np.zeros((k + 1, k + 1)) mi@0: self.D[:k, :k] = pdist(self.W, self.W) mi@0: self.V = cmdet(self.D[:k, :k]) mi@0: mi@0: tmp_d = self._distfunc(self.W, vec.reshape((-1,1))) mi@0: self.D[k, :-1] = tmp_d mi@0: self.D[:-1, k] = tmp_d mi@0: mi@0: v = np.zeros((self._num_bases + 1)) mi@0: mi@0: for i in range(self._num_bases): mi@0: # compute volume for each combination... mi@0: s = np.setdiff1d(range(self._num_bases + 1), [i]) mi@0: v[i] = cmdet((self.D[s,:])[:,s]) mi@0: mi@0: # select index that maximizes the volume mi@0: v[-1] = self.V mi@0: s = np.argmax(v) mi@0: mi@0: if s < self._num_bases: mi@0: self.W[:,s] = vec mi@0: self.D[:self._num_bases, :self._num_bases] = pdist(self.W, self.W) mi@0: mi@0: if not hasattr(self, '_v'): mi@0: self._v = [self.V] mi@0: self.V = v[s] mi@0: self._v.append(v[s]) mi@0: mi@0: self._logger.info('Volume increased:' + str(self.V)) mi@0: return True, s mi@0: mi@0: return False,-1 mi@0: mi@0: def update_w(self): mi@0: n = np.int(np.floor(np.random.random() * self._num_samples)) mi@0: if n not in self.select: mi@0: updated, s = self.online_update_w(self.data[:,n]) mi@0: if updated: mi@0: self.select[s] = n mi@0: self._logger.info('Current selection:' + str(self.select)) mi@0: mi@0: mi@0: def factorize(self, show_progress=False, compute_w=True, compute_h=True, mi@0: compute_err=True, niter=1): mi@0: """ Factorize s.t. WH = data mi@0: mi@0: Parameters mi@0: ---------- mi@0: show_progress : bool mi@0: print some extra information to stdout. mi@0: niter : int mi@0: number of iterations. mi@0: compute_h : bool mi@0: iteratively update values for H. mi@0: compute_w : bool mi@0: iteratively update values for W. mi@0: compute_err : bool mi@0: compute Frobenius norm |data-WH| after each update and store mi@0: it to .ferr[k]. mi@0: mi@0: Updated Values mi@0: -------------- mi@0: .W : updated values for W. mi@0: .H : updated values for H. mi@0: .ferr : Frobenius norm |data-WH|. mi@0: """ mi@0: if show_progress: mi@0: self._logger.setLevel(logging.INFO) mi@0: else: mi@0: self._logger.setLevel(logging.ERROR) mi@0: mi@0: # create W and H if they don't already exist mi@0: # -> any custom initialization to W,H should be done before mi@0: if not hasattr(self,'W'): mi@0: self.init_w() mi@0: mi@0: if not hasattr(self,'H'): mi@0: self.init_h() mi@0: mi@0: if compute_err: mi@0: self.ferr = np.zeros(niter) mi@0: mi@0: for i in xrange(niter): mi@0: if compute_w: mi@0: self.update_w() mi@0: mi@0: if compute_h: mi@0: self.update_h() mi@0: mi@0: if compute_err: mi@0: self.ferr[i] = self.frobenius_norm() mi@0: self._logger.info('Iteration ' + str(i+1) + '/' + str(niter) + mi@0: ' FN:' + str(self.ferr[i])) mi@0: else: mi@0: self._logger.info('Iteration ' + str(i+1) + '/' + str(niter)) mi@0: mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()