#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Simplex Volume Maximization [1]

    SIVM: class for SiVM

[1] C. Thurau, K. Kersting, and C. Bauckhage. Yes We Can - Simplex Volume
Maximization for Descriptive Web-Scale Matrix Factorization. In Proc. Int.
Conf. on Information and Knowledge Management. ACM. 2010.
"""


import scipy.sparse
import numpy as np

from dist import *
from aa import AA

__all__ = ["SIVM"]

class SIVM(AA):
    """
    SIVM(data, num_bases=4, dist_measure='l2', init='fastmap')

    Simplex Volume Maximization. Factorize a data matrix into two matrices
    s.t. F = | data - W*H | is minimal. H is restricted to convexity. W is
    found iteratively by maximizing the volume of the resulting simplex
    (see [1]).

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    dist_measure : one of 'l2', 'l1', 'cosine', 'abs_cosine',
        'weighted_abs_cosine', 'kl'
        Default is 'l2', which maximizes the volume of the simplex. In
        contrast, 'cosine' maximizes the volume of a cone (see [1] for
        details).
    init : string (default: 'fastmap')
        'fastmap' or 'origin'. Sets the method used for finding the very
        first basis vector. 'origin' assumes the zero vector, 'fastmap'
        picks one of the two vectors that have the largest pairwise
        distance.

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : frobenius norm (after calling .factorize())

    Example
    -------
    Applying SIVM to some rather stupid data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> sivm_mdl = SIVM(data, num_bases=2)
    >>> sivm_mdl.factorize()

    The basis vectors are now stored in sivm_mdl.W, the coefficients in
    sivm_mdl.H. To compute coefficients for an existing set of basis
    vectors, simply copy W to sivm_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> sivm_mdl = SIVM(data, num_bases=2)
    >>> sivm_mdl.W = W
    >>> sivm_mdl.factorize(compute_w=False)

    The result is a set of coefficients sivm_mdl.H, s.t. data = W * sivm_mdl.H.
    """
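    # How the pieces fit together (summary comment added for clarity, not
    # from the original source): init_sivm() picks the first simplex vertex
    # ('fastmap' or 'origin'), update_w() then greedily adds one data column
    # per basis by maximizing the volume criterion of [1], and the convex
    # coefficients H are fit by the solver inherited from AA.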
    # always overwrite the default number of iterations
    # -> any other value does not make sense.
    _NITER = 1

    def __init__(self, data, num_bases=4, dist_measure='l2', init='fastmap'):

        AA.__init__(self, data, num_bases=num_bases)

        self._dist_measure = dist_measure
        self._init = init

        # assign the correct distance function
        if self._dist_measure == 'l1':
            self._distfunc = l1_distance

        elif self._dist_measure == 'l2':
            self._distfunc = l2_distance

        elif self._dist_measure == 'cosine':
            self._distfunc = cosine_distance

        elif self._dist_measure == 'abs_cosine':
            self._distfunc = abs_cosine_distance

        elif self._dist_measure == 'weighted_abs_cosine':
            self._distfunc = weighted_abs_cosine_distance

        elif self._dist_measure == 'kl':
            self._distfunc = kl_divergence

        else:
            # fail early; otherwise a missing _distfunc only surfaces as an
            # AttributeError during factorize()
            raise ValueError('unknown dist_measure: ' + str(dist_measure))

    def _distance(self, idx):
        """ compute distances of a specific data point to all other samples"""

        if scipy.sparse.issparse(self.data):
            step = self.data.shape[1]
        else:
            step = 50000

        d = np.zeros((self.data.shape[1]))
        if idx == -1:
            # set vec to origin if idx=-1
            vec = np.zeros((self.data.shape[0], 1))
            if scipy.sparse.issparse(self.data):
                vec = scipy.sparse.csc_matrix(vec)
        else:
            vec = self.data[:, idx:idx+1]

        self._logger.info('compute distance to node ' + str(idx))

        # slice data into smaller chunks to bound memory consumption
        for idx_start in range(0, self.data.shape[1], step):
            if idx_start + step > self.data.shape[1]:
                idx_end = self.data.shape[1]
            else:
                idx_end = idx_start + step

            d[idx_start:idx_end] = self._distfunc(
                self.data[:, idx_start:idx_end], vec)
            self._logger.info('completed:' +
                str(idx_end/(self.data.shape[1]/100.0)) + "%")
        return d

    def init_h(self):
        self.H = np.zeros((self._num_bases, self._num_samples))

    def init_w(self):
        self.W = np.zeros((self._data_dimension, self._num_bases))

    def init_sivm(self):
        self.select = []
        if self._init == 'fastmap':
            # Fastmap-like initialization:
            # set the starting index for fastmap initialization
            cur_p = 0

            # after 3 iterations the first "real" index is found
            for i in range(3):
                d = self._distance(cur_p)
                cur_p = np.argmax(d)

            # store the maximal distance found -> later used for "a" (-> update_w)
            self._maxd = np.max(d)
            self.select.append(cur_p)

        elif self._init == 'origin':
            # set the first vertex to the origin
            cur_p = -1
            d = self._distance(cur_p)
            self._maxd = np.max(d)
            self.select.append(cur_p)

    def update_w(self):
        """ compute new W """
        EPS = 10**-8
        self.init_sivm()

        # initialize some of the recursively updated distance measures ....
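        # What the accumulators hold after processing the first l selected
        # vertices, with d_k = log-distance of every sample to vertex k
        # (descriptive comment added for clarity; see [1] for the derivation):
        #   d_sum         : sum_k d_k
        #   d_square      : sum_k d_k**2
        #   d_i_times_d_j : sum over pairs i < j of d_i * d_j
        # distiter combines them into the volume score of [1], so adding a
        # new vertex only costs one fresh distance computation per step.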
        d_square = np.zeros((self.data.shape[1]))
        d_sum = np.zeros((self.data.shape[1]))
        d_i_times_d_j = np.zeros((self.data.shape[1]))
        distiter = np.zeros((self.data.shape[1]))
        a = np.log(self._maxd)

        for l in range(1, self._num_bases):
            d = self._distance(self.select[l-1])

            # take the log of d (usually more stable than d)
            d = np.log(d + EPS)

            d_i_times_d_j += d * d_sum
            d_sum += d
            d_square += d**2
            distiter = d_i_times_d_j + a*d_sum - (l/2.0) * d_square

            # detect the next best data point
            self.select.append(np.argmax(distiter))

            self._logger.info('cur_nodes: ' + str(self.select))

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]

    def factorize(self, show_progress=False, compute_w=True, compute_h=True,
                  compute_err=True, niter=1):
        """ Factorize s.t. WH = data

        Parameters
        ----------
        show_progress : bool
            print some extra information to stdout.
        compute_h : bool
            iteratively update values for H.
        compute_w : bool
            iteratively update values for W.
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].
        niter : int
            ignored; SIVM always runs exactly one iteration (see _NITER).

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH|.
        """

        AA.factorize(self, niter=1, show_progress=show_progress,
                     compute_w=compute_w, compute_h=compute_h,
                     compute_err=compute_err)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
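# A minimal usage sketch (illustrative only; mirrors the docstring examples,
# the random data and variable names are assumed):
#
#   import numpy as np
#   data = np.random.random((3, 100))       # 3 dimensions, 100 samples
#   sivm_mdl = SIVM(data, num_bases=4)      # default 'l2' simplex volume
#   sivm_mdl.factorize()
#   rec = np.dot(sivm_mdl.W, sivm_mdl.H)    # convex low-rank reconstruction
#   # the reconstruction error is stored in sivm_mdl.ferr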