mi@0: #!/usr/bin/python mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Geometric-Map mi@0: mi@0: GMAP: Class for Geometric-Map mi@0: """ mi@0: mi@0: mi@0: import scipy.sparse mi@0: import numpy as np mi@0: mi@0: from dist import * mi@0: from aa import AA mi@0: from kmeans import Kmeans mi@0: mi@0: __all__ = ["GMAP"] mi@0: mi@0: class GMAP(AA): mi@0: """ mi@0: GMAP(data, num_bases=4, dist_measure='l2') mi@0: mi@0: mi@0: Geometric-Map. Factorize a data matrix into two matrices s.t. mi@0: F = | data - W*H | is minimal. G-MAP can emulate/approximate several mi@0: standard methods including PCA, NMF, and AA. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like, shape (_data_dimension, _num_samples) mi@0: the input data mi@0: num_bases: int, optional mi@0: Number of bases to compute (column rank of W and row rank of H). mi@0: 4 (default) mi@0: method : one of 'pca' ,'nmf', 'aa', default is 'pca' which emulates mi@0: Principal Component Analysis using the geometric map method ('nmf' mi@0: emulates Non-negative Matrix Factorization, 'aa' emulates Archetypal mi@0: Analysis). mi@0: robust_map : bool, optional mi@0: use robust_map or the standard max-val selection mi@0: [see "On FastMap and the Convex Hull of Multivariate Data: Toward mi@0: Fast and Robust Dimension Reduction", Ostrouchov and Samatova, PAMI mi@0: 2005] mi@0: Attributes mi@0: ---------- mi@0: W : "data_dimension x num_bases" matrix of basis vectors mi@0: H : "num bases x num_samples" matrix of coefficients mi@0: ferr : frobenius norm (after calling .factorize()) mi@0: mi@0: Example mi@0: ------- mi@0: Applying GMAP to some rather stupid data set: mi@0: mi@0: >>> import numpy as np mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> gmap_mdl = GMAP(data, num_bases=2) mi@0: >>> gmap_mdl.factorize() mi@0: mi@0: The basis vectors are now stored in gmap_mdl.W, the coefficients in gmap_mdl.H. mi@0: To compute coefficients for an existing set of basis vectors simply copy W mi@0: to gmap_mdl.W, and set compute_w to False: mi@0: mi@0: >>> data = np.array([[1.5, 1.3], [1.2, 0.3]]) mi@0: >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) mi@0: >>> gmap_mdl = GMAP(data, num_bases=2) mi@0: >>> gmap_mdl.W = W mi@0: >>> gmap_mdl.factorize(compute_w=False) mi@0: mi@0: The result is a set of coefficients gmap_mdl.H, s.t. data = W * gmap_mdl.H. mi@0: """ mi@0: mi@0: # always overwrite the default number of iterations mi@0: # -> any value other does not make sense. mi@0: _NITER = 1 mi@0: mi@0: def __init__(self, data, num_bases=4, method='pca', robust_map=True): mi@0: mi@0: AA.__init__(self, data, num_bases=num_bases) mi@0: self.sub = [] mi@0: self._robust_map = robust_map mi@0: self._method = method mi@0: mi@0: mi@0: def init_h(self): mi@0: self.H = np.zeros((self._num_bases, self._num_samples)) mi@0: mi@0: def init_w(self): mi@0: self.W = np.zeros((self._data_dimension, self._num_bases)) mi@0: mi@0: def update_w(self): mi@0: """ compute new W """ mi@0: mi@0: def select_next(iterval): mi@0: """ select the next best data sample using robust map mi@0: or simply the max iterval ... """ mi@0: mi@0: if self._robust_map: mi@0: k = np.argsort(iterval)[::-1] mi@0: d_sub = self.data[:,k[:self._robust_nselect]] mi@0: self.sub.extend(k[:self._robust_nselect]) mi@0: mi@0: # cluster d_sub mi@0: kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster) mi@0: kmeans_mdl.factorize(niter=10) mi@0: mi@0: # get largest cluster mi@0: h = np.histogram(kmeans_mdl.assigned, range(self._robust_cluster+1))[0] mi@0: largest_cluster = np.argmax(h) mi@0: sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster+1], d_sub) mi@0: sel = k[np.argmin(sel)] mi@0: else: mi@0: sel = np.argmax(iterval) mi@0: mi@0: return sel mi@0: mi@0: EPS = 10**-8 mi@0: mi@0: if scipy.sparse.issparse(self.data): mi@0: norm_data = np.sqrt(self.data.multiply(self.data).sum(axis=0)) mi@0: norm_data = np.array(norm_data).reshape((-1)) mi@0: else: mi@0: norm_data = np.sqrt(np.sum(self.data**2, axis=0)) mi@0: mi@0: mi@0: self.select = [] mi@0: mi@0: if self._method == 'pca' or self._method == 'aa': mi@0: iterval = norm_data.copy() mi@0: mi@0: if self._method == 'nmf': mi@0: iterval = np.sum(self.data, axis=0)/(np.sqrt(self.data.shape[0])*norm_data) mi@0: iterval = 1.0 - iterval mi@0: mi@0: self.select.append(select_next(iterval)) mi@0: mi@0: mi@0: for l in range(1, self._num_bases): mi@0: mi@0: if scipy.sparse.issparse(self.data): mi@0: c = self.data[:, self.select[-1]:self.select[-1]+1].T * self.data mi@0: c = np.array(c.todense()) mi@0: else: mi@0: c = np.dot(self.data[:,self.select[-1]], self.data) mi@0: mi@0: c = c/(norm_data * norm_data[self.select[-1]]) mi@0: mi@0: if self._method == 'pca': mi@0: c = 1.0 - np.abs(c) mi@0: c = c * norm_data mi@0: mi@0: elif self._method == 'aa': mi@0: c = (c*-1.0 + 1.0)/2.0 mi@0: c = c * norm_data mi@0: mi@0: elif self._method == 'nmf': mi@0: c = 1.0 - np.abs(c) mi@0: mi@0: ### update the estimated volume mi@0: iterval = c * iterval mi@0: mi@0: # detect the next best data point mi@0: self.select.append(select_next(iterval)) mi@0: mi@0: self._logger.info('cur_nodes: ' + str(self.select)) mi@0: mi@0: # sort indices, otherwise h5py won't work mi@0: self.W = self.data[:, np.sort(self.select)] mi@0: mi@0: # "unsort" it again to keep the correct order mi@0: self.W = self.W[:, np.argsort(np.argsort(self.select))] mi@0: mi@0: def factorize(self, show_progress=False, compute_w=True, compute_h=True, mi@0: compute_err=True, robust_cluster=3, niter=1, robust_nselect=-1): mi@0: """ Factorize s.t. WH = data mi@0: mi@0: Parameters mi@0: ---------- mi@0: show_progress : bool mi@0: print some extra information to stdout. mi@0: False, default mi@0: compute_h : bool mi@0: iteratively update values for H. mi@0: True, default mi@0: compute_w : bool mi@0: iteratively update values for W. mi@0: default, True mi@0: compute_err : bool mi@0: compute Frobenius norm |data-WH| after each update and store mi@0: it to .ferr[k]. mi@0: robust_cluster : int, optional mi@0: set the number of clusters for robust map selection. mi@0: 3, default mi@0: robust_nselect : int, optional mi@0: set the number of samples to consider for robust map mi@0: selection. mi@0: -1, default (automatically determine suitable number) mi@0: mi@0: Updated Values mi@0: -------------- mi@0: .W : updated values for W. mi@0: .H : updated values for H. mi@0: .ferr : Frobenius norm |data-WH|. mi@0: """ mi@0: self._robust_cluster = robust_cluster mi@0: self._robust_nselect = robust_nselect mi@0: mi@0: if self._robust_nselect == -1: mi@0: self._robust_nselect = np.round(np.log(self.data.shape[1])*2) mi@0: mi@0: AA.factorize(self, niter=1, show_progress=show_progress, mi@0: compute_w=compute_w, compute_h=compute_h, mi@0: compute_err=compute_err) mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()