#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Matrix sampling methods

SUB: apply one of the matrix factorization methods of PyMF
on sampled data for computing W, then compute H.

Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL).
"""


import numpy as np
import random
#from itertools import combinations
from chnmf import combinations

import dist
from chnmf import quickhull
from nmf import NMF
from pca import PCA
from kmeans import Kmeans
from laesa import LAESA
from sivm import SIVM

__all__ = ["SUB"]


class SUB(NMF):
    """
    SUB(data, mfmethod, sstrategy='rand', nsub=20, show_progress=True,
    mapW=False, base_sel=2, num_bases=3, niterH=1, compute_h=True,
    compute_w=True)

    Evaluate a matrix factorization method "mfmethod" for a certain sampling
    strategy "sstrategy": W is computed from a subsample of the data columns,
    then H is computed for the complete data set. This is particularly
    useful for very large datasets.

    Parameters
    ----------
    data : array_like
        input data; samples are stored in columns
    mfmethod : class
        PyMF factorization class used to compute W (e.g. NMF)
    nsub : int
        number of data columns to subsample (default: 20)
    mapW : bool
        if True, map each computed basis vector back onto its nearest
        actual data column (default: False)
    base_sel : int
        number of leading PCA dimensions whose pairwise 2D projections
        are scanned by the 'hull' strategy (default: 2)
    num_bases : int
        number of basis vectors W (default: 3)
    sstrategy : str
        sampling strategy: one of 'cur', 'kmeans', 'hull', 'laesa',
        'sivm'; anything else falls back to random sampling
        (default: 'rand')

    Attributes
    ----------
    W : basis matrix of the full-data model
    H : coefficient matrix of the full-data model
    ferr : Frobenius norm of the final factorization
    """

    def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2,
                 num_bases=3, niterH=1, compute_h=True, compute_w=True, sstrategy='rand'):
        NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h,
                     show_progress=show_progress, compute_w=compute_w)

        self._niterH = niterH
        self._nsub = nsub
        self.data = data
        self._mfmethod = mfmethod
        self._mapW = mapW
        self._sstrategy = sstrategy
        self._base_sel = base_sel

        # assign the sampling function for the chosen strategy; any
        # unrecognized value falls back to random sampling ('rand')
        self._subfunc = {
            'cur': self.curselect,
            'kmeans': self.kmeansselect,
            'hull': self.hullselect,
            'laesa': self.laesaselect,
            'sivm': self.sivmselect,
        }.get(self._sstrategy, self.randselect)

    def hullselect(self):
        """Select indices of data points lying on convex hulls of pairwise
        2D PCA projections of the data."""

        def select_hull_points(data, n=20):
            """ select data points on the convex hulls of pairwise
            projections of the first n dimensions """
            idx = np.array([])

            # iterate over some pairwise combinations of dimensions
            for i in combinations(range(n), 2):
                # sample convex hull points in the 2D projection
                convex_hull_d = quickhull(data[i, :].T)

                # get indices of the data points closest to the hull points
                idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T))
                idx = np.unique(idx)

            return np.int32(idx)

        # project the data via PCA first, then select hull points in the
        # low-dimensional pairwise projections
        pcamodel = PCA(self.data, show_progress=self._show_progress)
        pcamodel.factorize()

        idx = select_hull_points(pcamodel.H, n=self._base_sel)

        # record the number of subsampled data points
        # (fix: was written to 'self.nsub', a one-off attribute; the rest
        # of the class consistently uses 'self._nsub')
        self._nsub = len(idx)

        return idx

    def kmeansselect(self):
        """Select indices of the data columns closest to k-means centres."""
        kmeans_mdl = Kmeans(self.data, num_bases=self._nsub)
        kmeans_mdl.initialization()
        kmeans_mdl.factorize()

        # pick data samples closest to the centres
        idx = dist.vq(kmeans_mdl.data, kmeans_mdl.W)
        return idx

    def curselect(self):
        """Sample column indices (with replacement) with probability
        proportional to the squared column norm, CUR-style."""

        def sample_probability():
            dsquare = self.data[:, :]**2

            pcol = np.array(dsquare.sum(axis=0))
            # out-of-place division: in-place '/=' raises a casting error
            # when the input data has an integer dtype
            pcol = pcol / pcol.sum()

            return pcol.reshape(-1, 1)

        probs = sample_probability()
        prob_cols = np.cumsum(probs.flatten())
        temp_ind = np.zeros(self._nsub, np.int32)

        # inverse-transform sampling on the cumulative column probabilities
        for i in range(self._nsub):
            temp_ind[i] = np.where(prob_cols >= np.random.rand())[0][0]

        return np.sort(temp_ind)

    def sivmselect(self):
        """Select column indices via SIVM (cosine distance)."""
        sivmmdl = SIVM(self.data, num_bases=self._nsub, compute_w=True,
                       compute_h=False, dist_measure='cosine')

        sivmmdl.initialization()
        sivmmdl.factorize()
        idx = sivmmdl.select
        return idx

    def laesaselect(self):
        """Select column indices via LAESA (cosine distance)."""
        laesamdl = LAESA(self.data, num_bases=self._nsub, compute_w=True,
                        compute_h=False, dist_measure='cosine')
        laesamdl.initialization()
        laesamdl.factorize()
        idx = laesamdl.select
        return idx

    def randselect(self):
        """Select self._nsub column indices uniformly at random
        (without replacement)."""
        # 'range' instead of Python-2-only 'xrange'; behavior is identical
        # as the argument to random.sample
        idx = random.sample(range(self._num_samples), self._nsub)
        return np.sort(np.int32(idx))

    def update_w(self):
        """Compute W by factorizing a subsample of the data columns, then
        transfer the basis to a model over the full data set (self.mdl)."""
        idx = self._subfunc()
        idx = np.sort(np.int32(idx))

        # factorize only the selected columns to obtain W
        mdl_small = self._mfmethod(self.data[:, idx],
                                   num_bases=self._num_bases,
                                   show_progress=self._show_progress,
                                   compute_w=True)

        # initialize W, H, and beta
        mdl_small.initialization()

        # determine W
        mdl_small.factorize()

        # set up the full-data model; W is kept fixed (compute_w=False),
        # only H will be computed later
        self.mdl = self._mfmethod(self.data[:, :],
                                  num_bases=self._num_bases,
                                  show_progress=self._show_progress,
                                  compute_w=False)

        self.mdl.initialization()

        if self._mapW:
            # compute pairwise distances and map each basis vector to its
            # closest actual data column
            #distance = vq(self.data, self.W)
            _Wmapped_index = dist.vq(self.mdl.data, mdl_small.W)

            # do not directly assign, i.e. Wdist = self.data[:,sel]
            # as self might be unsorted (in non ascending order)
            # -> sorting sel would screw the matching to W if
            # self.data is stored as a hdf5 table (see h5py)
            for i, s in enumerate(_Wmapped_index):
                self.mdl.W[:, i] = self.mdl.data[:, s]
        else:
            self.mdl.W = np.copy(mdl_small.W)

    def update_h(self):
        """Compute H for the full data set using the fixed basis W."""
        self.mdl.factorize()

    def factorize(self):
        """Do factorization s.t. data = dot(dot(data,beta),H), under the convexity constraint
        beta >=0, sum(beta)=1, H >=0, sum(H)=1
        """
        # compute W from a subsample of the data
        self.update_w()

        # for CHNMF it is sometimes useful to only compute
        # the basis vectors
        if self._compute_h:
            self.update_h()

        # expose the full-data model's factors on this instance
        self.W = self.mdl.W
        self.H = self.mdl.H

        self.ferr = np.zeros(1)
        self.ferr[0] = self.mdl.frobenius_norm()
        self._print_cur_status(' Fro:' + str(self.ferr[0]))


if __name__ == "__main__":
    import doctest
    doctest.testmod()