annotate pymf/sub.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Matrix sampling methods
mi@0 8
mi@0 9 SUB: apply one of the matrix factorization methods of PyMF
mi@0 10 on sampled data for computing W, then compute H.
mi@0 11
mi@0 12 Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL).
mi@0 13 """
mi@0 14
mi@0 15
mi@0 16
mi@0 17 import numpy as np
mi@0 18 import random
mi@0 19 #from itertools import combinations
mi@0 20 from chnmf import combinations
mi@0 21
mi@0 22 import dist
mi@0 23 from chnmf import quickhull
mi@0 24 from nmf import NMF
mi@0 25 from pca import PCA
mi@0 26 from kmeans import Kmeans
mi@0 27 from laesa import LAESA
mi@0 28 from sivm import SIVM
mi@0 29
mi@0 30 __all__ = ["SUB"]
mi@0 31
class SUB(NMF):
    """
    SUB(data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2,
    num_bases=3, niterH=1, compute_h=True, compute_w=True, sstrategy='rand')

    Evaluate a matrix factorization method "mfmethod" for a certain sampling
    strategy "sstrategy": W is first computed by applying "mfmethod" to a
    small subsample of the columns of "data", then H is computed for the full
    data set with W fixed. This is particularly useful for very large
    datasets.

    Parameters
    ----------
    data : array_like
        input data matrix; samples are stored as columns
    mfmethod : class
        a PyMF factorization class (e.g. NMF) used for computing W
    nsub : int
        number of columns to subsample (default: 20; 'hull' selection
        overrides this with the number of hull points found)
    mapW : bool
        if True, replace each basis vector by the nearest actual data
        column; otherwise W is copied as computed (default: False)
    base_sel : int
        number of leading PCA dimensions whose pairwise projections are
        scanned by the 'hull' strategy (default: 2)
    sstrategy : str
        sampling strategy: 'cur', 'kmeans', 'hull', 'laesa', 'sivm';
        anything else falls back to uniform random sampling

    Attributes
    ----------
    W : basis vectors computed on the subsample
    H : coefficients computed on the full data
    ferr : final Frobenius reconstruction error (length-1 array)
    """

    def __init__(self, data, mfmethod, nsub=20, show_progress=True, mapW=False, base_sel=2,
                num_bases=3, niterH=1, compute_h=True, compute_w=True, sstrategy='rand'):
        NMF.__init__(self, data, num_bases=num_bases, compute_h=compute_h,
                     show_progress=show_progress, compute_w=compute_w)

        self._niterH = niterH
        self._nsub = nsub
        self.data = data
        self._mfmethod = mfmethod
        self._mapW = mapW
        self._sstrategy = sstrategy
        self._base_sel = base_sel

        # map the strategy name to the matching selection method;
        # unknown names fall back to uniform random sampling
        self._subfunc = {
            'cur': self.curselect,
            'kmeans': self.kmeansselect,
            'hull': self.hullselect,
            'laesa': self.laesaselect,
            'sivm': self.sivmselect,
        }.get(self._sstrategy, self.randselect)

    def hullselect(self):
        """ Select column indices lying on the convex hull of pairwise 2D
        PCA projections of the data; also updates self._nsub to the number
        of hull points found. """

        def select_hull_points(data, n=20):
            """ Select data points on the convex hulls of pairwise
            projections of the first n dimensions. """

            # accumulated (unique) indices over all projections
            idx = np.array([])

            # iterate over some pairwise combinations of dimensions
            for i in combinations(range(n), 2):
                # sample convex hull points in the 2D projection
                convex_hull_d = quickhull(data[i, :].T)

                # map hull points back to indices of the nearest data points
                idx = np.append(idx, dist.vq(data[i, :], convex_hull_d.T))
                idx = np.unique(idx)

            return np.int32(idx)

        # hull selection operates on the PCA coefficients of the data
        pcamodel = PCA(self.data, show_progress=self._show_progress)
        pcamodel.factorize()

        idx = select_hull_points(pcamodel.H, n=self._base_sel)

        # set the number of subsampled data points
        # (fix: was written to self.nsub, but the rest of the class
        # consistently reads self._nsub)
        self._nsub = len(idx)

        return idx

    def kmeansselect(self):
        """ Select the column indices closest to the self._nsub k-means
        cluster centres. """
        kmeans_mdl = Kmeans(self.data, num_bases=self._nsub)
        kmeans_mdl.initialization()
        kmeans_mdl.factorize()

        # pick data samples closest to the centres
        idx = dist.vq(kmeans_mdl.data, kmeans_mdl.W)
        return idx

    def curselect(self):
        """ Sample self._nsub column indices (with replacement) with
        probability proportional to the squared column norm, as in
        CUR-style column selection. Returns the sorted indices. """

        def sample_probability():
            # column probability ~ squared column norm; force float so
            # the normalization below cannot truncate for integer data
            dsquare = np.asarray(self.data[:, :], dtype=np.float64) ** 2

            pcol = np.array(dsquare.sum(axis=0))
            pcol /= pcol.sum()

            return pcol.reshape(-1, 1)

        probs = sample_probability()
        prob_cols = np.cumsum(probs.flatten())
        temp_ind = np.zeros(self._nsub, np.int32)

        # inverse transform sampling: pick the first column whose
        # cumulative probability exceeds a uniform random draw
        for i in range(self._nsub):
            temp_ind[i] = np.where(prob_cols >= np.random.rand())[0][0]

        return np.sort(temp_ind)

    def sivmselect(self):
        """ Select column indices via SIVM (simplex volume maximization)
        using the cosine distance. """
        sivmmdl = SIVM(self.data, num_bases=self._nsub, compute_w=True,
                       compute_h=False, dist_measure='cosine')
        sivmmdl.initialization()
        sivmmdl.factorize()
        return sivmmdl.select

    def laesaselect(self):
        """ Select column indices via LAESA using the cosine distance. """
        laesamdl = LAESA(self.data, num_bases=self._nsub, compute_w=True,
                         compute_h=False, dist_measure='cosine')
        laesamdl.initialization()
        laesamdl.factorize()
        return laesamdl.select

    def randselect(self):
        """ Select self._nsub column indices uniformly at random (without
        replacement). Returns the sorted indices. """
        # xrange: this module targets Python 2 (see the implicit relative
        # imports at the top of the file)
        idx = random.sample(xrange(self._num_samples), self._nsub)
        return np.sort(np.int32(idx))

    def update_w(self):
        """ Compute W by factorizing a subsample of the data columns
        (chosen by the configured sampling strategy), then install W in a
        full-data model self.mdl with compute_w=False. """
        idx = np.sort(np.int32(self._subfunc()))

        # factorize only the subsampled columns to obtain W cheaply
        mdl_small = self._mfmethod(self.data[:, idx],
                                   num_bases=self._num_bases,
                                   show_progress=self._show_progress,
                                   compute_w=True)

        # initialize W, H, and beta
        mdl_small.initialization()

        # determine W
        mdl_small.factorize()

        # full-data model: W is kept fixed, only H will be updated
        self.mdl = self._mfmethod(self.data[:, :],
                                  num_bases=self._num_bases,
                                  show_progress=self._show_progress,
                                  compute_w=False)

        self.mdl.initialization()

        if self._mapW:
            # map each basis vector to the index of the nearest data column
            _Wmapped_index = dist.vq(self.mdl.data, mdl_small.W)

            # do not directly assign, i.e. Wdist = self.data[:,sel]
            # as self might be unsorted (in non ascending order)
            # -> sorting sel would screw the matching to W if
            # self.data is stored as a hdf5 table (see h5py)
            for i, s in enumerate(_Wmapped_index):
                self.mdl.W[:, i] = self.mdl.data[:, s]
        else:
            self.mdl.W = np.copy(mdl_small.W)

    def update_h(self):
        """ Compute H on the full data with W held fixed. """
        self.mdl.factorize()

    def factorize(self):
        """Do factorization s.t. data = dot(dot(data,beta),H), under the convexity constraint
        beta >=0, sum(beta)=1, H >=0, sum(H)=1
        """
        # compute W from a subsample of the data
        self.update_w()

        # for CHNMF it is sometimes useful to only compute
        # the basis vectors
        if self._compute_h:
            self.update_h()

        self.W = self.mdl.W
        self.H = self.mdl.H

        # record the final reconstruction error
        self.ferr = np.zeros(1)
        self.ferr[0] = self.mdl.frobenius_norm()
        self._print_cur_status(' Fro:' + str(self.ferr[0]))
mi@0 224
if __name__ == "__main__":
    # run the doctests embedded in this module when executed as a script
    from doctest import testmod
    testmod()