#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Geometric-Map

GMAP: Class for Geometric-Map
"""


import scipy.sparse
import numpy as np

from dist import *
from aa import AA
from kmeans import Kmeans

__all__ = ["GMAP"]

class GMAP(AA):
    """
    GMAP(data, num_bases=4, method='pca', robust_map=True)

    Geometric-Map. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | is minimal. G-MAP can emulate/approximate several
    standard methods including PCA, NMF, and AA.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    method : {'pca', 'nmf', 'aa'}, optional
        Method to emulate using the geometric map: 'pca' emulates Principal
        Component Analysis, 'nmf' emulates Non-negative Matrix Factorization,
        and 'aa' emulates Archetypal Analysis.
        'pca' (default)
    robust_map : bool, optional
        use robust_map instead of the standard max-val selection
        [see "On FastMap and the Convex Hull of Multivariate Data: Toward
        Fast and Robust Dimension Reduction", Ostrouchov and Samatova, PAMI
        2005]

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : frobenius norm (after calling .factorize())

    Example
    -------
    Applying GMAP to a simple data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> gmap_mdl = GMAP(data, num_bases=2)
    >>> gmap_mdl.factorize()

    The basis vectors are now stored in gmap_mdl.W, the coefficients in
    gmap_mdl.H. To compute coefficients for an existing set of basis
    vectors, simply copy W to gmap_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5, 1.3], [1.2, 0.3]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> gmap_mdl = GMAP(data, num_bases=2)
    >>> gmap_mdl.W = W
    >>> gmap_mdl.factorize(compute_w=False)

    The result is a set of coefficients gmap_mdl.H, s.t. data = W * gmap_mdl.H.
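
    The emulated method and the selection scheme are chosen at construction
    time. A minimal sketch reusing the data from above (robust_map=False
    falls back to the plain max-val selection):

    >>> gmap_mdl = GMAP(data, num_bases=2, method='nmf', robust_map=False)
    >>> gmap_mdl.factorize()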
    """

    # always overwrite the default number of iterations
    # -> any other value does not make sense.
    _NITER = 1

    def __init__(self, data, num_bases=4, method='pca', robust_map=True):
        AA.__init__(self, data, num_bases=num_bases)
        self.sub = []
        self._robust_map = robust_map
        self._method = method


    def init_h(self):
        self.H = np.zeros((self._num_bases, self._num_samples))

    def init_w(self):
        self.W = np.zeros((self._data_dimension, self._num_bases))

    def update_w(self):
        """ compute new W """

        def select_next(iterval):
            """ select the next best data sample using robust map
            or simply the maximum of iterval ... """

            if self._robust_map:
                # rank all samples by their current score and keep the
                # best _robust_nselect candidates
                k = np.argsort(iterval)[::-1]
                d_sub = self.data[:, k[:self._robust_nselect]]
                self.sub.extend(k[:self._robust_nselect])

                # cluster d_sub
                kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster)
                kmeans_mdl.factorize(niter=10)

                # get largest cluster
                h = np.histogram(kmeans_mdl.assigned, range(self._robust_cluster + 1))[0]
                largest_cluster = np.argmax(h)

                # pick the candidate closest to the largest cluster's centroid
                sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster + 1], d_sub)
                sel = k[np.argmin(sel)]
            else:
                sel = np.argmax(iterval)

            return sel

        EPS = 10**-8

        # column-wise l2 norms of the data
        if scipy.sparse.issparse(self.data):
            norm_data = np.sqrt(self.data.multiply(self.data).sum(axis=0))
            norm_data = np.array(norm_data).reshape((-1))
        else:
            norm_data = np.sqrt(np.sum(self.data**2, axis=0))

        self.select = []

        if self._method == 'pca' or self._method == 'aa':
            iterval = norm_data.copy()

        if self._method == 'nmf':
            iterval = np.sum(self.data, axis=0) / (np.sqrt(self.data.shape[0]) * norm_data)
            iterval = 1.0 - iterval

        self.select.append(select_next(iterval))

        for l in range(1, self._num_bases):
            # inner products of every column with the last selected column
            if scipy.sparse.issparse(self.data):
                c = self.data[:, self.select[-1]:self.select[-1] + 1].T * self.data
                c = np.array(c.todense()).reshape((-1))
            else:
                c = np.dot(self.data[:, self.select[-1]], self.data)

            # normalize to cosine similarities
            c = c / (norm_data * norm_data[self.select[-1]])

            if self._method == 'pca':
                c = 1.0 - np.abs(c)
                c = c * norm_data

            elif self._method == 'aa':
                c = (c * -1.0 + 1.0) / 2.0
                c = c * norm_data

            elif self._method == 'nmf':
                c = 1.0 - np.abs(c)

            # update the estimated volume
            iterval = c * iterval
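            # (samples similar to an already selected column get a small c
            # and are damped multiplicatively, so successive selections tend
            # to grow the volume spanned by the chosen columns)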

            # detect the next best data point
            self.select.append(select_next(iterval))

        self._logger.info('cur_nodes: ' + str(self.select))

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(self.select)]

        # "unsort" it again to keep the correct order
        self.W = self.W[:, np.argsort(np.argsort(self.select))]

    def factorize(self, show_progress=False, compute_w=True, compute_h=True,
                  compute_err=True, robust_cluster=3, niter=1, robust_nselect=-1):
        """ Factorize s.t. WH = data

        Parameters
        ----------
        show_progress : bool
            print some extra information to stdout.
            False (default)
        compute_h : bool
            iteratively update values for H.
            True (default)
        compute_w : bool
            iteratively update values for W.
            True (default)
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].
        robust_cluster : int, optional
            set the number of clusters for robust map selection.
            3 (default)
        robust_nselect : int, optional
            set the number of samples to consider for robust map
            selection.
            -1 (default; automatically determine a suitable number)

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH|.
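
        Example
        -------
        A minimal sketch of tuning the robust selection; the parameter
        values here are illustrative only:

        >>> data = np.random.random((3, 10))
        >>> gmap_mdl = GMAP(data, num_bases=2)
        >>> gmap_mdl.factorize(robust_cluster=2, robust_nselect=4)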
        """
        self._robust_cluster = robust_cluster
        self._robust_nselect = robust_nselect

        if self._robust_nselect == -1:
            # int() is needed here: np.round() returns a float, which
            # cannot be used for slicing in select_next()
            self._robust_nselect = int(np.round(np.log(self.data.shape[1]) * 2))

        AA.factorize(self, niter=1, show_progress=show_progress,
                     compute_w=compute_w, compute_h=compute_h,
                     compute_err=compute_err)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
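
    # a small ad-hoc demonstration (illustrative only): factorize random
    # data and compare the Frobenius error of the emulated methods
    for m in ['pca', 'nmf', 'aa']:
        mdl = GMAP(np.random.random((5, 20)), num_bases=3, method=m)
        mdl.factorize()
        print m, mdl.ferr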