annotate pymf/chnmf.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF Convex Hull Non-negative Matrix Factorization [1]
mi@0 8
mi@0 9 CHNMF(NMF) : Class for Convex-hull NMF
mi@0 10 quickhull : Function for finding the convex hull in 2D
mi@0 11
mi@0 12 [1] C. Thurau, K. Kersting, and C. Bauckhage. Convex Non-Negative Matrix
mi@0 13 Factorization in the Wild. ICDM 2009.
mi@0 14 """
mi@0 15
mi@0 16
mi@0 17 import numpy as np
mi@0 18
mi@0 19 from itertools import combinations
mi@0 20 from dist import vq
mi@0 21 from pca import PCA
mi@0 22 from aa import AA
mi@0 23
mi@0 24 __all__ = ["CHNMF"]
mi@0 25
mi@0 26
mi@0 27 def quickhull(sample):
mi@0 28 """ Find data points on the convex hull of a supplied data set
mi@0 29
mi@0 30 Args:
mi@0 31 sample: data points as column vectors n x d
mi@0 32 n - number samples
mi@0 33 d - data dimension (should be two)
mi@0 34
mi@0 35 Returns:
mi@0 36 a k x d matrix containint the convex hull data points
mi@0 37 """
mi@0 38
mi@0 39 link = lambda a, b: np.concatenate((a, b[1:]))
mi@0 40 edge = lambda a, b: np.concatenate(([a], [b]))
mi@0 41
mi@0 42 def dome(sample, base):
mi@0 43 h, t = base
mi@0 44 dists = np.dot(sample - h, np.dot(((0, -1), (1, 0)), (t - h)))
mi@0 45 outer = np.repeat(sample, dists > 0, axis=0)
mi@0 46
mi@0 47 if len(outer):
mi@0 48 pivot = sample[np.argmax(dists)]
mi@0 49 return link(dome(outer, edge(h, pivot)),
mi@0 50 dome(outer, edge(pivot, t)))
mi@0 51 else:
mi@0 52 return base
mi@0 53
mi@0 54 if len(sample) > 2:
mi@0 55 axis = sample[:, 0]
mi@0 56 base = np.take(sample, [np.argmin(axis), np.argmax(axis)], axis=0)
mi@0 57 return link(dome(sample, base),
mi@0 58 dome(sample, base[::-1]))
mi@0 59 else:
mi@0 60 return sample
mi@0 61
mi@0 62 class CHNMF(AA):
mi@0 63 """
mi@0 64 CHNMF(data, num_bases=4)
mi@0 65
mi@0 66 Convex Hull Non-negative Matrix Factorization. Factorize a data matrix into
mi@0 67 two matrices s.t. F = | data - W*H | is minimal. H is restricted to convexity
mi@0 68 (H >=0, sum(H, axis=1) = [1 .. 1]) and W resides on actual data points.
mi@0 69 Factorization is solved via an alternating least squares optimization using
mi@0 70 the quadratic programming solver from cvxopt. The results are usually
mi@0 71 equivalent to Archetypal Analysis (pymf.AA) but CHNMF also works for very
mi@0 72 large datasets.
mi@0 73
mi@0 74 Parameters
mi@0 75 ----------
mi@0 76 data : array_like, shape (_data_dimension, _num_samples)
mi@0 77 the input data
mi@0 78 num_bases: int, optional
mi@0 79 Number of bases to compute (column rank of W and row rank of H).
mi@0 80 4 (default)
mi@0 81 base_sel: int,
mi@0 82 Number of pairwise basis vector projections. Set to a value< rank(data).
mi@0 83 Computation time scale exponentially with this value, usually rather low
mi@0 84 values are sufficient (3-10).
mi@0 85
mi@0 86 Attributes
mi@0 87 ----------
mi@0 88 W : "data_dimension x num_bases" matrix of basis vectors
mi@0 89 H : "num bases x num_samples" matrix of coefficients
mi@0 90 ferr : frobenius norm (after calling .factorize())
mi@0 91
mi@0 92 Example
mi@0 93 -------
mi@0 94 Applying CHNMF to some rather stupid data set:
mi@0 95
mi@0 96 >>> import numpy as np
mi@0 97 >>> from chnmf import CHNMF
mi@0 98 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
mi@0 99
mi@0 100 Use 2 basis vectors -> W shape(data_dimension, 2).
mi@0 101
mi@0 102 >>> chnmf_mdl = CHNMF(data, num_bases=2)
mi@0 103
mi@0 104 And start computing the factorization.
mi@0 105
mi@0 106 >>> chnmf_mdl.factorize()
mi@0 107
mi@0 108 The basis vectors are now stored in chnmf_mdl.W, the coefficients in
mi@0 109 chnmf_mdl.H. To compute coefficients for an existing set of basis vectors
mi@0 110 simply copy W to chnmf_mdl.W, and set compute_w to False:
mi@0 111
mi@0 112 >>> data = np.array([[1.5, 2.0], [1.2, 1.8]])
mi@0 113 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
mi@0 114 >>> chnmf_mdl = CHNMF(data, num_bases=2)
mi@0 115 >>> chnmf_mdl.W = W
mi@0 116 >>> chnmf_mdl.factorize(compute_w=False)
mi@0 117
mi@0 118 The result is a set of coefficients chnmf_mdl.H, s.t. data = W * chnmf_mdl.H.
mi@0 119 """
mi@0 120
mi@0 121 def __init__(self, data, num_bases=4, base_sel=3):
mi@0 122
mi@0 123 # call inherited method
mi@0 124 AA.__init__(self, data, num_bases=num_bases)
mi@0 125
mi@0 126 # base sel should never be larger than the actual data dimension
mi@0 127 self._base_sel = base_sel
mi@0 128 if base_sel > self.data.shape[0]:
mi@0 129 self._base_sel = self.data.shape[0]
mi@0 130
mi@0 131 def init_h(self):
mi@0 132 self.H = np.zeros((self._num_bases, self._num_samples))
mi@0 133
mi@0 134 def init_w(self):
mi@0 135 self.W = np.zeros((self._data_dimension, self._num_bases))
mi@0 136
mi@0 137 def _map_w_to_data(self):
mi@0 138 """ Return data points that are most similar to basis vectors W
mi@0 139 """
mi@0 140
mi@0 141 # assign W to the next best data sample
mi@0 142 self._Wmapped_index = vq(self.data, self.W)
mi@0 143 self.Wmapped = np.zeros(self.W.shape)
mi@0 144
mi@0 145 # do not directly assign, i.e. Wdist = self.data[:,sel]
mi@0 146 # as self might be unsorted (in non ascending order)
mi@0 147 # -> sorting sel would screw the matching to W if
mi@0 148 # self.data is stored as a hdf5 table (see h5py)
mi@0 149 for i, s in enumerate(self._Wmapped_index):
mi@0 150 self.Wmapped[:,i] = self.data[:,s]
mi@0 151
mi@0 152 def update_w(self):
mi@0 153 """ compute new W """
mi@0 154 def select_hull_points(data, n=3):
mi@0 155 """ select data points for pairwise projections of the first n
mi@0 156 dimensions """
mi@0 157
mi@0 158 # iterate over all projections and select data points
mi@0 159 idx = np.array([])
mi@0 160
mi@0 161 # iterate over some pairwise combinations of dimensions
mi@0 162 for i in combinations(range(n), 2):
mi@0 163 # sample convex hull points in 2D projection
mi@0 164 convex_hull_d = quickhull(data[i, :].T)
mi@0 165
mi@0 166 # get indices for convex hull data points
mi@0 167 idx = np.append(idx, vq(data[i, :], convex_hull_d.T))
mi@0 168 idx = np.unique(idx)
mi@0 169
mi@0 170 return np.int32(idx)
mi@0 171
mi@0 172 # determine convex hull data points using either PCA or random
mi@0 173 # projections
mi@0 174 method = 'randomprojection'
mi@0 175 if method == 'pca':
mi@0 176 pcamodel = PCA(self.data)
mi@0 177 pcamodel.factorize(show_progress=False)
mi@0 178 proj = pcamodel.H
mi@0 179 else:
mi@0 180 R = np.random.randn(self._base_sel, self._data_dimension)
mi@0 181 proj = np.dot(R, self.data)
mi@0 182
mi@0 183 self._hull_idx = select_hull_points(proj, n=self._base_sel)
mi@0 184 aa_mdl = AA(self.data[:, self._hull_idx], num_bases=self._num_bases)
mi@0 185
mi@0 186 # determine W
mi@0 187 aa_mdl.factorize(niter=50, compute_h=True, compute_w=True,
mi@0 188 compute_err=True, show_progress=False)
mi@0 189
mi@0 190 self.W = aa_mdl.W
mi@0 191 self._map_w_to_data()
mi@0 192
mi@0 193 def factorize(self, show_progress=False, compute_w=True, compute_h=True,
mi@0 194 compute_err=True, niter=1):
mi@0 195 """ Factorize s.t. WH = data
mi@0 196
mi@0 197 Parameters
mi@0 198 ----------
mi@0 199 show_progress : bool
mi@0 200 print some extra information to stdout.
mi@0 201 compute_h : bool
mi@0 202 iteratively update values for H.
mi@0 203 compute_w : bool
mi@0 204 iteratively update values for W.
mi@0 205 compute_err : bool
mi@0 206 compute Frobenius norm |data-WH| after each update and store
mi@0 207 it to .ferr[k].
mi@0 208
mi@0 209 Updated Values
mi@0 210 --------------
mi@0 211 .W : updated values for W.
mi@0 212 .H : updated values for H.
mi@0 213 .ferr : Frobenius norm |data-WH|.
mi@0 214 """
mi@0 215
mi@0 216 AA.factorize(self, niter=1, show_progress=show_progress,
mi@0 217 compute_w=compute_w, compute_h=compute_h,
mi@0 218 compute_err=compute_err)
mi@0 219
mi@0 220
mi@0 221 if __name__ == "__main__":
mi@0 222 import doctest
mi@0 223 doctest.testmod()