#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Convex Hull Non-negative Matrix Factorization [1]

    CHNMF(NMF) : Class for Convex-hull NMF
    quickhull : Function for finding the convex hull in 2D

[1] C. Thurau, K. Kersting, and C. Bauckhage. Convex Non-Negative Matrix
Factorization in the Wild. ICDM 2009.
"""


import numpy as np

from itertools import combinations
from dist import vq
from pca import PCA
from aa import AA

__all__ = ["CHNMF"]


def quickhull(sample):
    """ Find data points on the convex hull of a supplied data set.

    Args:
        sample: data points as row vectors, n x d
            n - number of samples
            d - data dimension (should be two)

    Returns:
        a k x d matrix containing the convex hull data points
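
    Example (a small illustrative check): with this implementation, interior
    points are dropped and the hull comes back as a closed polygon, i.e. the
    first vertex is repeated at the end.

    >>> pts = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.5, 0.5]])
    >>> quickhull(pts).tolist()
    [[0.0, 0.0], [0.0, 1.0], [1.0, 1.0], [1.0, 0.0], [0.0, 0.0]]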
    """

    link = lambda a, b: np.concatenate((a, b[1:]))
    edge = lambda a, b: np.concatenate(([a], [b]))

    def dome(sample, base):
        h, t = base
        dists = np.dot(sample - h, np.dot(((0, -1), (1, 0)), (t - h)))
        outer = np.repeat(sample, dists > 0, axis=0)

        if len(outer):
            pivot = sample[np.argmax(dists)]
            return link(dome(outer, edge(h, pivot)),
                        dome(outer, edge(pivot, t)))
        else:
            return base

    if len(sample) > 2:
        axis = sample[:, 0]
        base = np.take(sample, [np.argmin(axis), np.argmax(axis)], axis=0)
        return link(dome(sample, base),
                    dome(sample, base[::-1]))
    else:
        return sample


class CHNMF(AA):
    """
    CHNMF(data, num_bases=4)

    Convex Hull Non-negative Matrix Factorization. Factorize a data matrix into
    two matrices s.t. F = | data - W*H | is minimal. H is restricted to convexity
    (H >= 0, sum(H, axis=1) = [1 .. 1]) and W resides on actual data points.
    Factorization is solved via an alternating least squares optimization using
    the quadratic programming solver from cvxopt. The results are usually
    equivalent to Archetypal Analysis (pymf.AA), but CHNMF also works for very
    large datasets.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)
    base_sel : int, optional
        Number of pairwise basis vector projections. Set to a value < rank(data).
        Computation time scales exponentially with this value; usually rather
        low values are sufficient (3-10).

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : Frobenius norm (after calling .factorize())

    Example
    -------
    Applying CHNMF to some rather stupid data set:

    >>> import numpy as np
    >>> from chnmf import CHNMF
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])

    Use 2 basis vectors -> W shape(data_dimension, 2).

    >>> chnmf_mdl = CHNMF(data, num_bases=2)

    And start computing the factorization.

    >>> chnmf_mdl.factorize()

    The basis vectors are now stored in chnmf_mdl.W, the coefficients in
    chnmf_mdl.H. To compute coefficients for an existing set of basis vectors,
    simply copy W to chnmf_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5, 2.0], [1.2, 1.8]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> chnmf_mdl = CHNMF(data, num_bases=2)
    >>> chnmf_mdl.W = W
    >>> chnmf_mdl.factorize(compute_w=False)

    The result is a set of coefficients chnmf_mdl.H, s.t. data = W * chnmf_mdl.H.
    """

    def __init__(self, data, num_bases=4, base_sel=3):

        # call inherited method
        AA.__init__(self, data, num_bases=num_bases)

        # base_sel should never be larger than the actual data dimension
        self._base_sel = base_sel
        if base_sel > self.data.shape[0]:
            self._base_sel = self.data.shape[0]

    def init_h(self):
        self.H = np.zeros((self._num_bases, self._num_samples))

    def init_w(self):
        self.W = np.zeros((self._data_dimension, self._num_bases))

    def _map_w_to_data(self):
        """ Return data points that are most similar to basis vectors W.
        """

        # assign each basis vector in W to the closest data sample
        self._Wmapped_index = vq(self.data, self.W)
        self.Wmapped = np.zeros(self.W.shape)

        # do not directly assign, i.e. Wdist = self.data[:, sel],
        # as sel might be unsorted (in non-ascending order)
        # -> sorting sel would screw up the matching to W if
        # self.data is stored as an hdf5 table (see h5py)
        for i, s in enumerate(self._Wmapped_index):
            self.Wmapped[:, i] = self.data[:, s]

    def update_w(self):
        """ Compute new W. """
        def select_hull_points(data, n=3):
            """ Select data points for pairwise projections of the first n
            dimensions. """

            # iterate over all projections and select data points
            idx = np.array([])

            # iterate over some pairwise combinations of dimensions
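            # e.g. for n=3 this visits the index pairs (0, 1), (0, 2) and
            # (1, 2), i.e. every 2-D view spanned by two of the first n
            # (projected) dimensions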
            for i in combinations(range(n), 2):
                # sample convex hull points in 2D projection
                convex_hull_d = quickhull(data[i, :].T)

                # get indices for convex hull data points
                idx = np.append(idx, vq(data[i, :], convex_hull_d.T))
                idx = np.unique(idx)

            return np.int32(idx)

        # determine convex hull data points using either PCA or random
        # projections
        method = 'randomprojection'
        if method == 'pca':
            pcamodel = PCA(self.data)
            pcamodel.factorize(show_progress=False)
            proj = pcamodel.H
        else:
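            # project the data onto _base_sel random directions (cf. [1]);
            # hull points of these low-dimensional projections serve as
            # candidates for hull points of the full data set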
            R = np.random.randn(self._base_sel, self._data_dimension)
            proj = np.dot(R, self.data)

        self._hull_idx = select_hull_points(proj, n=self._base_sel)
        aa_mdl = AA(self.data[:, self._hull_idx], num_bases=self._num_bases)

        # determine W
        aa_mdl.factorize(niter=50, compute_h=True, compute_w=True,
                         compute_err=True, show_progress=False)

        self.W = aa_mdl.W
        self._map_w_to_data()

    def factorize(self, show_progress=False, compute_w=True, compute_h=True,
                  compute_err=True, niter=1):
        """ Factorize s.t. WH = data.

        Parameters
        ----------
        show_progress : bool
            print some extra information to stdout.
        compute_h : bool
            iteratively update values for H.
        compute_w : bool
            iteratively update values for W.
        compute_err : bool
            compute Frobenius norm |data-WH| after each update and store
            it to .ferr[k].

        Updated Values
        --------------
        .W : updated values for W.
        .H : updated values for H.
        .ferr : Frobenius norm |data-WH|.
        """

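        # W is found in a single convex-hull pass (see update_w), so only
        # one outer AA iteration is performed regardless of niter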
        AA.factorize(self, niter=1, show_progress=show_progress,
                     compute_w=compute_w, compute_h=compute_h,
                     compute_err=compute_err)


if __name__ == "__main__":
    import doctest
    doctest.testmod()