#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Fuzzy C-means clustering (convex matrix factorization).
Copyright (C) Christian Thurau, 2010. GNU General Public License (GPL).
"""

import numpy as np

import dist
from nmf import NMF

__all__ = ["Cmeans"]


class Cmeans(NMF):
    """
    Cmeans(data, num_bases=4)

    Fuzzy c-means soft clustering. Factorize a data matrix into two matrices
    s.t. F = | data - W*H | is minimal. H is restricted to convexity (each
    column sums to 1); W is the weighted mean over the corresponding samples
    in data. Note that the objective function is based on distances, hence
    the Frobenius norm is probably not a good quality measure.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    ferr : Frobenius norm (after calling .factorize())

    Example
    -------
    Applying C-means to a small toy data set:

    >>> import numpy as np
    >>> from cmeans import Cmeans
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> cmeans_mdl = Cmeans(data, num_bases=2)
    >>> cmeans_mdl.initialization()
    >>> cmeans_mdl.factorize(niter=10)

    The basis vectors are now stored in cmeans_mdl.W, the coefficients in
    cmeans_mdl.H. To compute coefficients for an existing set of basis
    vectors, simply copy W to cmeans_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5], [1.2]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> cmeans_mdl = Cmeans(data, num_bases=2)
    >>> cmeans_mdl.initialization()
    >>> cmeans_mdl.W = W
    >>> cmeans_mdl.factorize(compute_w=False, niter=50)

    The result is a set of coefficients cmeans_mdl.H, s.t. data = W * cmeans_mdl.H.
    """

    def update_h(self):
        # fuzzy membership update: assign every sample a degree of
        # membership to each centroid based on relative distances
        m = 1.75  # fuzzifier (m > 1); larger values give softer assignments
        tmp_dist = dist.pdist(self.W, self.data, metric='l2') + self._EPS
        self.H[:,:] = 0.0

        # standard FCM update: H[i,:] = 1 / sum_k (d_i / d_k)**(2/(m-1))
        for i in range(self._num_bases):
            for k in range(self._num_bases):
                self.H[i,:] += (tmp_dist[i,:]/tmp_dist[k,:])**(2.0/(m-1))

        self.H = np.where(self.H > 0, 1.0/self.H, 0.0)

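    # Worked example for update_h above (assumed values, not from the
    # original source): for a sample at l2 distances d = [1.0, 3.0] from
    # two centroids, with m = 1.75 (so 2/(m-1) = 8/3), the memberships are
    #   H[0] = 1 / (1 + (1.0/3.0)**(8.0/3.0)) ~ 0.95
    #   H[1] = 1 / (1 + (3.0/1.0)**(8.0/3.0)) ~ 0.05
    # i.e. the closer centroid receives most of the (convex) weight.
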
    def update_w(self):
        # move each centroid to the membership-weighted mean of all samples
        # (the weights H are used directly, not raised to the power m as in
        # textbook FCM)
        for i in range(self._num_bases):
            tmp = (self.H[i:i+1,:] * self.data).sum(axis=1)
            self.W[:,i] = tmp/(self.H[i,:].sum() + self._EPS)
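

# -----------------------------------------------------------------------------
# Demonstration sketch (not part of the original module): a self-contained
# re-implementation of the two updates above in plain NumPy, assuming l2
# distances and the same fuzzifier m = 1.75, so the iteration can be sanity-
# checked without the pymf dist/NMF machinery. Variable names are ad hoc.
if __name__ == "__main__":
    X = np.array([[1.0, 0.0, 2.0],
                  [0.0, 1.0, 1.0]])       # data: 2 dimensions x 3 samples
    W = X[:, :2].copy()                   # init centroids from two samples
    m, eps = 1.75, 10**-8

    for _ in range(10):
        # membership update, as in update_h:
        #   H[i,k] = 1 / sum_j (d[i,k] / d[j,k])**(2/(m-1))
        d = np.sqrt(((W[:, :, None] - X[:, None, :])**2).sum(axis=0)) + eps
        H = 1.0 / ((d[:, None, :] / d[None, :, :])**(2.0/(m - 1))).sum(axis=1)
        # centroid update, as in update_w: membership-weighted mean
        W = np.dot(X, H.T) / (H.sum(axis=1) + eps)

    print("W:")
    print(W)
    print("H (columns sum to 1):")
    print(H)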