#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Archetypal Analysis [1]

AA: class for Archetypal Analysis

[1] Cutler, A. Breiman, L. (1994), "Archetypal Analysis", Technometrics 36(4),
338-347.
"""
|

import numpy as np
from dist import vq
from cvxopt import solvers, base

from svd import pinv
from nmf import NMF

__all__ = ["AA"]

|
25 class AA(NMF):
|
mi@0
|
26 """
|
mi@0
|
27 AA(data, num_bases=4)
|
mi@0
|
28
|
mi@0
|
29 Archetypal Analysis. Factorize a data matrix into two matrices s.t.
|
mi@0
|
30 F = | data - W*H | = | data - data*beta*H| is minimal. H and beta
|
mi@0
|
31 are restricted to convexity (beta >=0, sum(beta, axis=1) = [1 .. 1]).
|
mi@0
|
32 Factorization is solved via an alternating least squares optimization
|
mi@0
|
33 using the quadratic programming solver from cvxopt.
|
mi@0
|
34
|
mi@0
|
35 Parameters
|
mi@0
|
36 ----------
|
mi@0
|
37 data : array_like, shape (_data_dimension, _num_samples)
|
mi@0
|
38 the input data
|
mi@0
|
39 num_bases: int, optional
|
mi@0
|
40 Number of bases to compute (column rank of W and row rank of H).
|
mi@0
|
41 4 (default)
|
mi@0
|
42
|
mi@0
|
43
|
mi@0
|
44 Attributes
|
mi@0
|
45 ----------
|
mi@0
|
46 W : "data_dimension x num_bases" matrix of basis vectors
|
mi@0
|
47 H : "num bases x num_samples" matrix of coefficients
|
mi@0
|
48 beta : "num_bases x num_samples" matrix of basis vector coefficients
|
mi@0
|
49 (for constructing W s.t. W = beta * data.T )
|
mi@0
|
50 ferr : frobenius norm (after calling .factorize())
|
mi@0
|
51
|
mi@0
|
52 Example
|
mi@0
|
53 -------
|
mi@0
|
54 Applying AA to some rather stupid data set:
|
mi@0
|
55
|
mi@0
|
56 >>> import numpy as np
|
mi@0
|
57 >>> from aa import AA
|
mi@0
|
58 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
59
|
mi@0
|
60 Use 2 basis vectors -> W shape(data_dimension, 2).
|
mi@0
|
61
|
mi@0
|
62 >>> aa_mdl = AA(data, num_bases=2)
|
mi@0
|
63
|
mi@0
|
64 Set number of iterations to 5 and start computing the factorization.
|
mi@0
|
65
|
mi@0
|
66 >>> aa_mdl.factorize(niter=5)
|
mi@0
|
67
|
mi@0
|
68 The basis vectors are now stored in aa_mdl.W, the coefficients in aa_mdl.H.
|
mi@0
|
69 To compute coefficients for an existing set of basis vectors simply copy W
|
mi@0
|
70 to aa_mdl.W, and set compute_w to False:
|
mi@0
|
71
|
mi@0
|
72 >>> data = np.array([[1.5], [1.2]])
|
mi@0
|
73 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
|
mi@0
|
74 >>> aa_mdl = AA(data, num_bases=2)
|
mi@0
|
75 >>> aa_mdl.W = W
|
mi@0
|
76 >>> aa_mdl.factorize(niter=5, compute_w=False)
|
mi@0
|
77
|
mi@0
|
78 The result is a set of coefficients aa_mdl.H, s.t. data = W * aa_mdl.H.
|
mi@0
|
79 """
|
mi@0
|
80 # set cvxopt options
|
mi@0
|
81 solvers.options['show_progress'] = False
|
mi@0
|
82
|
mi@0
|
83 def init_h(self):
|
mi@0
|
84 self.H = np.random.random((self._num_bases, self._num_samples))
|
mi@0
|
85 self.H /= self.H.sum(axis=0)
|
mi@0
|
86
|
mi@0
|
87 def init_w(self):
|
mi@0
|
88 self.beta = np.random.random((self._num_bases, self._num_samples))
|
mi@0
|
89 self.beta /= self.beta.sum(axis=0)
|
mi@0
|
90 self.W = np.dot(self.beta, self.data.T).T
|
mi@0
|
91 self.W = np.random.random((self._data_dimension, self._num_bases))
|
mi@0
|
92
|
mi@0
|
93 def update_h(self):
|
mi@0
|
94 """ alternating least squares step, update H under the convexity
|
mi@0
|
95 constraint """
|
mi@0
|
96 def update_single_h(i):
|
mi@0
|
97 """ compute single H[:,i] """
|
mi@0
|
98 # optimize alpha using qp solver from cvxopt
|
mi@0
|
99 FA = base.matrix(np.float64(np.dot(-self.W.T, self.data[:,i])))
|
mi@0
|
100 al = solvers.qp(HA, FA, INQa, INQb, EQa, EQb)
|
mi@0
|
101 self.H[:,i] = np.array(al['x']).reshape((1, self._num_bases))
|
mi@0
|
102
|
mi@0
|
103 EQb = base.matrix(1.0, (1,1))
|
mi@0
|
104 # float64 required for cvxopt
|
mi@0
|
105 HA = base.matrix(np.float64(np.dot(self.W.T, self.W)))
|
mi@0
|
106 INQa = base.matrix(-np.eye(self._num_bases))
|
mi@0
|
107 INQb = base.matrix(0.0, (self._num_bases,1))
|
mi@0
|
108 EQa = base.matrix(1.0, (1, self._num_bases))
|
mi@0
|
109
|
mi@0
|
110 for i in xrange(self._num_samples):
|
mi@0
|
111 update_single_h(i)
|
mi@0
|
112
|
mi@0
|
113 def update_w(self):
|
mi@0
|
114 """ alternating least squares step, update W under the convexity
|
mi@0
|
115 constraint """
|
mi@0
|
116 def update_single_w(i):
|
mi@0
|
117 """ compute single W[:,i] """
|
mi@0
|
118 # optimize beta using qp solver from cvxopt
|
mi@0
|
119 FB = base.matrix(np.float64(np.dot(-self.data.T, W_hat[:,i])))
|
mi@0
|
120 be = solvers.qp(HB, FB, INQa, INQb, EQa, EQb)
|
mi@0
|
121 self.beta[i,:] = np.array(be['x']).reshape((1, self._num_samples))
|
mi@0
|
122
|
mi@0
|
123 # float64 required for cvxopt
|
mi@0
|
124 HB = base.matrix(np.float64(np.dot(self.data[:,:].T, self.data[:,:])))
|
mi@0
|
125 EQb = base.matrix(1.0, (1, 1))
|
mi@0
|
126 W_hat = np.dot(self.data, pinv(self.H))
|
mi@0
|
127 INQa = base.matrix(-np.eye(self._num_samples))
|
mi@0
|
128 INQb = base.matrix(0.0, (self._num_samples, 1))
|
mi@0
|
129 EQa = base.matrix(1.0, (1, self._num_samples))
|
mi@0
|
130
|
mi@0
|
131 for i in xrange(self._num_bases):
|
mi@0
|
132 update_single_w(i)
|
mi@0
|
133
|
mi@0
|
134 self.W = np.dot(self.beta, self.data.T).T
|
mi@0
|
135
|
mi@0
|
136 if __name__ == "__main__":
|
mi@0
|
137 import doctest
|
mi@0
|
138 doctest.testmod()
|