mi@0
|
1 #!/usr/bin/python
|
mi@0
|
2 #
|
mi@0
|
3 # Copyright (C) Christian Thurau, 2010.
|
mi@0
|
4 # Licensed under the GNU General Public License (GPL).
|
mi@0
|
5 # http://www.gnu.org/licenses/gpl.txt
|
mi@0
|
6 """
|
mi@0
|
7 PyMF Principal Component Analysis.
|
mi@0
|
8
|
mi@0
|
9 PCA: Class for Principal Component Analysis
|
mi@0
|
10 """
|
mi@0
|
11
|
mi@0
|
12
|
mi@0
|
13
|
mi@0
|
14 import numpy as np
|
mi@0
|
15
|
mi@0
|
16 from nmf import NMF
|
mi@0
|
17 from svd import SVD
|
mi@0
|
18
|
mi@0
|
19
|
mi@0
|
20 __all__ = ["PCA"]
|
mi@0
|
21
|
mi@0
|
22 class PCA(NMF):
|
mi@0
|
23 """
|
mi@0
|
24 PCA(data, num_bases=4, center_mean=True)
|
mi@0
|
25
|
mi@0
|
26
|
mi@0
|
27 Archetypal Analysis. Factorize a data matrix into two matrices s.t.
|
mi@0
|
28 F = | data - W*H | is minimal. W is set to the eigenvectors of the
|
mi@0
|
29 data covariance.
|
mi@0
|
30
|
mi@0
|
31 Parameters
|
mi@0
|
32 ----------
|
mi@0
|
33 data : array_like, shape (_data_dimension, _num_samples)
|
mi@0
|
34 the input data
|
mi@0
|
35 num_bases: int, optional
|
mi@0
|
36 Number of bases to compute (column rank of W and row rank of H).
|
mi@0
|
37 4 (default)
|
mi@0
|
38 center_mean: bool, True
|
mi@0
|
39 Make sure that the data is centred around the mean.
|
mi@0
|
40
|
mi@0
|
41 Attributes
|
mi@0
|
42 ----------
|
mi@0
|
43 W : "data_dimension x num_bases" matrix of basis vectors
|
mi@0
|
44 H : "num bases x num_samples" matrix of coefficients
|
mi@0
|
45 ferr : frobenius norm (after calling .factorize())
|
mi@0
|
46
|
mi@0
|
47 Example
|
mi@0
|
48 -------
|
mi@0
|
49 Applying PCA to some rather stupid data set:
|
mi@0
|
50
|
mi@0
|
51 >>> import numpy as np
|
mi@0
|
52 >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
|
mi@0
|
53 >>> pca_mdl = PCA(data, num_bases=2)
|
mi@0
|
54 >>> pca_mdl.factorize()
|
mi@0
|
55
|
mi@0
|
56 The basis vectors are now stored in pca_mdl.W, the coefficients in pca_mdl.H.
|
mi@0
|
57 To compute coefficients for an existing set of basis vectors simply copy W
|
mi@0
|
58 to pca_mdl.W, and set compute_w to False:
|
mi@0
|
59
|
mi@0
|
60 >>> data = np.array([[1.5], [1.2]])
|
mi@0
|
61 >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
|
mi@0
|
62 >>> pca_mdl = PCA(data, num_bases=2)
|
mi@0
|
63 >>> pca_mdl.W = W
|
mi@0
|
64 >>> pca_mdl.factorize(compute_w=False)
|
mi@0
|
65
|
mi@0
|
66 The result is a set of coefficients pca_mdl.H, s.t. data = W * pca_mdl.H.
|
mi@0
|
67 """
|
mi@0
|
68
|
mi@0
|
69 def __init__(self, data, num_bases=0, center_mean=True):
|
mi@0
|
70
|
mi@0
|
71 NMF.__init__(self, data, num_bases=num_bases)
|
mi@0
|
72
|
mi@0
|
73 # center the data around the mean first
|
mi@0
|
74 self._center_mean = center_mean
|
mi@0
|
75
|
mi@0
|
76 if self._center_mean:
|
mi@0
|
77 # copy the data before centering it
|
mi@0
|
78 self._data_orig = data
|
mi@0
|
79 self._meanv = self._data_orig[:,:].mean(axis=1).reshape(data.shape[0],-1)
|
mi@0
|
80 self.data = self._data_orig - self._meanv
|
mi@0
|
81 else:
|
mi@0
|
82 self.data = data
|
mi@0
|
83
|
mi@0
|
84 def init_h(self):
|
mi@0
|
85 pass
|
mi@0
|
86
|
mi@0
|
87 def init_w(self):
|
mi@0
|
88 pass
|
mi@0
|
89
|
mi@0
|
90 def update_h(self):
|
mi@0
|
91 self.H = np.dot(self.W.T, self.data[:,:])
|
mi@0
|
92
|
mi@0
|
93 def update_w(self):
|
mi@0
|
94 # compute eigenvectors and eigenvalues using SVD
|
mi@0
|
95 svd_mdl = SVD(self.data)
|
mi@0
|
96 svd_mdl.factorize()
|
mi@0
|
97
|
mi@0
|
98 # argsort sorts in ascending order -> do reverese indexing
|
mi@0
|
99 # for accesing values in descending order
|
mi@0
|
100 S = np.diag(svd_mdl.S)
|
mi@0
|
101 order = np.argsort(S)[::-1]
|
mi@0
|
102
|
mi@0
|
103 # select only a few eigenvectors ...
|
mi@0
|
104 if self._num_bases >0:
|
mi@0
|
105 order = order[:self._num_bases]
|
mi@0
|
106
|
mi@0
|
107 self.W = svd_mdl.U[:,order]
|
mi@0
|
108 self.eigenvalues = S[order]
|
mi@0
|
109
|
mi@0
|
110 def factorize(self, show_progress=False, compute_w=True, compute_h=True,
|
mi@0
|
111 compute_err=True, niter=1):
|
mi@0
|
112 """ Factorize s.t. WH = data
|
mi@0
|
113
|
mi@0
|
114 Parameters
|
mi@0
|
115 ----------
|
mi@0
|
116 show_progress : bool
|
mi@0
|
117 print some extra information to stdout.
|
mi@0
|
118 compute_h : bool
|
mi@0
|
119 iteratively update values for H.
|
mi@0
|
120 compute_w : bool
|
mi@0
|
121 iteratively update values for W.
|
mi@0
|
122 compute_err : bool
|
mi@0
|
123 compute Frobenius norm |data-WH| after each update and store
|
mi@0
|
124 it to .ferr[k].
|
mi@0
|
125
|
mi@0
|
126 Updated Values
|
mi@0
|
127 --------------
|
mi@0
|
128 .W : updated values for W.
|
mi@0
|
129 .H : updated values for H.
|
mi@0
|
130 .ferr : Frobenius norm |data-WH|.
|
mi@0
|
131 """
|
mi@0
|
132
|
mi@0
|
133 NMF.factorize(self, niter=1, show_progress=show_progress,
|
mi@0
|
134 compute_w=compute_w, compute_h=compute_h,
|
mi@0
|
135 compute_err=compute_err)
|
mi@0
|
136
|
mi@0
|
137
|
mi@0
|
138 if __name__ == "__main__":
|
mi@0
|
139 import doctest
|
mi@0
|
140 doctest.testmod()
|