# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from sklearn import cluster
from sklearn.metrics import pairwise_distances
from numpy.linalg import eig
from numpy.random import randn
#from empca import *

# matplotlib.mlab.find has been removed from matplotlib; this is the
# equivalent NumPy one-liner (indices where a condition holds)
find = lambda condition: nonzero(ravel(condition))[0]

# Mean squared error between two equally-shaped array-likes
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()

L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])

K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def testing():
    # Placeholder test fixture: rebuilds the example matrices locally
    # but performs no checks
    L = matrix([
        [2, 4, 5, 5, 3, 2],
        [2, 3, 4, 5, 4, 3]
    ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def kmeans(X, k):
    # Simulate MATLAB's kmeans using sklearn.cluster
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared point-to-centroid distances
    # (MATLAB's "sumd")
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Distances from every point to every centroid (MATLAB's "D")
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D


def princomp(A):
    # Adapted from THE GLOWING PYTHON

    # Degenerate case: a single sample has no meaningful components
    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)  # per-feature (row) means

    # Correlation matrix of the centred data (corrcoef normalises away
    # any ddof scaling, so none is passed)
    Rxx = matrix(corrcoef(A - M))
    # Rxx = matrix(cov(A - M, ddof=1))
    latent, coeff = linalg.eig(Rxx)

    # Sort eigenvalues (and their eigenvectors) in descending order
    idx = argsort(latent)[::-1]
    coeff = coeff[:, idx]
    latent = latent[idx]

    score = matrix(coeff).T * (A - M)  # projection of the data onto the new basis
    return matrix(coeff), matrix(score).T, matrix(latent).T
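
# Illustrative sanity check (a sketch using the example matrix K above):
# the eigenvector matrix returned by princomp is orthonormal, so projecting
# and back-projecting the centred data reconstructs it almost exactly:
#
#     coeff, score, latent = princomp(K)
#     print mse(coeff * score.T + mean(K, axis=1), K)   # ~0.0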


def papca(X):
    """ Horn's method for parallel PCA; returns nComp, the number of
        principal components to use. """

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N pure-noise data sets of the same shape
    eigs = matrix(zeros((len(lq), N)))

    for k in range(0, N):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k] = lrq

    # Keep components whose eigenvalue exceeds the mean noise eigenvalue
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp
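
# Illustrative check (a sketch, not a definitive test): for pure noise the
# adjusted eigenvalues hover around 1, so Horn's criterion should retain
# only a handful of components:
#
#     print papca(randn(5, 200))   # typically a small number, often 1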


def pfa(X, Variability=0.81):

    def choose_n(Lambda, variability=Variability):
        """ Choose the smallest number of eigenvalues that explains
            the requested fraction of the total variability """
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            v = suml / sumL
            if v >= variability:
                return i + 1

    X = matrix(X)

    # Standardise the data (corrcoef below is scale-invariant, but this
    # keeps the intermediate values well-behaved)
    meanX = mean(X)
    stdX = std(X, ddof=1)
    X = (X - meanX) / stdX

    Sigma = corrcoef(X)

    Lambda, A = eig(Sigma)

    # Sort the eigenvalues (and their eigenvectors) in descending order
    # before choosing q, so the variability criterion sees the dominant
    # eigenvalues first
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]
    A = matrix(A)[:, idx]

    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    # Cluster the rows of Aq into p >= q clusters (capped at the number
    # of features)
    p = min(Aq.shape[0], q + 2)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0, p):
        # Find the rows of Aq that fall in cluster i
        vectors_idx = find(labels == i)
        vectors = Aq[vectors_idx, :]

        # Compute the cluster mean
        vectors_mean = mean(vectors, 0)

        # Compute each row's distance from the mean
        distances_from_mean = pairwise_distances(vectors, vectors_mean)

        # Keep the feature whose row lies closest to the cluster mean
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)
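
# Illustrative usage (a sketch using the example matrix K defined above):
#
#     print pfa(K)   # sorted indices of the features PFA keeps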


def convert_to_pc(data, kernel, q, features):
    """ Expects a PC kernel, the desired number
        of PCs and an array of the kept features,
        and returns the data projected into the
        PC kernel space """

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])


def extract_pca_configuration_from_data(data, Variability=0.9):
    """ Expects a k x n matrix of n samples of k features and returns
        a matrix Kernel with the PCs and an array Features of the chosen
        features """

    print "[II] Selecting salient features"
    features = pfa(data, Variability)
    data_n = data[features, :]
    #data_n = data

    print "[II] Selected %d features" % len(features)

    print "[II] Calculating principal components coefficients"
    coeff, score, latent = princomp(data_n)

    print "[II] Calculating the principal components"
    pc = coeff.T * data_n

    print "[II] Doing Horn's parallel analysis and setting number of vectors"
    q = papca(data_n)

    print "[II] Selected %d PCs to include" % q

    # Zero out everything but the first q principal components
    if q < pc.shape[0]:
        pc[q:, :] = 0

    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print "[II] PCA Reconstruction MSE is: %.4f" % mse(test_data[features, :], data_n)

    # Return the kernel, the number of PCs and the array of kept features
    return coeff, q, features


if __name__ == "__main__":
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])
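
    # A minimal usage sketch (an illustrative addition, assuming `test` is
    # laid out as 4 features x 4 samples):
    coeff, q, features = extract_pca_configuration_from_data(test)
    pc = convert_to_pc(matrix(test), coeff, q, features)
    print "[II] Kept features:", features
    print "[II] Projected data shape:", pc.shape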