# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from numpy.random import randn  # papca needs randn; "from numpy import *" does not export it
from sklearn import cluster
from sklearn.metrics import pairwise_distances

# Mean squared error between two array-likes
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()

L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])

K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def testing():
    L = matrix([
        [2, 4, 5, 5, 3, 2],
        [2, 3, 4, 5, 4, 3]
    ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))

    # Return the fixtures so callers can actually use them
    return L, K


def kmeans(X, k):
    # Simulate MATLAB's kmeans using sklearn.cluster
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared distances to the centroid,
    # one entry per cluster (MATLAB's sumd)
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Distances from every point to every centroid (MATLAB's D)
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D

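
# A minimal usage sketch for the wrapper above (hypothetical data):
#
#   pts = array([[0., 0.], [0., 1.], [1., 0.], [9., 9.], [9., 8.], [8., 9.]])
#   idx, C, sumd, D = kmeans(pts, 2)
#   # idx: cluster label per point, C: the two centroids,
#   # sumd: within-cluster sums of squares, D: point-to-centroid distances
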

def princomp(A):
    # Adapted from THE GLOWING PYTHON
    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)

    Rxx = matrix(corrcoef(A - M, ddof=1))
    latent, coeff = linalg.eig(Rxx)

    # Sort eigenvalues (and their eigenvectors) in decreasing order
    idx = argsort(latent)
    idx = idx[::-1]
    coeff = coeff[:, idx]
    latent = latent[idx]

    score = matrix(coeff).T * (A - M)  # projection of the data onto the new space
    return matrix(coeff), matrix(score).T, matrix(latent).T


def papca(X):
    """ Horn's method for parallel PCA; returns nComp, the number of
    principal components to use. """

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N random matrices of the same shape as X
    eigs = matrix(zeros((len(lq), N)))
    for k in range(1, N + 1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k - 1] = lrq

    # Adjust the observed eigenvalues by the mean random spectrum and
    # keep the components whose adjusted eigenvalue exceeds 1
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp


def pfa(X, Variability=0.81):

    def choose_n(Lambda, variability=Variability):
        """ Smallest number of (sorted) eigenvalues whose cumulative sum
        reaches the requested fraction of the total variability. """
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            v = suml / sumL
            if v >= variability:
                return i + 1

    X = matrix(X)

    # Standardize the data
    meanX = mean(X)
    stdX = std(X, ddof=1)
    X = (X - meanX) / stdX

    Sigma = corrcoef(X, ddof=1)
    Lambda, A = linalg.eig(Sigma)

    # Sort eigenvalues (and eigenvectors) in decreasing order *before*
    # choosing how many to retain; eig returns them unsorted
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]
    A = matrix(A)
    A = A[:, idx]

    q = choose_n(Lambda, Variability)
    Aq = A[:, 0:q]

    # Cluster the rows of Aq into p >= q clusters
    p = min(Aq.shape[0], q + 2)
    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []
    for i in range(0, p):
        # Find the vectors belonging to cluster i
        vectors_idx = where(labels == i)[0]
        vectors = Aq[vectors_idx, :]

        # Compute the cluster mean
        vectors_mean = mean(vectors, 0)

        # Compute each vector's distance from the mean
        distances_from_mean = pairwise_distances(vectors, vectors_mean)

        # Keep the feature whose row is closest to the cluster mean
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)


def convert_to_pc(data, kernel, q, features):
    """ Expects a PC kernel, the desired number of PCs and an array of
    the kept features, and returns the data projected into the PC
    kernel space. """

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])

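
# A minimal round-trip sketch (hypothetical 'data'; the kernel, q and
# features come from extract_pca_configuration_from_data defined below):
#
#   kernel, q, features = extract_pca_configuration_from_data(data)
#   pcs = convert_to_pc(data, kernel, q, features)
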

def extract_pca_configuration_from_data(data, Variability=0.9):
    """ Expects a k x n matrix of n samples of k features and returns a
    matrix Kernel with the PCs and an array Features of the chosen
    features. """

    print "[II] Selecting salient features"
    features = pfa(data, Variability)
    data_n = data[features, :]

    print "[II] Selected %d features" % len(features)

    print "[II] Calculating principal components coefficients"
    coeff, score, latent = princomp(data_n)

    print "[II] Calculating the principal components"
    pc = coeff.T * data_n

    print "[II] Doing Horn's parallel analysis and setting number of vectors"
    q = papca(data_n)

    print "[II] Selected %d PCs to include" % q

    # Zero out all but the first q principal components (rows are
    # 0-indexed, so the kept PCs are rows 0..q-1)
    pc[q:, :] = 0

    # Reconstruct the data from the truncated PCs and report the error
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print "[II] PCA reconstruction MSE is: %.4f" % mse(test_data, data)

    # Return the kernel, the number of PCs and the array of kept features
    return coeff, q, features


if __name__ == "__main__":
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])
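
    # A quick smoke test of the full pipeline on the toy matrix above;
    # rows are treated as features and columns as samples, matching the
    # conventions of extract_pca_configuration_from_data.
    kernel, q, features = extract_pca_configuration_from_data(matrix(test))
    pcs = convert_to_pc(matrix(test), kernel, q, features)
    print "[II] Kept features:", features
    print "[II] Projected data shape:", pcs.shape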