comparison experiment-reverb/code/pfa.py @ 0:246d5546657c

initial commit, needs cleanup
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 14 Dec 2016 13:15:48 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:246d5546657c
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Apr 20 14:51:02 2015
4
5 @author: Emmanouil Chourdakis
6
7 Principal Factor Analysis
8
9 """
10
11 from numpy import *
12 from sklearn import cluster
13 from sklearn.metrics import pairwise_distances
14
# PEP 8 (E731): a named function instead of a lambda bound to a name.
def mse(A, B):
    """Mean squared error between two equal-shaped array-likes."""
    return ((array(A) - array(B)) ** 2).mean()


# Module-level demo fixtures (the same values are rebuilt as locals in
# testing() below); not referenced by any function in this file.
L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])


K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))
27
28
def testing():
    """Development scratch routine: rebuilds the module-level demo matrices
    L and K as locals and discards them.  No side effects; returns None."""
    demo_loadings = matrix([[2, 4, 5, 5, 3, 2],
                            [2, 3, 4, 5, 4, 3]])

    demo_kernel = matrix(array([
        [ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
        [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
        [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
        [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))
39
40
def kmeans(X, k):
    """Emulate MATLAB's kmeans() via sklearn.cluster.KMeans.

    Returns (idx, C, sumd, D):
      idx  -- cluster label per row of X
      C    -- cluster centers (k x dim)
      sumd -- (k, 1) within-cluster sums of squared deviations
      D    -- distances from every row of X to every center, as a matrix
    """
    model = cluster.KMeans(n_clusters=k)
    model.fit(X)

    centers = model.cluster_centers_
    labels = model.labels_

    # Within-cluster sum of squared deviations from each center.
    sumd = zeros((k, 1))
    for ci in range(k):
        deviations = X[labels == ci, :] - centers[ci, :]
        sumd[ci] = sum(array(deviations) ** 2)

    D = matrix(pairwise_distances(X, centers))

    return labels, centers, sumd, D
61
62
def princomp(A):
    """Principal components of A (variables in rows, observations in columns).

    Works on the eigendecomposition of the correlation matrix of the
    row-mean-centered data.  Returns (coeff, score, latent) as numpy
    matrices with eigenpairs sorted by descending eigenvalue.
    Degenerate single-observation input returns the scalars (1, 0, 0).
    Adapted from THE GLOWING PYTHON.
    """
    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    centered = A - mean(A, axis=1)  # remove each variable's (row's) mean

    # NOTE(review): numpy deprecates/ignores ddof for corrcoef; kept as-is.
    Rxx = matrix(corrcoef(centered, ddof=1))
    latent, coeff = linalg.eig(Rxx)

    # Reorder eigenpairs so eigenvalues are descending.
    order = argsort(latent)[::-1]
    coeff = coeff[:, order]
    latent = latent[order]

    # Project the centered data onto the new basis.
    score = matrix(coeff).T * centered
    return matrix(coeff), matrix(score).T, matrix(latent).T
84
85
86
def papca(X):
    """ Horn's method for parallel PCA, returns nComp: the number of
    principal compoents to use.

    Compares the data eigenvalues against the mean eigenvalues of N
    random matrices of the same shape; components whose adjusted
    eigenvalue exceeds 1 are retained.  Always returns at least 1.
    """
    # BUG FIX: `randn` is not exported by the module's `from numpy import *`
    # (it lives in numpy.random), so the original raised NameError here.
    from numpy.random import randn

    N = 1000  # number of random reference matrices
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # One column of reference eigenvalues per random trial.
    eigs = matrix(zeros((len(lq), N)))
    for k in range(N):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k] = lrq

    lrqbar = mean(eigs, 1)        # mean reference eigenvalue, per component
    ladjq = lq - lrqbar + 1       # Horn-adjusted eigenvalues

    nComp = sum(ladjq > 1)

    # Never return 0 components.
    if nComp == 0:
        return 1
    else:
        return nComp
110
111
112
113
114
def pfa(X, Variability = 0.81):
    """Principal Feature Analysis: pick a subset of salient features.

    X           -- k x n matrix, one feature per row, one sample per column
    Variability -- fraction of total variance the kept PCs must explain

    Returns a sorted array of the kept feature (row) indices: the loading
    vectors are clustered with k-means and the feature closest to each
    cluster mean is retained.
    """

    def choose_n(Lambda, variability = Variability):
        """Smallest count of leading eigenvalues whose cumulative share
        of the total reaches `variability` (expects Lambda sorted
        descending)."""
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            v = suml/sumL
            if v >= variability:
                return i+1

    X = matrix(X)
    N = shape(X)[1]

    # Standardize with the GLOBAL (whole-matrix) mean and std — scalar,
    # not per-feature; kept from the original implementation.
    meanX = mean(X)
    stdX = std(X, ddof=1)

    X = X - meanX
    X = X / stdX

    Sigma = corrcoef(X, ddof=1)

    # BUG FIX: bare `eig` is not exported by `from numpy import *`
    # (it lives in numpy.linalg), so the original raised NameError here.
    Lambda, A = linalg.eig(Sigma)

    # Sort eigenpairs by descending eigenvalue.
    # BUG FIX: the original chose q from the UNSORTED eigenvalues; the
    # cumulative-variability criterion only makes sense on sorted ones.
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:,idx]

    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    # Cluster the q-dimensional loading vectors into p = q+2 clusters,
    # capped by the number of features.
    p = q + 2
    p = min(Aq.shape[0], p)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0,p):
        # Indices of the feature vectors assigned to cluster i.
        # BUG FIX: `find` is a matplotlib.pylab helper, not numpy; the
        # original raised NameError.  where()[0] is the numpy equivalent.
        vectors_idx = where(labels == i)[0]
        vectors = Aq[vectors_idx,:]
        # Compute mean
        vectors_mean = mean(vectors, 0)

        # Compute distance from mean
        distances_from_mean = pairwise_distances(vectors, vectors_mean)

        # Keep the feature whose loading vector is closest to the mean.
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0,0]))

    return sort(tokeep)
187
188
def convert_to_pc(data, kernel, q, features):
    """Project `data`, restricted to the kept `features` rows, into the
    principal-component space spanned by `kernel`, and return the first
    q components as a matrix."""
    selected = data[features, :]
    projected = kernel.T * selected
    return matrix(projected[0:q, :])
200
201 def extract_pca_configuration_from_data(data, Variability = 0.9):
202 """ Expects a k x n Matrix of n samples of k features and returns
203 a matrix Kernel with the PCs and an array Features of the chosen features"""
204
205 print "[II] Selecting salient features"
206 features = pfa(data);
207 data_n = data[features, :]
208 #data_n = data
209
210
211
212 print "[II] Selected %d features" % len(features)
213
214 print "[II] Calculating principal components coefficients"
215 coeff, score, latent = princomp(data_n)
216
217 print "[II] Calculating the principal components"
218 pc = coeff.T*data_n
219
220 print "[II] Doing horn's Parallel Analysis and setting number of vectors"
221 q = papca(data_n)
222
223 print "[II] Selected %d PCs to include" % q
224
225 pc[q+1:, :] = 0
226
227 test_data = zeros((data.shape[0], data.shape[1]))
228 test_data[features, :] = coeff*pc
229
230 #test_data = coeff*pc
231
232
233 print test_data
234 print ","
235 print data
236
237 print "[II] PCA Reconstruction MSE is: %.4f" % mse(test_data, data)
238
239 # Extract Kernel, number of PCs and array of features
240 return coeff, q, features
241
242
243
244
245
246
247
248
if __name__=="__main__":
    # NOTE(review): `sys` is imported but never used in the visible code —
    # presumably left over from a removed demo; confirm before deleting.
    import sys

    # Demo fixture: 4 features x 4 samples.  Nothing below builds on it in
    # this revision — the __main__ block is an unfinished stub.
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9,10,11,12],
        [8, 9, 7, 6]
    ])
258
259
260
261