experiment-reverb/code/pfa.py @ 0:246d5546657c
initial commit, needs cleanup
author:   Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date:     Wed, 14 Dec 2016 13:15:48 +0000
parents:  (none)
children: (none)
comparing -1:000000000000 with 0:246d5546657c
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from numpy.random import randn
from sklearn import cluster
from sklearn.metrics import pairwise_distances

# Mean squared error between two equally-shaped arrays
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()

L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])


K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def testing():
    # Placeholder test fixture mirroring the module-level L and K
    L = matrix([
        [2, 4, 5, 5, 3, 2],
        [2, 3, 4, 5, 4, 3]
    ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))

def kmeans(X, k):
    # Simulate MATLAB's kmeans using sklearn.cluster.KMeans
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared distances, one entry per cluster
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Distance of every point to every centroid
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D

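# Example (hypothetical usage; the labels depend on the random
# initialization of KMeans):
#
#     idx, C, sumd, D = kmeans(K, 2)
#     print idx      # one cluster label per row of K
#     print sumd     # within-cluster sum of squared distances per cluster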

def princomp(A):
    # Adapted from THE GLOWING PYTHON

    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)    # per-row (per-variable) mean

    Rxx = matrix(corrcoef(A - M, ddof=1))
    latent, coeff = linalg.eig(Rxx)
    idx = argsort(latent)
    idx = idx[::-1]        # eigenvalue indices, largest first

    coeff = coeff[:, idx]
    latent = latent[idx]                 # sorted eigenvalues
    score = matrix(coeff).T * (A - M)    # projection of the data onto the new basis
    return matrix(coeff), matrix(score).T, matrix(latent).T

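# princomp mirrors MATLAB's princomp: the coefficients are the eigenvectors
# of the correlation matrix of the (row-wise centred) data, sorted by
# descending eigenvalue, and the scores are the data projected onto them.
# A quick sanity check (hypothetical, using the module-level matrix K):
#
#     coeff, score, latent = princomp(K)
#     print latent   # descending; should sum to the number of rows of K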


def papca(X):
    """ Horn's method for parallel PCA; returns nComp, the number of
    principal components to use."""

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    eigs = matrix(zeros((len(lq), N)))

    # Eigenvalues of N Gaussian noise datasets with the same shape as X
    for k in range(1, N + 1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k - 1] = lrq

    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1    # Horn's adjusted eigenvalues

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp

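# Horn's parallel analysis in brief: the eigenvalues of the observed
# correlation matrix are compared against the average eigenvalues obtained
# from N same-shaped Gaussian noise datasets; components whose adjusted
# eigenvalue (observed - mean noise eigenvalue + 1) stays above 1 are kept.
# Hypothetical usage:
#
#     q = papca(K)   # number of PCs to keep for the module-level K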


def pfa(X, Variability=0.81):

    def choose_n(Lambda, variability=Variability):
        """ Choose the smallest number of (descending) eigenvalues that
        explain the requested fraction of the total variability. """
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            if suml / sumL >= variability:
                return i + 1

    X = matrix(X)

    # Standardize
    meanX = mean(X)
    stdX = std(X, ddof=1)

    X = X - meanX
    X = X / stdX

    Sigma = corrcoef(X, ddof=1)

    Lambda, A = linalg.eig(Sigma)

    # Sort eigenvalues (and the corresponding eigenvectors) in descending
    # order before choosing how many to keep
    idx = argsort(Lambda)[::-1]

    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:, idx]

    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    # Cluster the rows of Aq into p = q + 2 clusters, capped at the number
    # of available rows
    p = q + 2
    p = min(Aq.shape[0], p)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0, p):
        # Find the vectors in cluster i
        vectors_idx = where(labels == i)[0]
        vectors = Aq[vectors_idx, :]
        # Compute their mean
        vectors_mean = mean(vectors, 0)

        # Compute each vector's distance from the mean
        distances_from_mean = pairwise_distances(vectors, vectors_mean)

        # Keep the vector with the minimum distance
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)

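# Hypothetical usage: pick the salient rows (features) of a feature-by-
# sample matrix so that the kept eigenvalues explain at least 81% of the
# total variability; pfa returns the sorted indices of the kept rows.
#
#     features = pfa(K)
#     K_reduced = K[features, :]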

def convert_to_pc(data, kernel, q, features):
    """ Expects a PC kernel, the desired number of PCs, and an array of
    the kept features; returns the data projected into the PC kernel
    space. """

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])

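# Hypothetical round trip with extract_pca_configuration_from_data below
# (new_data is a placeholder for another k x n matrix with the same
# features as the training data):
#
#     kernel, q, features = extract_pca_configuration_from_data(data)
#     pcs = convert_to_pc(new_data, kernel, q, features)
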
def extract_pca_configuration_from_data(data, Variability=0.9):
    """ Expects a k x n matrix of n samples of k features and returns a
    matrix Kernel with the PCs and an array Features of the chosen
    features. """

    print "[II] Selecting salient features"
    features = pfa(data, Variability)
    data_n = data[features, :]

    print "[II] Selected %d features" % len(features)

    print "[II] Calculating principal components coefficients"
    coeff, score, latent = princomp(data_n)

    print "[II] Calculating the principal components"
    pc = coeff.T * data_n

    print "[II] Doing Horn's parallel analysis and setting number of vectors"
    q = papca(data_n)

    print "[II] Selected %d PCs to include" % q

    # Zero out everything beyond the first q principal components
    pc[q:, :] = 0

    # Reconstruct the data from the truncated PCs to gauge the loss
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print "[II] PCA Reconstruction MSE is: %.4f" % mse(test_data, data)

    # Return the kernel, the number of PCs, and the array of features
    return coeff, q, features


if __name__ == "__main__":
    # Smoke test: run the full pipeline on a small 4 x 4 example
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])

    kernel, q, features = extract_pca_configuration_from_data(matrix(test))
    print "[II] Kept features:", features
    print "[II] Number of PCs: %d" % q