experiment-reverb/code/pca.py @ 0:246d5546657c

initial commit, needs cleanup

author:   Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date:     Wed, 14 Dec 2016 13:15:48 +0000
parents:  (none)
children: 144fbd1d29c3
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from sklearn import cluster
from sklearn.metrics import pairwise_distances
from numpy.linalg import eig
from matplotlib.mlab import find
from numpy.random import randn
from empca import *

# Mean squared error between two equally-shaped arrays/matrices.
mse = lambda A,B: ((array(A)-array(B)) ** 2).mean()
# Small fixtures used for quick manual testing.
L = matrix([
    [2,4,5,5,3,2],
    [2,3,4,5,4,3]
    ])


K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))

def testing():
    # Leftover manual-test stub: defines local copies of the fixtures
    # above and does nothing with them.
    L = matrix([
        [2,4,5,5,3,2],
        [2,3,4,5,4,3]
        ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))

def kmeans(X, k):
    # Simulate MATLAB's kmeans using sklearn.cluster.KMeans.
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared point-to-centroid distances
    # (MATLAB's sumd output).
    sumd = zeros((k,1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S)**2)

    # Distances from every point to every centroid.
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D

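# Illustrative sketch (not in the original file): how kmeans() above can be
# called, mirroring MATLAB's [idx, C, sumd, D] = kmeans(...). The points are
# made up for demonstration.
def _example_kmeans():
    pts = array([[0., 0.], [0.1, 0.2], [5., 5.], [5.1, 4.9]])
    idx, C, sumd, D = kmeans(pts, 2)
    # idx: a label per row; C: 2x2 centroids; sumd: within-cluster
    # squared distances; D: 4x2 point-to-centroid distances.
    print idx, C, sumd, D
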
def princomp(A):
    """ Principal component analysis on the rows of A.
    Adapted from THE GLOWING PYTHON. """

    # Degenerate case: a single observation has no meaningful components.
    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)

    # Correlation matrix of the row-centred data.
    Rxx = matrix(corrcoef(A - M, ddof=1))
    # Rxx = matrix(cov(A-M, ddof=1))
    latent,coeff = linalg.eig(Rxx)
    idx = argsort(latent)
    idx = idx[::-1]

    coeff = coeff[:,idx]
    latent = latent[idx] # sorting eigenvalues
    score = matrix(coeff).T * (A-M) # projection of the data in the new space
    return matrix(coeff),matrix(score).T,matrix(latent).T

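# Illustrative sketch (not in the original file): princomp() on the fixture K.
# coeff holds the eigenvectors of the correlation matrix, score the projected
# data, latent the eigenvalues in descending order.
def _example_princomp():
    coeff, score, latent = princomp(K)
    print latent
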
def papca(X):
    """ Horn's method for parallel PCA; returns nComp, the number of
    principal components to use."""

    # Number of random reference datasets.
    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    eigs = matrix(zeros((len(lq), N)))

    # Eigenvalue spectra of same-shaped random (Gaussian) data.
    for k in range(1, N+1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:,k-1] = lrq

    lrqbar = mean(eigs, 1)
    # Adjust the observed eigenvalues by the random baseline and keep
    # those still above 1.
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp

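# Illustrative sketch (not in the original file): on pure noise, Horn's
# parallel analysis should retain very few components.
def _example_papca():
    print papca(randn(4, 100))
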
def pfa(X, Variability = 0.81):
    """ Principal feature analysis: returns the (sorted) indices of the
    features to keep so that the retained components explain at least
    `Variability` of the total variance."""

    def choose_n(Lambda, variability = Variability):
        """ Smallest number of leading eigenvalues whose cumulative
        share of the total reaches `variability`."""
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            v = suml/sumL
            if v >= variability:
                return i+1

    X = matrix(X)

    # Standardize using the overall (scalar) mean and standard deviation.
    meanX = mean(X)
    stdX = std(X, ddof=1)

    X = X - meanX
    X = X / stdX

    Sigma = corrcoef(X, ddof=1)

    Lambda, A = eig(Sigma)

    # Sort eigenvalues (and their eigenvectors) in descending order
    # before deciding how many to keep.
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:,idx]

    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    p = q + 2
    p = min(Aq.shape[0], p)

    # Cluster the rows of Aq; for each cluster keep the feature whose
    # row is closest to the cluster mean.
    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0,p):
        # Find vectors in clusters
        vectors_idx = find(labels==i)
        vectors = Aq[vectors_idx,:]
        # Compute mean
        vectors_mean = mean(vectors, 0)

        # Compute distance from mean
        distances_from_mean = pairwise_distances(vectors, vectors_mean)

        # Get the vector with the minimum distance
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0,0]))

    return sort(tokeep)

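# Illustrative sketch (not in the original file): selecting the salient rows
# (features) of the fixture K with principal feature analysis.
def _example_pfa():
    print pfa(K, Variability=0.9)
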
def convert_to_pc(data, kernel, q, features):
    """ Expects a PC kernel, the desired number of PCs, and an array of
    the kept features; returns the data projected into the PC kernel
    space."""

    # Keep only the selected feature rows, then project.
    data_n = data[features, :]
    pc = kernel.T*data_n

    return matrix(pc[0:q, :])

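# Illustrative sketch (not in the original file): learning a PC configuration
# from the fixture K and projecting K into the resulting kernel space.
def _example_convert_to_pc():
    kernel, q, features = extract_pca_configuration_from_data(K)
    print convert_to_pc(K, kernel, q, features)
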
def extract_pca_configuration_from_data(data, Variability = 0.9):
    """ Expects a k x n matrix of n samples of k features and returns a
    matrix Kernel with the PCs, the number of PCs to use, and an array
    Features of the chosen features."""

    print "[II] Selecting salient features"
    features = pfa(data)
    data_n = data[features, :]

    print "[II] Selected %d features" % len(features)

    print "[II] Calculating principal component coefficients"
    coeff, score, latent = princomp(data_n)

    print "[II] Calculating the principal components"
    pc = coeff.T*data_n

    print "[II] Doing Horn's parallel analysis to set the number of components"
    q = papca(data_n)

    print "[II] Selected %d PCs to include" % q

    # Keep only the first q principal components (rows are 0-indexed,
    # so zero out from row q onwards).
    if q < pc.shape[0]:
        pc[q:, :] = 0

    # Reconstruct the data from the truncated components to gauge the
    # reconstruction error.
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff*pc

    print "[II] PCA Reconstruction MSE is: %.4f" % mse(test_data[features,:], data_n)

    # Extract kernel, number of PCs and array of features
    return coeff, q, features


if __name__=="__main__":
    # Small integration-test matrix (4 features x 4 samples).
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9,10,11,12],
        [8, 9, 7, 6]
        ])
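
    # Illustrative demo (not in the original file): run the full pipeline on
    # the small test matrix above -- select salient features, build the PC
    # kernel and project the data into it.
    kernel, q, features = extract_pca_configuration_from_data(matrix(test))
    print "[II] Kept features:", features
    print "[II] Projected data:"
    print convert_to_pc(matrix(test), kernel, q, features)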