comparison experiment-reverb/code/pca.py @ 0:246d5546657c

initial commit, needs cleanup
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Wed, 14 Dec 2016 13:15:48 +0000
parents
children 144fbd1d29c3
comparison
equal deleted inserted replaced
-1:000000000000 0:246d5546657c
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Apr 20 14:51:02 2015
4
5 @author: Emmanouil Chourdakis
6
7 Principal Factor Analisys
8
9 """
10
11 from numpy import *
12 from sklearn import cluster
13 from sklearn.metrics import pairwise_distances
14 from numpy.linalg.linalg import eig
15 from matplotlib.mlab import find
16 from numpy.random import randn
17 from empca import *
18
19 mse = lambda A,B: ((array(A)-array(B)) ** 2).mean()
20
21 L = matrix([
22 [2,4,5,5,3,2],
23 [2,3,4,5,4,3]
24 ])
25
26
27 K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153, 0.1000316 ],
28 [-0.16114828, -0.70320328, 0.25243464, -0.10076863],
29 [ 1.10255852, 0.41740569, 1.14947174, -0.95764054],
30 [ 0.31158905, -0.37584351, 1.65328982, 0.1987012 ]]))
31
32
33 def testing():
34 L = matrix([
35 [2,4,5,5,3,2],
36 [2,3,4,5,4,3]
37 ])
38
39 K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153, 0.1000316 ],
40 [-0.16114828, -0.70320328, 0.25243464, -0.10076863],
41 [ 1.10255852, 0.41740569, 1.14947174, -0.95764054],
42 [ 0.31158905, -0.37584351, 1.65328982, 0.1987012 ]]))
43
44
45 def kmeans(X, k):
46 # simulate matlab's kmeans using python's sklearn.cluster
47 k_means = cluster.KMeans(n_clusters=k)
48 k_means.fit(X)
49
50 C = k_means.cluster_centers_
51 idx = k_means.labels_
52
53
54 # Create inter-cluster distance sums
55 sumd = zeros((k,1))
56 for i in range(0, k):
57 S = X[idx == i, :] - C[i, :]
58 sumd[i] = sum(array(S)**2)
59
60 # sumd = sum(array(S)**2)
61
62 D = matrix(pairwise_distances(X, C))
63
64 return idx,C,sumd,D
65
66
67 def princomp(A):
68 # Adapted from THE GLOWING PYTHON
69
70 if A.shape[1] == 1:
71 return 1, 0, 0
72
73 A = matrix(A)
74 M = mean(A, axis=1)
75
76 Rxx = matrix(corrcoef(A - M, ddof=1))
77 # Rxx = matrix(cov(A-M, ddof=1))
78 #print A-M
79 latent,coeff = linalg.eig(Rxx)
80 p = size(coeff,axis=1)
81 idx = argsort(latent)
82 idx = idx[::-1]
83
84
85 coeff = coeff[:,idx]
86 latent = latent[idx] # sorting eigenvalues
87 score = matrix(coeff).T * (A-M)# projection of the data in the new space
88 return matrix(coeff),matrix(score).T,matrix(latent).T
89
90
91
92
93 def papca(X):
94 """ Horn's method for parallel PCA, returns nComp: the number of
95 principal compoents to use."""
96
97 N = 1000
98 X = matrix(X)
99 coeff, score, lq = princomp(X)
100
101 eigs = matrix(zeros((len(lq), N)))
102
103 for k in range(1, N+1):
104 coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
105 eigs[:,k-1] = lrq
106
107 lrqbar = mean(eigs, 1)
108 ladjq = lq - lrqbar + 1
109
110 nComp = sum(ladjq > 1)
111
112 if nComp == 0:
113 return 1
114 else:
115 return nComp
116
117
118
119
120
121 def pfa(X, Variability = 0.81):
122
123 def choose_n(Lambda, variability = Variability):
124 """ Choose for variability """
125 sumL = sum(Lambda)
126 suml = 0
127
128 for i in range(0, len(Lambda)):
129 suml += Lambda[i]
130 v = suml/sumL
131 if v >= variability:
132 return i+1
133
134 # print X
135 X = matrix(X)
136 # print X
137 N = shape(X)[1]
138
139 meanX = mean(X)
140 stdX = std(X, ddof=1)
141
142 X = X - meanX
143 X = X / stdX
144
145
146 Sigma = corrcoef(X, ddof=1)
147
148 Lambda, A = eig(Sigma)
149
150
151 q = choose_n(Lambda, Variability)
152
153 # Sort Eigenvalues
154
155 idx = argsort(Lambda)[::-1]
156
157 Lambda = Lambda[idx]
158
159 A = matrix(A)
160 A = A[:,idx]
161
162 Aq = A[:, 0:q]
163
164 p = q + 2
165
166 p = min(Aq.shape[0], p)
167
168 # Todo K _means
169
170
171 labels, centers, sumd, D = kmeans(Aq, p)
172
173 tokeep = []
174
175 for i in range(0,p):
176 # Find vectors in clusters
177 vectors_idx = find(labels==i)
178 vectors = Aq[vectors_idx,:]
179 # Compute mean
180 vectors_mean = mean(vectors, 0)
181
182 # Compute distance from mean
183 distances_from_mean = pairwise_distances(vectors, vectors_mean)
184
185 # Get the vector with the minimum distance
186 sorted_idx = argsort(distances_from_mean.T)
187 closest_idx = vectors_idx[sorted_idx]
188 tokeep.append(int(closest_idx[0,0]))
189
190
191
192 return sort(tokeep)
193
194
195 def convert_to_pc(data, kernel, q, features):
196 """ Expects a PC kernel, the desired number
197 of PCs and an array of the kept features
198 and returns the transformed data projected
199 into the PC kernel space"""
200
201
202 data_n = data[features, :]
203 pc = kernel.T*data_n
204
205 return matrix(pc[0:q, :])
206
207 def extract_pca_configuration_from_data(data, Variability = 0.9):
208 """ Expects a k x n Matrix of n samples of k features and returns
209 a matrix Kernel with the PCs and an array Features of the chosen features"""
210
211 print "[II] Selecting salient features"
212 features = pfa(data);
213 data_n = data[features, :]
214 #data_n = data
215
216
217
218 print "[II] Selected %d features" % len(features)
219
220 print "[II] Calculating principal components coefficients"
221 coeff, score, latent = princomp(data_n)
222
223 print "[II] Calculating the principal components"
224 pc = coeff.T*data_n
225
226 print "[II] Doing horn's Parallel Analysis and setting number of vectors"
227 q = papca(data_n)
228
229 print "[II] Selected %d PCs to include" % q
230
231 if q < pc.shape[0]:
232 pc[q+1:, :] = 0
233
234 test_data = zeros((data.shape[0], data.shape[1]))
235 test_data[features, :] = coeff*pc
236
237 #test_data = coeff*pc
238
239
240 # print test_data
241 # print ","
242 # print data
243
244 print "[II] PCA Reconstruction MSE is: %.4f" % mse(test_data[features,:], data_n)
245
246 # Extract Kernel, number of PCs and array of features
247 return coeff, q, features
248
249
250
251
252
253
254
255
256 if __name__=="__main__":
257 import sys
258
259 test = array([
260 [1., 2, 3, 4],
261 [5, 6, 7, 8],
262 [9,10,11,12],
263 [8, 9, 7, 6]
264 ])