annotate experiment-reverb/code/pca.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 30 Sep 2017 13:25:50 +0100
parents 144fbd1d29c3
children
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from sklearn import cluster
from sklearn.metrics import pairwise_distances
from numpy.linalg import eig
from numpy.random import randn
# Note: matplotlib.mlab.find was removed upstream; numpy's flatnonzero
# is used below instead.
#from empca import *

# Mean squared error between two equally-shaped arrays.
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()

L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])


K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def testing():
    L = matrix([
        [2, 4, 5, 5, 3, 2],
        [2, 3, 4, 5, 4, 3]
    ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def kmeans(X, k):
    # Simulate MATLAB's kmeans using sklearn.cluster.KMeans.
    # sklearn rejects numpy.matrix input, so convert at the boundary.
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(asarray(X))

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared distances to each centroid.
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Distances from every sample to every centroid.
    D = matrix(pairwise_distances(asarray(X), C))

    return idx, C, sumd, D

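
# --- Usage sketch (not part of the original file) --------------------------
# A minimal, hypothetical demonstration of kmeans() on random data;
# _demo_kmeans and its data are illustrative assumptions only.
def _demo_kmeans():
    X = matrix(randn(20, 2))           # 20 random 2-D samples (made up)
    idx, C, sumd, D = kmeans(X, 3)     # cluster into k = 3 groups
    print("labels:", idx)
    print("within-cluster sums of squares:", sumd.ravel())
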

def princomp(A):
    # Principal components via the correlation matrix.
    # Adapted from THE GLOWING PYTHON.

    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)

    # Correlation is scale-invariant, so corrcoef needs no ddof argument.
    Rxx = matrix(corrcoef(A - M))
    # Rxx = matrix(cov(A - M, ddof=1))  # covariance-based alternative
    latent, coeff = linalg.eig(Rxx)
    p = size(coeff, axis=1)
    idx = argsort(latent)
    idx = idx[::-1]

    coeff = coeff[:, idx]
    latent = latent[idx]                # sorted eigenvalues
    score = matrix(coeff).T * (A - M)   # projection of the data onto the new basis
    return matrix(coeff), matrix(score).T, matrix(latent).T

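# --- Usage sketch (not part of the original file) --------------------------
# princomp() expects variables in rows and observations in columns (that is
# what mean(A, axis=1) implies); this hypothetical demo just checks shapes.
def _demo_princomp():
    A = matrix(randn(3, 50))                # 3 variables, 50 observations (made up)
    coeff, score, latent = princomp(A)
    print("sorted eigenvalues:", latent.ravel())
    print("loadings shape:", coeff.shape)   # one column per component
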

def papca(X):
    """ Horn's method for parallel PCA; returns nComp, the number of
    principal components to retain."""

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N random matrices with the same shape as X.
    eigs = matrix(zeros((len(lq), N)))
    for k in range(N):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k] = lrq

    # Keep components whose noise-adjusted eigenvalue exceeds 1.
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp

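# --- Usage sketch (not part of the original file) --------------------------
# An assumed sanity check of Horn's parallel analysis: on pure white noise
# the number of retained components should typically come out as 1.
def _demo_papca():
    noise = randn(5, 200)       # 5 features, 200 noise samples (made up)
    print("components kept for noise:", papca(noise))
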

def pfa(X, Variability=0.81):
    """ Principal feature analysis: cluster the rows of the leading
    eigenvector matrix and keep, per cluster, the feature closest to the
    cluster mean. Returns the sorted indices of the kept features."""

    def choose_n(Lambda, variability=Variability):
        """ Choose how many eigenvalues are needed to reach `variability`. """
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            v = suml / sumL
            if v >= variability:
                return i + 1

    X = matrix(X)
    N = shape(X)[1]

    # Standardise globally (corrcoef below re-standardises per feature anyway).
    meanX = mean(X)
    stdX = std(X, ddof=1)

    X = X - meanX
    X = X / stdX

    Sigma = corrcoef(X)

    Lambda, A = eig(Sigma)

    # Sort eigenvalues in descending order *before* choosing how many to
    # keep; eig() returns them in no particular order.
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:, idx]

    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    p = q + 2
    p = min(Aq.shape[0], p)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0, p):
        # Find the feature vectors assigned to cluster i.
        vectors_idx = flatnonzero(labels == i)
        vectors = Aq[vectors_idx, :]

        # Compute the cluster mean and each vector's distance from it.
        vectors_mean = mean(vectors, 0)
        distances_from_mean = pairwise_distances(asarray(vectors),
                                                 asarray(vectors_mean))

        # Keep the feature whose vector is closest to the mean.
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)

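# --- Usage sketch (not part of the original file) --------------------------
# Hypothetical pfa() demo: four feature rows, two of which nearly duplicate
# the other two, so the selection should favour non-redundant features.
def _demo_pfa():
    base = randn(2, 100)
    X = vstack([base, base + 0.01 * randn(2, 100)])   # 4 features, 2 distinct
    print("kept feature indices:", pfa(X))
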

def convert_to_pc(data, kernel, q, features):
    """ Expects a PC kernel, the desired number of PCs, and an array of
    the kept features; returns the data projected into the PC kernel
    space. """

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])

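# --- Usage sketch (not part of the original file) --------------------------
# Assumed illustration of convert_to_pc(): build a kernel with princomp()
# on a subset of features, then project the data onto the first two PCs.
def _demo_convert_to_pc():
    data = matrix(randn(5, 30))          # 5 features, 30 samples (made up)
    features = array([0, 2, 4])          # pretend pfa() kept these rows
    kernel, score, latent = princomp(data[features, :])
    pc = convert_to_pc(data, kernel, 2, features)
    print("projected shape:", pc.shape)  # (2, 30)
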

def extract_pca_configuration_from_data(data, Variability=0.9):
    """ Expects a k x n matrix of n samples of k features; returns a
    kernel matrix with the PCs, the number q of PCs to keep, and an
    array of the chosen features."""

    print("[II] Selecting salient features")
    features = pfa(data)
    data_n = data[features, :]

    print("[II] Selected %d features" % len(features))

    print("[II] Calculating principal component coefficients")
    coeff, score, latent = princomp(data_n)

    print("[II] Calculating the principal components")
    pc = coeff.T * data_n

    print("[II] Doing Horn's parallel analysis to set the number of PCs")
    q = papca(data_n)

    print("[II] Selected %d PCs to include" % q)

    # Zero out all but the first q components (0-indexed, hence pc[q:]).
    if q < pc.shape[0]:
        pc[q:, :] = 0

    # Reconstruct the kept features from the truncated components to
    # gauge the reconstruction error.
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print("[II] PCA reconstruction MSE is: %.4f" % mse(test_data[features, :], data_n))

    # Return the kernel, the number of PCs, and the array of kept features.
    return coeff, q, features

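# --- Usage sketch (not part of the original file) --------------------------
# End-to-end assumption of how the pieces fit together: learn the kernel,
# PC count, and feature set from one matrix, then project fresh data.
def _demo_extract_and_project():
    train = matrix(randn(6, 80))         # training data (made up)
    kernel, q, features = extract_pca_configuration_from_data(train)
    fresh = matrix(randn(6, 10))         # new data with the same features
    pcs = convert_to_pc(fresh, kernel, q, features)
    print("projected new data:", pcs.shape)
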

if __name__ == "__main__":
    import sys

    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])
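
    # The original file ends after defining `test`; a plausible (assumed)
    # continuation is to run the full pipeline on that small matrix:
    coeff, q, features = extract_pca_configuration_from_data(test)
    print("[II] kernel:\n", coeff)
    print("[II] q = %d, features = %s" % (q, str(features)))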