diff experiment-reverb/code/pfa.py @ 0:246d5546657c
initial commit, needs cleanup
| author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
| --- | --- |
| date | Wed, 14 Dec 2016 13:15:48 +0000 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/experiment-reverb/code/pfa.py	Wed Dec 14 13:15:48 2016 +0000
@@ -0,0 +1,261 @@

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from numpy.linalg import eig    # not re-exported by "from numpy import *"
from numpy.random import randn  # likewise
from sklearn import cluster
from sklearn.metrics import pairwise_distances

# Mean squared error between two equally-shaped arrays
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()

# Small fixtures kept around for interactive testing
L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])

K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def testing():
    L = matrix([
        [2, 4, 5, 5, 3, 2],
        [2, 3, 4, 5, 4, 3]
    ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def kmeans(X, k):
    """Emulate MATLAB's kmeans() using sklearn.cluster.KMeans."""
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared point-to-centroid distances
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Distances from every point to every centroid
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D


def princomp(A):
    """PCA via an eigendecomposition of the correlation matrix.
    Adapted from THE GLOWING PYTHON."""

    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)

    Rxx = matrix(corrcoef(A - M))
    latent, coeff = linalg.eig(Rxx)
    p = size(coeff, axis=1)

    # Sort eigenvalues (and their eigenvectors) in descending order
    idx = argsort(latent)
    idx = idx[::-1]
    coeff = coeff[:, idx]
    latent = latent[idx]

    score = matrix(coeff).T * (A - M)  # projection of the data onto the new basis
    return matrix(coeff), matrix(score).T, matrix(latent).T


def papca(X):
    """Horn's method for parallel PCA; returns nComp, the number of
    principal components to keep."""

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N random data sets of the same shape
    eigs = matrix(zeros((len(lq), N)))
    for k in range(1, N + 1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k - 1] = lrq

    # Keep the components whose eigenvalue, adjusted by the random-data
    # mean, stays above 1
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp


def pfa(X, Variability=0.81):
    """Select the subset of the original features whose principal
    subspace explains the requested variability."""

    def choose_n(Lambda, variability=Variability):
        """Smallest number of (sorted) eigenvalues reaching the variability."""
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            v = suml / sumL
            if v >= variability:
                return i + 1
        return len(Lambda)

    X = matrix(X)
    N = shape(X)[1]

    # Standardize
    meanX = mean(X)
    stdX = std(X, ddof=1)
    X = X - meanX
    X = X / stdX

    Sigma = corrcoef(X)

    Lambda, A = eig(Sigma)

    # Sort the eigenvalues in descending order *before* accumulating
    # variability in choose_n()
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:, idx]

    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    # Cluster the rows of Aq (one row per original feature)
    p = q + 2
    p = min(Aq.shape[0], p)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0, p):
        # Find the vectors in cluster i (pylab's find() replaced
        # with a pure-numpy equivalent)
        vectors_idx = where(labels == i)[0]
        vectors = Aq[vectors_idx, :]

        # Compute the cluster mean
        vectors_mean = mean(vectors, 0)

        # Compute each member's distance from the mean
        distances_from_mean = pairwise_distances(vectors, vectors_mean)

        # Keep the feature whose vector lies closest to the cluster mean
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)


def convert_to_pc(data, kernel, q, features):
    """Expects a PC kernel, the desired number of PCs and an array of
    the kept features, and returns the data projected into the PC
    kernel space."""

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])


def extract_pca_configuration_from_data(data, Variability=0.9):
    """Expects a k x n matrix of n samples of k features and returns a
    matrix Kernel with the PCs and an array Features of the chosen
    features."""

    print "[II] Selecting salient features"
    features = pfa(data)
    data_n = data[features, :]

    print "[II] Selected %d features" % len(features)

    print "[II] Calculating principal component coefficients"
    coeff, score, latent = princomp(data_n)

    print "[II] Calculating the principal components"
    pc = coeff.T * data_n

    print "[II] Doing Horn's parallel analysis to set the number of components"
    q = papca(data_n)

    print "[II] Selected %d PCs to include" % q

    # Zero out everything beyond the first q components (rows are 0-indexed)
    pc[q:, :] = 0

    # Reconstruct the data from the truncated components to gauge the loss
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print "[II] PCA reconstruction MSE is: %.4f" % mse(test_data, data)

    # Return the kernel, the number of PCs and the array of kept features
    return coeff, q, features


if __name__ == "__main__":
    # Scratch fixture for interactive testing
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])
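For review purposes, a minimal usage sketch of the module above. This is not part of the commit: the module name "pfa", the synthetic data (six features driven by two latent signals), and the seeds are assumptions, chosen so that pfa() plausibly keeps only a subset of the features. It assumes Python 2 with numpy and scikit-learn installed.

# Hypothetical usage sketch -- module name "pfa" and the synthetic
# data are assumptions, not part of the commit.
from numpy import matrix, random

import pfa

# Six observed features driven by two latent signals plus a little noise
# (features are rows, samples are columns, matching the k x n convention
# used by extract_pca_configuration_from_data)
random.seed(0)
latent = random.randn(2, 40)
mixing = random.randn(6, 2)
data = matrix(mixing.dot(latent) + 0.05 * random.randn(6, 40))

# Salient-feature selection alone
features = pfa.pfa(data, Variability=0.81)
print "[II] Kept features:", features

# Full pipeline: feature selection, PC kernel, Horn's parallel analysis
coeff, q, features = pfa.extract_pca_configuration_from_data(data)

# Project fresh data of the same shape into the reduced PC space
newdata = matrix(mixing.dot(random.randn(2, 40)))
projected = pfa.convert_to_pc(newdata, coeff, q, features)
print "[II] Projected shape:", projected.shape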
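A second, smaller sanity check, again an assumption-laden sketch rather than the author's test: Horn's parallel analysis in papca() should report roughly one component when every feature is a noisy copy of the same signal.

# Hypothetical sanity check for papca() -- synthetic data, not from the commit
from numpy import matrix, random

import pfa

random.seed(1)
base = random.randn(1, 100)
noisy = matrix(random.randn(5, 100) * 0.1 + base)  # five noisy copies of one signal

print "[II] papca() suggests %d component(s)" % pfa.papca(noisy)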