# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from sklearn import cluster
from sklearn.metrics import pairwise_distances
from numpy.linalg import eig
from numpy.random import randn
#from empca import *


def find(condition):
    """Return the indices where condition is true.

    Stand-in for matplotlib.mlab.find, which has been removed from
    Matplotlib; flatnonzero comes from the numpy star import above."""
    return flatnonzero(condition)


mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()

L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])

K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def testing():
    L = matrix([
        [2, 4, 5, 5, 3, 2],
        [2, 3, 4, 5, 4, 3]
    ])

    K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                      [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                      [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                      [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def kmeans(X, k):
    # Simulate MATLAB's kmeans using sklearn.cluster
    # (asarray: scikit-learn does not accept numpy.matrix inputs).
    X = asarray(X)
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared distances, like MATLAB's sumd.
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Distances from every point to every centroid, like MATLAB's D.
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D


def princomp(A):
    # Adapted from THE GLOWING PYTHON.

    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)

    # Correlation matrix of the centred data (ddof has no effect on
    # correlation coefficients, so it is not passed).
    Rxx = matrix(corrcoef(A - M))
    # Rxx = matrix(cov(A - M, ddof=1))
    latent, coeff = linalg.eig(Rxx)

    # Sort eigenvectors and eigenvalues by descending eigenvalue.
    idx = argsort(latent)[::-1]
    coeff = coeff[:, idx]
    latent = latent[idx]

    # Projection of the data onto the new space.
    score = matrix(coeff).T * (A - M)
    return matrix(coeff), matrix(score).T, matrix(latent).T


def papca(X):
    """ Horn's method for parallel PCA; returns nComp, the number of
    principal components to use."""

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N random data sets of the same shape.
    eigs = matrix(zeros((len(lq), N)))
    for k in range(1, N + 1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k - 1] = lrq

    # Keep the components whose adjusted eigenvalue exceeds 1.
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1
    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp
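
# Illustrative sketch (not part of the original file): parallel analysis
# should recover the number of genuine latent factors hidden in noisy
# features. The helper name _papca_demo and the factor layout below are
# assumptions for demonstration only; call _papca_demo() to try it.
def _papca_demo():
    # 6 features, 300 samples: two independent latent factors drive
    # features 0-2 and 3-5 respectively, plus measurement noise.
    f1 = randn(1, 300)
    f2 = randn(1, 300)
    X = 0.3 * randn(6, 300)
    X[0:3, :] = X[0:3, :] + repeat(f1, 3, axis=0)
    X[3:6, :] = X[3:6, :] + repeat(f2, 3, axis=0)
    print("papca kept %d components (expected: about 2)" % papca(X))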

def pfa(X, Variability=0.81):

    def choose_n(Lambda, variability=Variability):
        """ Smallest number of leading eigenvalues whose cumulative sum
        reaches the requested fraction of the total variability."""
        sumL = sum(Lambda)
        suml = 0
        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            if suml / sumL >= variability:
                return i + 1
        return len(Lambda)

    X = matrix(X)

    # Standardise globally.
    meanX = mean(X)
    stdX = std(X, ddof=1)
    X = (X - meanX) / stdX

    Sigma = corrcoef(X)
    Lambda, A = eig(Sigma)

    # Sort eigenvalues (and their eigenvectors) in descending order
    # *before* choosing how many to keep; choose_n assumes the leading
    # eigenvalues come first.
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]
    A = matrix(A)
    A = A[:, idx]

    q = choose_n(Lambda, Variability)
    Aq = A[:, 0:q]

    # Cluster the q-dimensional feature loadings into p groups, capped at
    # the number of features.
    p = min(Aq.shape[0], q + 2)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []
    for i in range(0, p):
        # Find the features that fell into cluster i.
        vectors_idx = find(labels == i)
        vectors = Aq[vectors_idx, :]

        # Distance of each feature's loading vector from the cluster mean
        # (asarray: scikit-learn does not accept numpy.matrix inputs).
        vectors_mean = mean(vectors, 0)
        distances_from_mean = pairwise_distances(asarray(vectors),
                                                 asarray(vectors_mean))

        # Keep the feature whose loadings are closest to the cluster mean.
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)


def convert_to_pc(data, kernel, q, features):
    """ Expects a PC kernel, the desired number of PCs, and an array of
    the kept features; returns the data projected into the PC kernel
    space."""

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])


def extract_pca_configuration_from_data(data, Variability=0.9):
    """ Expects a k x n matrix of n samples of k features; returns a
    matrix Kernel with the PCs, the number of PCs to use, and an array
    Features of the chosen features."""

    print("[II] Selecting salient features")
    features = pfa(data)
    data_n = data[features, :]

    print("[II] Selected %d features" % len(features))

    print("[II] Calculating principal components coefficients")
    coeff, score, latent = princomp(data_n)

    print("[II] Calculating the principal components")
    pc = coeff.T * data_n

    print("[II] Doing Horn's Parallel Analysis and setting number of vectors")
    q = papca(data_n)

    print("[II] Selected %d PCs to include" % q)

    # Keep only the first q principal components and zero out the rest.
    if q < pc.shape[0]:
        pc[q:, :] = 0

    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print("[II] PCA Reconstruction MSE is: %.4f" % mse(test_data[features, :], data_n))

    # Return the kernel, the number of PCs, and the array of kept features.
    return coeff, q, features


if __name__ == "__main__":
    import sys

    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])
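    # Minimal usage sketch (an addition; the original file stops after
    # defining `test`): run the full pipeline on the toy matrix, then
    # project the data onto the retained principal components.
    kernel, q, features = extract_pca_configuration_from_data(test)
    projected = convert_to_pc(test, kernel, q, features)

    print("[II] Kept features: %s" % str(features))
    print("[II] Projected data (%d PCs):" % q)
    print(projected)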