view experiment-reverb/code/pfa.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 30 Sep 2017 13:25:50 +0100
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from numpy.linalg import eig
from numpy.random import randn
from sklearn import cluster
from sklearn.metrics import pairwise_distances

# Mean squared error between two equally-shaped arrays
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()
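# For example, mse([1, 2], [1, 4]) == (0**2 + 2**2) / 2 == 2.0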

# Sample fixtures for interactive testing
L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])

K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def kmeans(X, k):
    """Simulate MATLAB's kmeans using sklearn.cluster.KMeans."""
    X = asarray(X)  # scikit-learn does not accept numpy matrices
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared point-to-centroid distances
    # (MATLAB's sumd)
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Point-to-centroid distance matrix (MATLAB's D)
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D
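
# Illustrative usage (hypothetical data): two well-separated 2-D blobs.
#   X = array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0], [5.1, 4.9]])
#   idx, C, sumd, D = kmeans(X, 2)
#   idx -> one label per row, C -> 2x2 centroids, sumd -> 2x1 within-cluster
#   squared-distance sums, D -> 4x2 point-to-centroid distances.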
    
       
def princomp(A):
    """Principal component analysis of the rows of A.
    Adapted from THE GLOWING PYTHON."""

    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)  # per-row (per-feature) means

    Rxx = matrix(corrcoef(A - M))
    latent, coeff = eig(Rxx)  # eigenvalues, eigenvectors

    # Sort eigenvalues (and their eigenvectors) in descending order
    idx = argsort(latent)[::-1]
    coeff = coeff[:, idx]
    latent = latent[idx]

    # Project the data onto the new basis
    score = matrix(coeff).T * (A - M)
    return matrix(coeff), matrix(score).T, matrix(latent).T
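
# Illustrative usage (hypothetical data): three features, five samples.
#   A = array([[1., 2, 3, 4, 5], [2, 4, 6, 8, 10], [5, 3, 4, 1, 2]])
#   coeff, score, latent = princomp(A)
#   coeff -> 3x3 loading matrix, score -> 5x3 projected samples,
#   latent -> 3x1 eigenvalues sorted in descending order.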
 
    

def papca(X):
    """Horn's parallel analysis for PCA. Returns nComp: the number of
    principal components to retain."""

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N random data sets of the same shape as X
    eigs = matrix(zeros((len(lq), N)))
    for k in range(1, N + 1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k - 1] = lrq

    # Adjust the observed eigenvalues by the random baseline and keep the
    # components whose adjusted eigenvalue still exceeds 1
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp
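
# Illustrative usage (hypothetical data): a noisy rank-1 structure.
#   base = randn(1, 100)
#   X = vstack([base + 0.1 * randn(1, 100) for _ in range(4)])
#   papca(X)  # typically 1: only one component beats the random baseline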

        
        
    
            
def pfa(X, Variability=0.81):
    """Select salient features from a k x n data matrix X: cluster the rows
    of the leading eigenvector matrix and keep, for each cluster, the
    feature whose row is closest to the cluster mean. Returns the sorted
    indices of the kept features."""

    def choose_n(Lambda, variability=Variability):
        """Smallest number of leading eigenvalues that explain at least
        `variability` of the total variance."""
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            if suml / sumL >= variability:
                return i + 1

    X = matrix(X)

    # Standardize (global mean and standard deviation)
    X = (X - mean(X)) / std(X, ddof=1)

    # Correlation matrix of the features (rows)
    Sigma = corrcoef(X)

    Lambda, A = eig(Sigma)

    # Sort eigenvalues (and their eigenvectors) in descending order.
    # eig() returns them in no particular order, so this must happen
    # before choosing how many components to keep.
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:, idx]

    # Number of leading components covering the requested variability
    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    # Cluster the rows of Aq into p >= q clusters
    p = min(Aq.shape[0], q + 2)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0, p):
        # Find the feature rows assigned to this cluster
        vectors_idx = flatnonzero(labels == i)
        if len(vectors_idx) == 0:
            # k-means can leave a cluster empty when points coincide
            continue
        vectors = asarray(Aq[vectors_idx, :])

        # Keep the feature whose row is closest to the cluster mean
        vectors_mean = mean(vectors, 0).reshape(1, -1)
        distances_from_mean = pairwise_distances(vectors, vectors_mean)
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)
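
# Illustrative usage (hypothetical data): six features where the first three
# rows are near-duplicates, so pfa should keep only a few representatives.
#   base = randn(1, 50)
#   data = vstack([base + 0.05 * randn(1, 50) for _ in range(3)]
#                 + [randn(1, 50) for _ in range(3)])
#   pfa(data)  # e.g. array([0, 3, 4, 5]); exact picks vary from run to run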


def convert_to_pc(data, kernel, q, features):
    """Expects a PC kernel, the desired number of PCs, and an array of the
    kept features; returns the data projected into the PC kernel space."""

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])
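
# Illustrative usage (hypothetical `train_data` and `new_data` matrices):
#   coeff, q, features = extract_pca_configuration_from_data(train_data)
#   pcs = convert_to_pc(new_data, coeff, q, features)  # q x n_samples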

def extract_pca_configuration_from_data(data, Variability=0.9):
    """Expects a k x n matrix of n samples of k features and returns the
    kernel matrix of PCs, the number of PCs to keep, and an array of the
    chosen features."""

    print("[II] Selecting salient features")
    features = pfa(data, Variability)
    data_n = data[features, :]

    print("[II] Selected %d features" % len(features))

    print("[II] Calculating principal component coefficients")
    coeff, score, latent = princomp(data_n)

    print("[II] Calculating the principal components")
    pc = coeff.T * data_n

    print("[II] Doing Horn's parallel analysis to set the number of PCs")
    q = papca(data_n)

    print("[II] Selected %d PCs to include" % q)

    # Zero out everything past the first q components (0-indexed, so the
    # rows to silence start at q, not q + 1)
    pc[q:, :] = 0

    # Reconstruct the data from the truncated components to gauge the loss
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print("[II] PCA reconstruction MSE is: %.4f" % mse(test_data, data))

    # Return the kernel, the number of PCs and the array of kept features
    return coeff, q, features


if __name__ == "__main__":
    # Exercise the pipeline on a small test matrix (4 features, 4 samples)
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])

    coeff, q, features = extract_pca_configuration_from_data(test)
    print("[II] Kept features: %s, number of PCs: %d" % (features, q))