view experiment-reverb/code/pfa.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 30 Sep 2017 13:25:50 +0100
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 20 14:51:02 2015

@author: Emmanouil Chourdakis

Principal Factor Analysis

"""

from numpy import *
from numpy.linalg import eig
from numpy.random import randn
from sklearn import cluster
from sklearn.metrics import pairwise_distances

# Mean squared error between two equally-shaped arrays
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()
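# For example, mse([1, 2], [1, 4]) == (0**2 + 2**2) / 2 == 2.0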

# Sample fixtures for interactive testing
L = matrix([
    [2, 4, 5, 5, 3, 2],
    [2, 3, 4, 5, 4, 3]
])

K = matrix(array([[ 0.94241267, -1.63156463, -0.04654153,  0.1000316 ],
                  [-0.16114828, -0.70320328,  0.25243464, -0.10076863],
                  [ 1.10255852,  0.41740569,  1.14947174, -0.95764054],
                  [ 0.31158905, -0.37584351,  1.65328982,  0.1987012 ]]))


def kmeans(X, k):
    """Simulate MATLAB's kmeans using sklearn.cluster.KMeans."""
    X = asarray(X)  # scikit-learn does not accept numpy matrices
    k_means = cluster.KMeans(n_clusters=k)
    k_means.fit(X)

    C = k_means.cluster_centers_
    idx = k_means.labels_

    # Within-cluster sums of squared point-to-centroid distances
    # (MATLAB's sumd)
    sumd = zeros((k, 1))
    for i in range(0, k):
        S = X[idx == i, :] - C[i, :]
        sumd[i] = sum(array(S) ** 2)

    # Point-to-centroid distance matrix (MATLAB's D)
    D = matrix(pairwise_distances(X, C))

    return idx, C, sumd, D
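
# Illustrative usage (hypothetical data): two well-separated 2-D blobs.
#   X = array([[0.0, 0.0], [0.1, 0.1], [5.0, 5.0], [5.1, 4.9]])
#   idx, C, sumd, D = kmeans(X, 2)
#   idx -> one label per row, C -> 2x2 centroids, sumd -> 2x1 within-cluster
#   squared-distance sums, D -> 4x2 point-to-centroid distances.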
    
       
def princomp(A):
    """Principal component analysis of the rows of A.
    Adapted from THE GLOWING PYTHON."""

    if A.shape[1] == 1:
        return 1, 0, 0

    A = matrix(A)
    M = mean(A, axis=1)  # per-row (per-feature) means

    Rxx = matrix(corrcoef(A - M))
    latent, coeff = eig(Rxx)  # eigenvalues, eigenvectors

    # Sort eigenvalues (and their eigenvectors) in descending order
    idx = argsort(latent)[::-1]
    coeff = coeff[:, idx]
    latent = latent[idx]

    # Project the data onto the new basis
    score = matrix(coeff).T * (A - M)
    return matrix(coeff), matrix(score).T, matrix(latent).T
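
# Illustrative usage (hypothetical data): three features, five samples.
#   A = array([[1., 2, 3, 4, 5], [2, 4, 6, 8, 10], [5, 3, 4, 1, 2]])
#   coeff, score, latent = princomp(A)
#   coeff -> 3x3 loading matrix, score -> 5x3 projected samples,
#   latent -> 3x1 eigenvalues sorted in descending order.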
 
    

def papca(X):
    """Horn's parallel analysis for PCA. Returns nComp: the number of
    principal components to retain."""

    N = 1000
    X = matrix(X)
    coeff, score, lq = princomp(X)

    # Eigenvalue spectra of N random data sets of the same shape as X
    eigs = matrix(zeros((len(lq), N)))
    for k in range(1, N + 1):
        coeff, score, lrq = princomp(randn(X.shape[0], X.shape[1]))
        eigs[:, k - 1] = lrq

    # Adjust the observed eigenvalues by the random baseline and keep the
    # components whose adjusted eigenvalue still exceeds 1
    lrqbar = mean(eigs, 1)
    ladjq = lq - lrqbar + 1

    nComp = sum(ladjq > 1)

    if nComp == 0:
        return 1
    else:
        return nComp
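
# Illustrative usage (hypothetical data): a noisy rank-1 structure.
#   base = randn(1, 100)
#   X = vstack([base + 0.1 * randn(1, 100) for _ in range(4)])
#   papca(X)  # typically 1: only one component beats the random baseline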

        
        
    
            
def pfa(X, Variability=0.81):
    """Select salient features from a k x n data matrix X: cluster the rows
    of the leading eigenvector matrix and keep, for each cluster, the
    feature whose row is closest to the cluster mean. Returns the sorted
    indices of the kept features."""

    def choose_n(Lambda, variability=Variability):
        """Smallest number of leading eigenvalues that explain at least
        `variability` of the total variance."""
        sumL = sum(Lambda)
        suml = 0

        for i in range(0, len(Lambda)):
            suml += Lambda[i]
            if suml / sumL >= variability:
                return i + 1

    X = matrix(X)

    # Standardize (global mean and standard deviation)
    X = (X - mean(X)) / std(X, ddof=1)

    # Correlation matrix of the features (rows)
    Sigma = corrcoef(X)

    Lambda, A = eig(Sigma)

    # Sort eigenvalues (and their eigenvectors) in descending order.
    # eig() returns them in no particular order, so this must happen
    # before choosing how many components to keep.
    idx = argsort(Lambda)[::-1]
    Lambda = Lambda[idx]

    A = matrix(A)
    A = A[:, idx]

    # Number of leading components covering the requested variability
    q = choose_n(Lambda, Variability)

    Aq = A[:, 0:q]

    # Cluster the rows of Aq into p >= q clusters
    p = min(Aq.shape[0], q + 2)

    labels, centers, sumd, D = kmeans(Aq, p)

    tokeep = []

    for i in range(0, p):
        # Find the feature rows assigned to this cluster
        vectors_idx = flatnonzero(labels == i)
        if len(vectors_idx) == 0:
            # k-means can leave a cluster empty when points coincide
            continue
        vectors = asarray(Aq[vectors_idx, :])

        # Keep the feature whose row is closest to the cluster mean
        vectors_mean = mean(vectors, 0).reshape(1, -1)
        distances_from_mean = pairwise_distances(vectors, vectors_mean)
        sorted_idx = argsort(distances_from_mean.T)
        closest_idx = vectors_idx[sorted_idx]
        tokeep.append(int(closest_idx[0, 0]))

    return sort(tokeep)
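
# Illustrative usage (hypothetical data): six features where the first three
# rows are near-duplicates, so pfa should keep only a few representatives.
#   base = randn(1, 50)
#   data = vstack([base + 0.05 * randn(1, 50) for _ in range(3)]
#                 + [randn(1, 50) for _ in range(3)])
#   pfa(data)  # e.g. array([0, 3, 4, 5]); exact picks vary from run to run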


def convert_to_pc(data, kernel, q, features):
    """Expects a PC kernel, the desired number of PCs, and an array of the
    kept features; returns the data projected into the PC kernel space."""

    data_n = data[features, :]
    pc = kernel.T * data_n

    return matrix(pc[0:q, :])
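
# Illustrative usage (hypothetical `train_data` and `new_data` matrices):
#   coeff, q, features = extract_pca_configuration_from_data(train_data)
#   pcs = convert_to_pc(new_data, coeff, q, features)  # q x n_samples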

def extract_pca_configuration_from_data(data, Variability=0.9):
    """Expects a k x n matrix of n samples of k features and returns the
    kernel matrix of PCs, the number of PCs to keep, and an array of the
    chosen features."""

    print("[II] Selecting salient features")
    features = pfa(data, Variability)
    data_n = data[features, :]

    print("[II] Selected %d features" % len(features))

    print("[II] Calculating principal component coefficients")
    coeff, score, latent = princomp(data_n)

    print("[II] Calculating the principal components")
    pc = coeff.T * data_n

    print("[II] Doing Horn's parallel analysis to set the number of PCs")
    q = papca(data_n)

    print("[II] Selected %d PCs to include" % q)

    # Zero out everything past the first q components (0-indexed, so the
    # rows to silence start at q, not q + 1)
    pc[q:, :] = 0

    # Reconstruct the data from the truncated components to gauge the loss
    test_data = zeros((data.shape[0], data.shape[1]))
    test_data[features, :] = coeff * pc

    print("[II] PCA reconstruction MSE is: %.4f" % mse(test_data, data))

    # Return the kernel, the number of PCs and the array of kept features
    return coeff, q, features


if __name__ == "__main__":
    # Exercise the pipeline on a small test matrix (4 features, 4 samples)
    test = array([
        [1., 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [8, 9, 7, 6]
    ])

    coeff, q, features = extract_pca_configuration_from_data(test)
    print("[II] Kept features: %s, number of PCs: %d" % (features, q))