view maths/pca/pca.c @ 483:fdaa63607c15

Untabify, indent, tidy
author Chris Cannam <cannam@all-day-breakfast.com>
date Fri, 31 May 2019 11:54:32 +0100
parents 7e8d1f26b098
children
line wrap: on
line source
/*********************************/
/* Principal Components Analysis */
/*********************************/

/*********************************************************************/
/* Principal Components Analysis or the Karhunen-Loeve expansion is a
   classical method for dimensionality reduction or exploratory data
   analysis.  One reference among many is: F. Murtagh and A. Heck,
   Multivariate Data Analysis, Kluwer Academic, Dordrecht, 1987.

   Author:
   F. Murtagh
   Phone:        + 49 89 32006298 (work)
                 + 49 89 965307 (home)
   Earn/Bitnet:  fionn@dgaeso51,  fim@dgaipp1s,  murtagh@stsci
   Span:         esomc1::fionn
   Internet:     murtagh@scivax.stsci.edu
   
   F. Murtagh, Munich, 6 June 1989                                   */   
/*********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include "pca.h"

#define SIGN(a, b) ( (b) < 0 ? -fabs(a) : fabs(a) )

/**  Variance-covariance matrix: creation  *****************************/

/* Create m * m covariance matrix from given n * m data matrix. */
void covcol(double** data, int n, int m, double** symmat)
{
    double *mean;
    int i, j, j1, j2;

/* Allocate storage for mean vector */

    mean = (double*) malloc(m*sizeof(double));

/* Determine mean of column vectors of input data matrix */

    for (j = 0; j < m; j++)
    {
        mean[j] = 0.0;
        for (i = 0; i < n; i++)
        {
            mean[j] += data[i][j];
        }
        mean[j] /= (double)n;
    }

/*
  printf("\nMeans of column vectors:\n");
  for (j = 0; j < m; j++)  {
  printf("%12.1f",mean[j]);  }   printf("\n");
*/

/* Center the column vectors. */

    for (i = 0; i < n; i++)
    {
        for (j = 0; j < m; j++)
        {
            data[i][j] -= mean[j];
        }
    }

/* Calculate the m * m covariance matrix. */
    for (j1 = 0; j1 < m; j1++)
    {
        for (j2 = j1; j2 < m; j2++)
        {
            symmat[j1][j2] = 0.0;
            for (i = 0; i < n; i++)
            {
                symmat[j1][j2] += data[i][j1] * data[i][j2];
            }
            symmat[j2][j1] = symmat[j1][j2];
        }
    }

    free(mean);

    return;

}

/**  Error handler  **************************************************/

void erhand(char* err_msg)
{
    fprintf(stderr,"Run-time error:\n");
    fprintf(stderr,"%s\n", err_msg);
    fprintf(stderr,"Exiting to system.\n");
    exit(1);
}


/**  Reduce a real, symmetric matrix to a symmetric, tridiag. matrix. */

/* Householder reduction of matrix a to tridiagonal form.
   Algorithm: Martin et al., Num. Math. 11, 181-195, 1968.
   Ref: Smith et al., Matrix Eigensystem Routines -- EISPACK Guide
   Springer-Verlag, 1976, pp. 489-494.
   W H Press et al., Numerical Recipes in C, Cambridge U P,
   1988, pp. 373-374.  */
void tred2(double** a, int n, double* d, double* e)
{
    int l, k, j, i;
    double scale, hh, h, g, f;
        
    for (i = n-1; i >= 1; i--)
    {
        l = i - 1;
        h = scale = 0.0;
        if (l > 0)
        {
            for (k = 0; k <= l; k++)
                scale += fabs(a[i][k]);
            if (scale == 0.0)
                e[i] = a[i][l];
            else
            {
                for (k = 0; k <= l; k++)
                {
                    a[i][k] /= scale;
                    h += a[i][k] * a[i][k];
                }
                f = a[i][l];
                g = f>0 ? -sqrt(h) : sqrt(h);
                e[i] = scale * g;
                h -= f * g;
                a[i][l] = f - g;
                f = 0.0;
                for (j = 0; j <= l; j++)
                {
                    a[j][i] = a[i][j]/h;
                    g = 0.0;
                    for (k = 0; k <= j; k++)
                        g += a[j][k] * a[i][k];
                    for (k = j+1; k <= l; k++)
                        g += a[k][j] * a[i][k];
                    e[j] = g / h;
                    f += e[j] * a[i][j];
                }
                hh = f / (h + h);
                for (j = 0; j <= l; j++)
                {
                    f = a[i][j];
                    e[j] = g = e[j] - hh * f;
                    for (k = 0; k <= j; k++)
                        a[j][k] -= (f * e[k] + g * a[i][k]);
                }
            }
        }
        else
            e[i] = a[i][l];
        d[i] = h;
    }
    d[0] = 0.0;
    e[0] = 0.0;
    for (i = 0; i < n; i++)
    {
        l = i - 1;
        if (d[i])
        {
            for (j = 0; j <= l; j++)
            {
                g = 0.0;
                for (k = 0; k <= l; k++)
                    g += a[i][k] * a[k][j];
                for (k = 0; k <= l; k++)
                    a[k][j] -= g * a[k][i];
            }
        }
        d[i] = a[i][i];
        a[i][i] = 1.0;
        for (j = 0; j <= l; j++)
            a[j][i] = a[i][j] = 0.0;
    }
}

/**  Tridiagonal QL algorithm -- Implicit  **********************/

void tqli(double* d, double* e, int n, double** z)
{
    int m, l, iter, i, k;
    double s, r, p, g, f, dd, c, b;
        
    for (i = 1; i < n; i++)
        e[i-1] = e[i];
    e[n-1] = 0.0;
    for (l = 0; l < n; l++)
    {
        iter = 0;
        do
        {
            for (m = l; m < n-1; m++)
            {
                dd = fabs(d[m]) + fabs(d[m+1]);
                if (fabs(e[m]) + dd == dd) break;
            }
            if (m != l)
            {
                if (iter++ == 30) erhand("No convergence in TLQI.");
                g = (d[l+1] - d[l]) / (2.0 * e[l]);
                r = sqrt((g * g) + 1.0);
                g = d[m] - d[l] + e[l] / (g + SIGN(r, g));
                s = c = 1.0;
                p = 0.0;
                for (i = m-1; i >= l; i--)
                {
                    f = s * e[i];
                    b = c * e[i];
                    if (fabs(f) >= fabs(g))
                    {
                        c = g / f;
                        r = sqrt((c * c) + 1.0);
                        e[i+1] = f * r;
                        c *= (s = 1.0/r);
                    }
                    else
                    {
                        s = f / g;
                        r = sqrt((s * s) + 1.0);
                        e[i+1] = g * r;
                        s *= (c = 1.0/r);
                    }
                    g = d[i+1] - p;
                    r = (d[i] - g) * s + 2.0 * c * b;
                    p = s * r;
                    d[i+1] = g + p;
                    g = c * r - b;
                    for (k = 0; k < n; k++)
                    {
                        f = z[k][i+1];
                        z[k][i+1] = s * z[k][i] + c * f;
                        z[k][i] = c * z[k][i] - s * f;
                    }
                }
                d[l] = d[l] - p;
                e[l] = g;
                e[m] = 0.0;
            }
        }  while (m != l);
    }
}

/* In place projection onto basis vectors */
void pca_project(double** data, int n, int m, int ncomponents)
{
    int  i, j, k, k2;
    double  **symmat, /* **symmat2, */ *evals, *interm;
        
    //TODO: assert ncomponents < m
        
    symmat = (double**) malloc(m*sizeof(double*));
    for (i = 0; i < m; i++)
        symmat[i] = (double*) malloc(m*sizeof(double));
                
    covcol(data, n, m, symmat);
        
    /*********************************************************************
                Eigen-reduction
    **********************************************************************/
        
    /* Allocate storage for dummy and new vectors. */
    evals = (double*) malloc(m*sizeof(double));     /* Storage alloc. for vector of eigenvalues */
    interm = (double*) malloc(m*sizeof(double));    /* Storage alloc. for 'intermediate' vector */
    //MALLOC_ARRAY(symmat2,m,m,double);    
    //for (i = 0; i < m; i++) {
    //      for (j = 0; j < m; j++) {
    //              symmat2[i][j] = symmat[i][j]; /* Needed below for col. projections */
    //      }
    //}
    tred2(symmat, m, evals, interm);  /* Triangular decomposition */
    tqli(evals, interm, m, symmat);   /* Reduction of sym. trid. matrix */
/* evals now contains the eigenvalues,
   columns of symmat now contain the associated eigenvectors. */   

/*
  printf("\nEigenvalues:\n");
  for (j = m-1; j >= 0; j--) {
  printf("%18.5f\n", evals[j]); }
  printf("\n(Eigenvalues should be strictly positive; limited\n");
  printf("precision machine arithmetic may affect this.\n");
  printf("Eigenvalues are often expressed as cumulative\n");
  printf("percentages, representing the 'percentage variance\n");
  printf("explained' by the associated axis or principal component.)\n");
        
  printf("\nEigenvectors:\n");
  printf("(First three; their definition in terms of original vbes.)\n");
  for (j = 0; j < m; j++) {
  for (i = 1; i <= 3; i++)  {
  printf("%12.4f", symmat[j][m-i]);  }
  printf("\n");  }
*/

/* Form projections of row-points on prin. components. */
/* Store in 'data', overwriting original data. */
    for (i = 0; i < n; i++) {
        for (j = 0; j < m; j++) {
            interm[j] = data[i][j]; }   /* data[i][j] will be overwritten */
        for (k = 0; k < ncomponents; k++) {
            data[i][k] = 0.0;
            for (k2 = 0; k2 < m; k2++) {
                data[i][k] += interm[k2] * symmat[k2][m-k-1]; }
        }
    }

/*      
        printf("\nProjections of row-points on first 3 prin. comps.:\n");
        for (i = 0; i < n; i++) {
        for (j = 0; j < 3; j++)  {
        printf("%12.4f", data[i][j]);  }
        printf("\n");  }
*/

/* Form projections of col.-points on first three prin. components. */
/* Store in 'symmat2', overwriting what was stored in this. */
//for (j = 0; j < m; j++) {
//       for (k = 0; k < m; k++) {
//               interm[k] = symmat2[j][k]; }  /*symmat2[j][k] will be overwritten*/
//  for (i = 0; i < 3; i++) {
//      symmat2[j][i] = 0.0;
//              for (k2 = 0; k2 < m; k2++) {
//                      symmat2[j][i] += interm[k2] * symmat[k2][m-i-1]; }
//              if (evals[m-i-1] > 0.0005)   /* Guard against zero eigenvalue */
//                      symmat2[j][i] /= sqrt(evals[m-i-1]);   /* Rescale */
//              else
//                      symmat2[j][i] = 0.0;    /* Standard kludge */
//    }
// }

/*
  printf("\nProjections of column-points on first 3 prin. comps.:\n");
  for (j = 0; j < m; j++) {
  for (k = 0; k < 3; k++)  {
  printf("%12.4f", symmat2[j][k]);  }
  printf("\n");  }
*/


    for (i = 0; i < m; i++)
        free(symmat[i]);
    free(symmat);
//FREE_ARRAY(symmat2,m);
    free(evals);
    free(interm);

}