cannam@16: /*********************************/ cannam@16: /* Principal Components Analysis */ cannam@16: /*********************************/ cannam@16: cannam@16: /*********************************************************************/ cannam@16: /* Principal Components Analysis or the Karhunen-Loeve expansion is a cannam@16: classical method for dimensionality reduction or exploratory data cannam@16: analysis. One reference among many is: F. Murtagh and A. Heck, cannam@16: Multivariate Data Analysis, Kluwer Academic, Dordrecht, 1987. cannam@16: cannam@16: Author: cannam@16: F. Murtagh cannam@16: Phone: + 49 89 32006298 (work) cannam@16: + 49 89 965307 (home) cannam@16: Earn/Bitnet: fionn@dgaeso51, fim@dgaipp1s, murtagh@stsci cannam@16: Span: esomc1::fionn cannam@16: Internet: murtagh@scivax.stsci.edu cannam@16: cannam@16: F. Murtagh, Munich, 6 June 1989 */ cannam@16: /*********************************************************************/ cannam@16: cannam@16: #include cannam@16: #include cannam@16: #include cannam@16: cannam@17: #include "PCA.h" cannam@16: cannam@16: #define SIGN(a, b) ( (b) < 0 ? -fabs(a) : fabs(a) ) cannam@16: cannam@16: /** Variance-covariance matrix: creation *****************************/ cannam@16: cannam@16: /* Create m * m covariance matrix from given n * m data matrix. */ cannam@16: void covcol(double** data, int n, int m, double** symmat) cannam@16: { cannam@16: double *mean; cannam@16: int i, j, j1, j2; cannam@16: cannam@16: /* Allocate storage for mean vector */ cannam@16: cannam@16: mean = (double*) malloc(m*sizeof(double)); cannam@16: cannam@16: /* Determine mean of column vectors of input data matrix */ cannam@16: cannam@16: for (j = 0; j < m; j++) cannam@16: { cannam@16: mean[j] = 0.0; cannam@16: for (i = 0; i < n; i++) cannam@16: { cannam@16: mean[j] += data[i][j]; cannam@16: } cannam@16: mean[j] /= (double)n; cannam@16: } cannam@16: cannam@16: /* cannam@16: printf("\nMeans of column vectors:\n"); cannam@16: for (j = 0; j < m; j++) { cannam@16: printf("%12.1f",mean[j]); } printf("\n"); cannam@16: */ cannam@16: cannam@16: /* Center the column vectors. */ cannam@16: cannam@16: for (i = 0; i < n; i++) cannam@16: { cannam@16: for (j = 0; j < m; j++) cannam@16: { cannam@16: data[i][j] -= mean[j]; cannam@16: } cannam@16: } cannam@16: cannam@16: /* Calculate the m * m covariance matrix. */ cannam@16: for (j1 = 0; j1 < m; j1++) cannam@16: { cannam@16: for (j2 = j1; j2 < m; j2++) cannam@16: { cannam@16: symmat[j1][j2] = 0.0; cannam@16: for (i = 0; i < n; i++) cannam@16: { cannam@16: symmat[j1][j2] += data[i][j1] * data[i][j2]; cannam@16: } cannam@16: symmat[j2][j1] = symmat[j1][j2]; cannam@16: } cannam@16: } cannam@16: cannam@16: free(mean); cannam@16: cannam@16: return; cannam@16: cannam@16: } cannam@16: cannam@16: /** Error handler **************************************************/ cannam@16: cannam@16: void erhand(char* err_msg) cannam@16: { cannam@16: fprintf(stderr,"Run-time error:\n"); cannam@16: fprintf(stderr,"%s\n", err_msg); cannam@16: fprintf(stderr,"Exiting to system.\n"); cannam@16: exit(1); cannam@16: } cannam@16: cannam@16: cannam@16: /** Reduce a real, symmetric matrix to a symmetric, tridiag. matrix. */ cannam@16: cannam@16: /* Householder reduction of matrix a to tridiagonal form. cannam@16: Algorithm: Martin et al., Num. Math. 11, 181-195, 1968. cannam@16: Ref: Smith et al., Matrix Eigensystem Routines -- EISPACK Guide cannam@16: Springer-Verlag, 1976, pp. 489-494. cannam@16: W H Press et al., Numerical Recipes in C, Cambridge U P, cannam@16: 1988, pp. 373-374. */ cannam@16: void tred2(double** a, int n, double* d, double* e) cannam@16: { cannam@16: int l, k, j, i; cannam@16: double scale, hh, h, g, f; cannam@16: cannam@16: for (i = n-1; i >= 1; i--) cannam@16: { cannam@16: l = i - 1; cannam@16: h = scale = 0.0; cannam@16: if (l > 0) cannam@16: { cannam@16: for (k = 0; k <= l; k++) cannam@16: scale += fabs(a[i][k]); cannam@16: if (scale == 0.0) cannam@16: e[i] = a[i][l]; cannam@16: else cannam@16: { cannam@16: for (k = 0; k <= l; k++) cannam@16: { cannam@16: a[i][k] /= scale; cannam@16: h += a[i][k] * a[i][k]; cannam@16: } cannam@16: f = a[i][l]; cannam@16: g = f>0 ? -sqrt(h) : sqrt(h); cannam@16: e[i] = scale * g; cannam@16: h -= f * g; cannam@16: a[i][l] = f - g; cannam@16: f = 0.0; cannam@16: for (j = 0; j <= l; j++) cannam@16: { cannam@16: a[j][i] = a[i][j]/h; cannam@16: g = 0.0; cannam@16: for (k = 0; k <= j; k++) cannam@16: g += a[j][k] * a[i][k]; cannam@16: for (k = j+1; k <= l; k++) cannam@16: g += a[k][j] * a[i][k]; cannam@16: e[j] = g / h; cannam@16: f += e[j] * a[i][j]; cannam@16: } cannam@16: hh = f / (h + h); cannam@16: for (j = 0; j <= l; j++) cannam@16: { cannam@16: f = a[i][j]; cannam@16: e[j] = g = e[j] - hh * f; cannam@16: for (k = 0; k <= j; k++) cannam@16: a[j][k] -= (f * e[k] + g * a[i][k]); cannam@16: } cannam@16: } cannam@16: } cannam@16: else cannam@16: e[i] = a[i][l]; cannam@16: d[i] = h; cannam@16: } cannam@16: d[0] = 0.0; cannam@16: e[0] = 0.0; cannam@16: for (i = 0; i < n; i++) cannam@16: { cannam@16: l = i - 1; cannam@16: if (d[i]) cannam@16: { cannam@16: for (j = 0; j <= l; j++) cannam@16: { cannam@16: g = 0.0; cannam@16: for (k = 0; k <= l; k++) cannam@16: g += a[i][k] * a[k][j]; cannam@16: for (k = 0; k <= l; k++) cannam@16: a[k][j] -= g * a[k][i]; cannam@16: } cannam@16: } cannam@16: d[i] = a[i][i]; cannam@16: a[i][i] = 1.0; cannam@16: for (j = 0; j <= l; j++) cannam@16: a[j][i] = a[i][j] = 0.0; cannam@16: } cannam@16: } cannam@16: cannam@16: /** Tridiagonal QL algorithm -- Implicit **********************/ cannam@16: cannam@16: void tqli(double* d, double* e, int n, double** z) cannam@16: { cannam@16: int m, l, iter, i, k; cannam@16: double s, r, p, g, f, dd, c, b; cannam@16: cannam@16: for (i = 1; i < n; i++) cannam@16: e[i-1] = e[i]; cannam@16: e[n-1] = 0.0; cannam@16: for (l = 0; l < n; l++) cannam@16: { cannam@16: iter = 0; cannam@16: do cannam@16: { cannam@16: for (m = l; m < n-1; m++) cannam@16: { cannam@16: dd = fabs(d[m]) + fabs(d[m+1]); cannam@16: if (fabs(e[m]) + dd == dd) break; cannam@16: } cannam@16: if (m != l) cannam@16: { cannam@16: if (iter++ == 30) erhand("No convergence in TLQI."); cannam@16: g = (d[l+1] - d[l]) / (2.0 * e[l]); cannam@16: r = sqrt((g * g) + 1.0); cannam@16: g = d[m] - d[l] + e[l] / (g + SIGN(r, g)); cannam@16: s = c = 1.0; cannam@16: p = 0.0; cannam@16: for (i = m-1; i >= l; i--) cannam@16: { cannam@16: f = s * e[i]; cannam@16: b = c * e[i]; cannam@16: if (fabs(f) >= fabs(g)) cannam@16: { cannam@16: c = g / f; cannam@16: r = sqrt((c * c) + 1.0); cannam@16: e[i+1] = f * r; cannam@16: c *= (s = 1.0/r); cannam@16: } cannam@16: else cannam@16: { cannam@16: s = f / g; cannam@16: r = sqrt((s * s) + 1.0); cannam@16: e[i+1] = g * r; cannam@16: s *= (c = 1.0/r); cannam@16: } cannam@16: g = d[i+1] - p; cannam@16: r = (d[i] - g) * s + 2.0 * c * b; cannam@16: p = s * r; cannam@16: d[i+1] = g + p; cannam@16: g = c * r - b; cannam@16: for (k = 0; k < n; k++) cannam@16: { cannam@16: f = z[k][i+1]; cannam@16: z[k][i+1] = s * z[k][i] + c * f; cannam@16: z[k][i] = c * z[k][i] - s * f; cannam@16: } cannam@16: } cannam@16: d[l] = d[l] - p; cannam@16: e[l] = g; cannam@16: e[m] = 0.0; cannam@16: } cannam@16: } while (m != l); cannam@16: } cannam@16: } cannam@16: cannam@16: /* In place projection onto basis vectors */ cannam@16: void pca_project(double** data, int n, int m, int ncomponents) cannam@16: { cannam@16: int i, j, k, k2; cannam@16: double **symmat, **symmat2, *evals, *interm; cannam@16: cannam@16: //TODO: assert ncomponents < m cannam@16: cannam@16: symmat = (double**) malloc(m*sizeof(double*)); cannam@16: for (i = 0; i < m; i++) cannam@16: symmat[i] = (double*) malloc(m*sizeof(double)); cannam@16: cannam@16: covcol(data, n, m, symmat); cannam@16: cannam@16: /********************************************************************* cannam@16: Eigen-reduction cannam@16: **********************************************************************/ cannam@16: cannam@16: /* Allocate storage for dummy and new vectors. */ cannam@16: evals = (double*) malloc(m*sizeof(double)); /* Storage alloc. for vector of eigenvalues */ cannam@16: interm = (double*) malloc(m*sizeof(double)); /* Storage alloc. for 'intermediate' vector */ cannam@16: //MALLOC_ARRAY(symmat2,m,m,double); cannam@16: //for (i = 0; i < m; i++) { cannam@16: // for (j = 0; j < m; j++) { cannam@16: // symmat2[i][j] = symmat[i][j]; /* Needed below for col. projections */ cannam@16: // } cannam@16: //} cannam@16: tred2(symmat, m, evals, interm); /* Triangular decomposition */ cannam@16: tqli(evals, interm, m, symmat); /* Reduction of sym. trid. matrix */ cannam@16: /* evals now contains the eigenvalues, cannam@16: columns of symmat now contain the associated eigenvectors. */ cannam@16: cannam@16: /* cannam@16: printf("\nEigenvalues:\n"); cannam@16: for (j = m-1; j >= 0; j--) { cannam@16: printf("%18.5f\n", evals[j]); } cannam@16: printf("\n(Eigenvalues should be strictly positive; limited\n"); cannam@16: printf("precision machine arithmetic may affect this.\n"); cannam@16: printf("Eigenvalues are often expressed as cumulative\n"); cannam@16: printf("percentages, representing the 'percentage variance\n"); cannam@16: printf("explained' by the associated axis or principal component.)\n"); cannam@16: cannam@16: printf("\nEigenvectors:\n"); cannam@16: printf("(First three; their definition in terms of original vbes.)\n"); cannam@16: for (j = 0; j < m; j++) { cannam@16: for (i = 1; i <= 3; i++) { cannam@16: printf("%12.4f", symmat[j][m-i]); } cannam@16: printf("\n"); } cannam@16: */ cannam@16: cannam@16: /* Form projections of row-points on prin. components. */ cannam@16: /* Store in 'data', overwriting original data. */ cannam@16: for (i = 0; i < n; i++) { cannam@16: for (j = 0; j < m; j++) { cannam@16: interm[j] = data[i][j]; } /* data[i][j] will be overwritten */ cannam@16: for (k = 0; k < ncomponents; k++) { cannam@16: data[i][k] = 0.0; cannam@16: for (k2 = 0; k2 < m; k2++) { cannam@16: data[i][k] += interm[k2] * symmat[k2][m-k-1]; } cannam@16: } cannam@16: } cannam@16: cannam@16: /* cannam@16: printf("\nProjections of row-points on first 3 prin. comps.:\n"); cannam@16: for (i = 0; i < n; i++) { cannam@16: for (j = 0; j < 3; j++) { cannam@16: printf("%12.4f", data[i][j]); } cannam@16: printf("\n"); } cannam@16: */ cannam@16: cannam@16: /* Form projections of col.-points on first three prin. components. */ cannam@16: /* Store in 'symmat2', overwriting what was stored in this. */ cannam@16: //for (j = 0; j < m; j++) { cannam@16: // for (k = 0; k < m; k++) { cannam@16: // interm[k] = symmat2[j][k]; } /*symmat2[j][k] will be overwritten*/ cannam@16: // for (i = 0; i < 3; i++) { cannam@16: // symmat2[j][i] = 0.0; cannam@16: // for (k2 = 0; k2 < m; k2++) { cannam@16: // symmat2[j][i] += interm[k2] * symmat[k2][m-i-1]; } cannam@16: // if (evals[m-i-1] > 0.0005) /* Guard against zero eigenvalue */ cannam@16: // symmat2[j][i] /= sqrt(evals[m-i-1]); /* Rescale */ cannam@16: // else cannam@16: // symmat2[j][i] = 0.0; /* Standard kludge */ cannam@16: // } cannam@16: // } cannam@16: cannam@16: /* cannam@16: printf("\nProjections of column-points on first 3 prin. comps.:\n"); cannam@16: for (j = 0; j < m; j++) { cannam@16: for (k = 0; k < 3; k++) { cannam@16: printf("%12.4f", symmat2[j][k]); } cannam@16: printf("\n"); } cannam@16: */ cannam@16: cannam@16: cannam@16: for (i = 0; i < m; i++) cannam@16: free(symmat[i]); cannam@16: free(symmat); cannam@16: //FREE_ARRAY(symmat2,m); cannam@16: free(evals); cannam@16: free(interm); cannam@16: cannam@16: } cannam@16: cannam@16: cannam@16: