c@241
|
1 /*********************************/
|
c@241
|
2 /* Principal Components Analysis */
|
c@241
|
3 /*********************************/
|
c@241
|
4
|
c@241
|
5 /*********************************************************************/
|
c@241
|
6 /* Principal Components Analysis or the Karhunen-Loeve expansion is a
|
c@241
|
7 classical method for dimensionality reduction or exploratory data
|
c@241
|
8 analysis. One reference among many is: F. Murtagh and A. Heck,
|
c@241
|
9 Multivariate Data Analysis, Kluwer Academic, Dordrecht, 1987.
|
c@241
|
10
|
c@241
|
11 Author:
|
c@241
|
12 F. Murtagh
|
c@241
|
13 Phone: + 49 89 32006298 (work)
|
c@241
|
14 + 49 89 965307 (home)
|
c@241
|
15 Earn/Bitnet: fionn@dgaeso51, fim@dgaipp1s, murtagh@stsci
|
c@241
|
16 Span: esomc1::fionn
|
c@241
|
17 Internet: murtagh@scivax.stsci.edu
|
c@241
|
18
|
c@241
|
19 F. Murtagh, Munich, 6 June 1989 */
|
c@241
|
20 /*********************************************************************/
|
c@241
|
21
|
c@241
|
22 #include <stdio.h>
|
c@241
|
23 #include <stdlib.h>
|
c@241
|
24 #include <math.h>
|
c@241
|
25
|
c@241
|
26 #include "pca.h"
|
c@241
|
27
|
c@241
|
28 #define SIGN(a, b) ( (b) < 0 ? -fabs(a) : fabs(a) )
|
c@241
|
29
|
c@241
|
30 /** Variance-covariance matrix: creation *****************************/
|
c@241
|
31
|
c@241
|
32 /* Create m * m covariance matrix from given n * m data matrix. */
|
c@241
|
33 void covcol(double** data, int n, int m, double** symmat)
|
c@241
|
34 {
|
c@241
|
35 double *mean;
|
c@241
|
36 int i, j, j1, j2;
|
c@241
|
37
|
c@241
|
38 /* Allocate storage for mean vector */
|
c@241
|
39
|
c@241
|
40 mean = (double*) malloc(m*sizeof(double));
|
c@241
|
41
|
c@241
|
42 /* Determine mean of column vectors of input data matrix */
|
c@241
|
43
|
c@241
|
44 for (j = 0; j < m; j++)
|
c@241
|
45 {
|
c@241
|
46 mean[j] = 0.0;
|
c@241
|
47 for (i = 0; i < n; i++)
|
c@241
|
48 {
|
c@241
|
49 mean[j] += data[i][j];
|
c@241
|
50 }
|
c@241
|
51 mean[j] /= (double)n;
|
c@241
|
52 }
|
c@241
|
53
|
c@241
|
54 /*
|
c@241
|
55 printf("\nMeans of column vectors:\n");
|
c@241
|
56 for (j = 0; j < m; j++) {
|
c@241
|
57 printf("%12.1f",mean[j]); } printf("\n");
|
c@241
|
58 */
|
c@241
|
59
|
c@241
|
60 /* Center the column vectors. */
|
c@241
|
61
|
c@241
|
62 for (i = 0; i < n; i++)
|
c@241
|
63 {
|
c@241
|
64 for (j = 0; j < m; j++)
|
c@241
|
65 {
|
c@241
|
66 data[i][j] -= mean[j];
|
c@241
|
67 }
|
c@241
|
68 }
|
c@241
|
69
|
c@241
|
70 /* Calculate the m * m covariance matrix. */
|
c@241
|
71 for (j1 = 0; j1 < m; j1++)
|
c@241
|
72 {
|
c@241
|
73 for (j2 = j1; j2 < m; j2++)
|
c@241
|
74 {
|
c@241
|
75 symmat[j1][j2] = 0.0;
|
c@241
|
76 for (i = 0; i < n; i++)
|
c@241
|
77 {
|
c@241
|
78 symmat[j1][j2] += data[i][j1] * data[i][j2];
|
c@241
|
79 }
|
c@241
|
80 symmat[j2][j1] = symmat[j1][j2];
|
c@241
|
81 }
|
c@241
|
82 }
|
c@241
|
83
|
c@241
|
84 free(mean);
|
c@241
|
85
|
c@241
|
86 return;
|
c@241
|
87
|
c@241
|
88 }
|
c@241
|
89
|
c@241
|
90 /** Error handler **************************************************/
|
c@241
|
91
|
c@241
|
92 void erhand(char* err_msg)
|
c@241
|
93 {
|
c@241
|
94 fprintf(stderr,"Run-time error:\n");
|
c@241
|
95 fprintf(stderr,"%s\n", err_msg);
|
c@241
|
96 fprintf(stderr,"Exiting to system.\n");
|
c@241
|
97 exit(1);
|
c@241
|
98 }
|
c@241
|
99
|
c@241
|
100
|
c@241
|
101 /** Reduce a real, symmetric matrix to a symmetric, tridiag. matrix. */
|
c@241
|
102
|
c@241
|
103 /* Householder reduction of matrix a to tridiagonal form.
|
c@241
|
104 Algorithm: Martin et al., Num. Math. 11, 181-195, 1968.
|
c@241
|
105 Ref: Smith et al., Matrix Eigensystem Routines -- EISPACK Guide
|
c@241
|
106 Springer-Verlag, 1976, pp. 489-494.
|
c@241
|
107 W H Press et al., Numerical Recipes in C, Cambridge U P,
|
c@241
|
108 1988, pp. 373-374. */
|
c@241
|
109 void tred2(double** a, int n, double* d, double* e)
|
c@241
|
110 {
|
c@241
|
111 int l, k, j, i;
|
c@241
|
112 double scale, hh, h, g, f;
|
c@241
|
113
|
c@241
|
114 for (i = n-1; i >= 1; i--)
|
c@241
|
115 {
|
c@241
|
116 l = i - 1;
|
c@241
|
117 h = scale = 0.0;
|
c@241
|
118 if (l > 0)
|
c@241
|
119 {
|
c@241
|
120 for (k = 0; k <= l; k++)
|
c@241
|
121 scale += fabs(a[i][k]);
|
c@241
|
122 if (scale == 0.0)
|
c@241
|
123 e[i] = a[i][l];
|
c@241
|
124 else
|
c@241
|
125 {
|
c@241
|
126 for (k = 0; k <= l; k++)
|
c@241
|
127 {
|
c@241
|
128 a[i][k] /= scale;
|
c@241
|
129 h += a[i][k] * a[i][k];
|
c@241
|
130 }
|
c@241
|
131 f = a[i][l];
|
c@241
|
132 g = f>0 ? -sqrt(h) : sqrt(h);
|
c@241
|
133 e[i] = scale * g;
|
c@241
|
134 h -= f * g;
|
c@241
|
135 a[i][l] = f - g;
|
c@241
|
136 f = 0.0;
|
c@241
|
137 for (j = 0; j <= l; j++)
|
c@241
|
138 {
|
c@241
|
139 a[j][i] = a[i][j]/h;
|
c@241
|
140 g = 0.0;
|
c@241
|
141 for (k = 0; k <= j; k++)
|
c@241
|
142 g += a[j][k] * a[i][k];
|
c@241
|
143 for (k = j+1; k <= l; k++)
|
c@241
|
144 g += a[k][j] * a[i][k];
|
c@241
|
145 e[j] = g / h;
|
c@241
|
146 f += e[j] * a[i][j];
|
c@241
|
147 }
|
c@241
|
148 hh = f / (h + h);
|
c@241
|
149 for (j = 0; j <= l; j++)
|
c@241
|
150 {
|
c@241
|
151 f = a[i][j];
|
c@241
|
152 e[j] = g = e[j] - hh * f;
|
c@241
|
153 for (k = 0; k <= j; k++)
|
c@241
|
154 a[j][k] -= (f * e[k] + g * a[i][k]);
|
c@241
|
155 }
|
c@241
|
156 }
|
c@241
|
157 }
|
c@241
|
158 else
|
c@241
|
159 e[i] = a[i][l];
|
c@241
|
160 d[i] = h;
|
c@241
|
161 }
|
c@241
|
162 d[0] = 0.0;
|
c@241
|
163 e[0] = 0.0;
|
c@241
|
164 for (i = 0; i < n; i++)
|
c@241
|
165 {
|
c@241
|
166 l = i - 1;
|
c@241
|
167 if (d[i])
|
c@241
|
168 {
|
c@241
|
169 for (j = 0; j <= l; j++)
|
c@241
|
170 {
|
c@241
|
171 g = 0.0;
|
c@241
|
172 for (k = 0; k <= l; k++)
|
c@241
|
173 g += a[i][k] * a[k][j];
|
c@241
|
174 for (k = 0; k <= l; k++)
|
c@241
|
175 a[k][j] -= g * a[k][i];
|
c@241
|
176 }
|
c@241
|
177 }
|
c@241
|
178 d[i] = a[i][i];
|
c@241
|
179 a[i][i] = 1.0;
|
c@241
|
180 for (j = 0; j <= l; j++)
|
c@241
|
181 a[j][i] = a[i][j] = 0.0;
|
c@241
|
182 }
|
c@241
|
183 }
|
c@241
|
184
|
c@241
|
185 /** Tridiagonal QL algorithm -- Implicit **********************/
|
c@241
|
186
|
c@241
|
187 void tqli(double* d, double* e, int n, double** z)
|
c@241
|
188 {
|
c@241
|
189 int m, l, iter, i, k;
|
c@241
|
190 double s, r, p, g, f, dd, c, b;
|
c@241
|
191
|
c@241
|
192 for (i = 1; i < n; i++)
|
c@241
|
193 e[i-1] = e[i];
|
c@241
|
194 e[n-1] = 0.0;
|
c@241
|
195 for (l = 0; l < n; l++)
|
c@241
|
196 {
|
c@241
|
197 iter = 0;
|
c@241
|
198 do
|
c@241
|
199 {
|
c@241
|
200 for (m = l; m < n-1; m++)
|
c@241
|
201 {
|
c@241
|
202 dd = fabs(d[m]) + fabs(d[m+1]);
|
c@241
|
203 if (fabs(e[m]) + dd == dd) break;
|
c@241
|
204 }
|
c@241
|
205 if (m != l)
|
c@241
|
206 {
|
c@241
|
207 if (iter++ == 30) erhand("No convergence in TLQI.");
|
c@241
|
208 g = (d[l+1] - d[l]) / (2.0 * e[l]);
|
c@241
|
209 r = sqrt((g * g) + 1.0);
|
c@241
|
210 g = d[m] - d[l] + e[l] / (g + SIGN(r, g));
|
c@241
|
211 s = c = 1.0;
|
c@241
|
212 p = 0.0;
|
c@241
|
213 for (i = m-1; i >= l; i--)
|
c@241
|
214 {
|
c@241
|
215 f = s * e[i];
|
c@241
|
216 b = c * e[i];
|
c@241
|
217 if (fabs(f) >= fabs(g))
|
c@241
|
218 {
|
c@241
|
219 c = g / f;
|
c@241
|
220 r = sqrt((c * c) + 1.0);
|
c@241
|
221 e[i+1] = f * r;
|
c@241
|
222 c *= (s = 1.0/r);
|
c@241
|
223 }
|
c@241
|
224 else
|
c@241
|
225 {
|
c@241
|
226 s = f / g;
|
c@241
|
227 r = sqrt((s * s) + 1.0);
|
c@241
|
228 e[i+1] = g * r;
|
c@241
|
229 s *= (c = 1.0/r);
|
c@241
|
230 }
|
c@241
|
231 g = d[i+1] - p;
|
c@241
|
232 r = (d[i] - g) * s + 2.0 * c * b;
|
c@241
|
233 p = s * r;
|
c@241
|
234 d[i+1] = g + p;
|
c@241
|
235 g = c * r - b;
|
c@241
|
236 for (k = 0; k < n; k++)
|
c@241
|
237 {
|
c@241
|
238 f = z[k][i+1];
|
c@241
|
239 z[k][i+1] = s * z[k][i] + c * f;
|
c@241
|
240 z[k][i] = c * z[k][i] - s * f;
|
c@241
|
241 }
|
c@241
|
242 }
|
c@241
|
243 d[l] = d[l] - p;
|
c@241
|
244 e[l] = g;
|
c@241
|
245 e[m] = 0.0;
|
c@241
|
246 }
|
c@241
|
247 } while (m != l);
|
c@241
|
248 }
|
c@241
|
249 }
|
c@241
|
250
|
c@241
|
251 /* In place projection onto basis vectors */
|
c@241
|
252 void pca_project(double** data, int n, int m, int ncomponents)
|
c@241
|
253 {
|
c@241
|
254 int i, j, k, k2;
|
c@241
|
255 double **symmat, **symmat2, *evals, *interm;
|
c@241
|
256
|
c@241
|
257 //TODO: assert ncomponents < m
|
c@241
|
258
|
c@241
|
259 symmat = (double**) malloc(m*sizeof(double*));
|
c@241
|
260 for (i = 0; i < m; i++)
|
c@241
|
261 symmat[i] = (double*) malloc(m*sizeof(double));
|
c@241
|
262
|
c@241
|
263 covcol(data, n, m, symmat);
|
c@241
|
264
|
c@241
|
265 /*********************************************************************
|
c@241
|
266 Eigen-reduction
|
c@241
|
267 **********************************************************************/
|
c@241
|
268
|
c@241
|
269 /* Allocate storage for dummy and new vectors. */
|
c@241
|
270 evals = (double*) malloc(m*sizeof(double)); /* Storage alloc. for vector of eigenvalues */
|
c@241
|
271 interm = (double*) malloc(m*sizeof(double)); /* Storage alloc. for 'intermediate' vector */
|
c@241
|
272 //MALLOC_ARRAY(symmat2,m,m,double);
|
c@241
|
273 //for (i = 0; i < m; i++) {
|
c@241
|
274 // for (j = 0; j < m; j++) {
|
c@241
|
275 // symmat2[i][j] = symmat[i][j]; /* Needed below for col. projections */
|
c@241
|
276 // }
|
c@241
|
277 //}
|
c@241
|
278 tred2(symmat, m, evals, interm); /* Triangular decomposition */
|
c@241
|
279 tqli(evals, interm, m, symmat); /* Reduction of sym. trid. matrix */
|
c@241
|
280 /* evals now contains the eigenvalues,
|
c@241
|
281 columns of symmat now contain the associated eigenvectors. */
|
c@241
|
282
|
c@241
|
283 /*
|
c@241
|
284 printf("\nEigenvalues:\n");
|
c@241
|
285 for (j = m-1; j >= 0; j--) {
|
c@241
|
286 printf("%18.5f\n", evals[j]); }
|
c@241
|
287 printf("\n(Eigenvalues should be strictly positive; limited\n");
|
c@241
|
288 printf("precision machine arithmetic may affect this.\n");
|
c@241
|
289 printf("Eigenvalues are often expressed as cumulative\n");
|
c@241
|
290 printf("percentages, representing the 'percentage variance\n");
|
c@241
|
291 printf("explained' by the associated axis or principal component.)\n");
|
c@241
|
292
|
c@241
|
293 printf("\nEigenvectors:\n");
|
c@241
|
294 printf("(First three; their definition in terms of original vbes.)\n");
|
c@241
|
295 for (j = 0; j < m; j++) {
|
c@241
|
296 for (i = 1; i <= 3; i++) {
|
c@241
|
297 printf("%12.4f", symmat[j][m-i]); }
|
c@241
|
298 printf("\n"); }
|
c@241
|
299 */
|
c@241
|
300
|
c@241
|
301 /* Form projections of row-points on prin. components. */
|
c@241
|
302 /* Store in 'data', overwriting original data. */
|
c@241
|
303 for (i = 0; i < n; i++) {
|
c@241
|
304 for (j = 0; j < m; j++) {
|
c@241
|
305 interm[j] = data[i][j]; } /* data[i][j] will be overwritten */
|
c@241
|
306 for (k = 0; k < ncomponents; k++) {
|
c@241
|
307 data[i][k] = 0.0;
|
c@241
|
308 for (k2 = 0; k2 < m; k2++) {
|
c@241
|
309 data[i][k] += interm[k2] * symmat[k2][m-k-1]; }
|
c@241
|
310 }
|
c@241
|
311 }
|
c@241
|
312
|
c@241
|
313 /*
|
c@241
|
314 printf("\nProjections of row-points on first 3 prin. comps.:\n");
|
c@241
|
315 for (i = 0; i < n; i++) {
|
c@241
|
316 for (j = 0; j < 3; j++) {
|
c@241
|
317 printf("%12.4f", data[i][j]); }
|
c@241
|
318 printf("\n"); }
|
c@241
|
319 */
|
c@241
|
320
|
c@241
|
321 /* Form projections of col.-points on first three prin. components. */
|
c@241
|
322 /* Store in 'symmat2', overwriting what was stored in this. */
|
c@241
|
323 //for (j = 0; j < m; j++) {
|
c@241
|
324 // for (k = 0; k < m; k++) {
|
c@241
|
325 // interm[k] = symmat2[j][k]; } /*symmat2[j][k] will be overwritten*/
|
c@241
|
326 // for (i = 0; i < 3; i++) {
|
c@241
|
327 // symmat2[j][i] = 0.0;
|
c@241
|
328 // for (k2 = 0; k2 < m; k2++) {
|
c@241
|
329 // symmat2[j][i] += interm[k2] * symmat[k2][m-i-1]; }
|
c@241
|
330 // if (evals[m-i-1] > 0.0005) /* Guard against zero eigenvalue */
|
c@241
|
331 // symmat2[j][i] /= sqrt(evals[m-i-1]); /* Rescale */
|
c@241
|
332 // else
|
c@241
|
333 // symmat2[j][i] = 0.0; /* Standard kludge */
|
c@241
|
334 // }
|
c@241
|
335 // }
|
c@241
|
336
|
c@241
|
337 /*
|
c@241
|
338 printf("\nProjections of column-points on first 3 prin. comps.:\n");
|
c@241
|
339 for (j = 0; j < m; j++) {
|
c@241
|
340 for (k = 0; k < 3; k++) {
|
c@241
|
341 printf("%12.4f", symmat2[j][k]); }
|
c@241
|
342 printf("\n"); }
|
c@241
|
343 */
|
c@241
|
344
|
c@241
|
345
|
c@241
|
346 for (i = 0; i < m; i++)
|
c@241
|
347 free(symmat[i]);
|
c@241
|
348 free(symmat);
|
c@241
|
349 //FREE_ARRAY(symmat2,m);
|
c@241
|
350 free(evals);
|
c@241
|
351 free(interm);
|
c@241
|
352
|
c@241
|
353 }
|
c@241
|
354
|
c@241
|
355
|
c@241
|
356
|