mi@0: #!/usr/bin/python mi@0: # mi@0: # Copyright (C) Christian Thurau, 2010. mi@0: # Licensed under the GNU General Public License (GPL). mi@0: # http://www.gnu.org/licenses/gpl.txt mi@0: """ mi@0: PyMF Principal Component Analysis. mi@0: mi@0: PCA: Class for Principal Component Analysis mi@0: """ mi@0: mi@0: mi@0: mi@0: import numpy as np mi@0: mi@0: from nmf import NMF mi@0: from svd import SVD mi@0: mi@0: mi@0: __all__ = ["PCA"] mi@0: mi@0: class PCA(NMF): mi@0: """ mi@0: PCA(data, num_bases=4, center_mean=True) mi@0: mi@0: mi@0: Archetypal Analysis. Factorize a data matrix into two matrices s.t. mi@0: F = | data - W*H | is minimal. W is set to the eigenvectors of the mi@0: data covariance. mi@0: mi@0: Parameters mi@0: ---------- mi@0: data : array_like, shape (_data_dimension, _num_samples) mi@0: the input data mi@0: num_bases: int, optional mi@0: Number of bases to compute (column rank of W and row rank of H). mi@0: 4 (default) mi@0: center_mean: bool, True mi@0: Make sure that the data is centred around the mean. mi@0: mi@0: Attributes mi@0: ---------- mi@0: W : "data_dimension x num_bases" matrix of basis vectors mi@0: H : "num bases x num_samples" matrix of coefficients mi@0: ferr : frobenius norm (after calling .factorize()) mi@0: mi@0: Example mi@0: ------- mi@0: Applying PCA to some rather stupid data set: mi@0: mi@0: >>> import numpy as np mi@0: >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]]) mi@0: >>> pca_mdl = PCA(data, num_bases=2) mi@0: >>> pca_mdl.factorize() mi@0: mi@0: The basis vectors are now stored in pca_mdl.W, the coefficients in pca_mdl.H. mi@0: To compute coefficients for an existing set of basis vectors simply copy W mi@0: to pca_mdl.W, and set compute_w to False: mi@0: mi@0: >>> data = np.array([[1.5], [1.2]]) mi@0: >>> W = np.array([[1.0, 0.0], [0.0, 1.0]]) mi@0: >>> pca_mdl = PCA(data, num_bases=2) mi@0: >>> pca_mdl.W = W mi@0: >>> pca_mdl.factorize(compute_w=False) mi@0: mi@0: The result is a set of coefficients pca_mdl.H, s.t. data = W * pca_mdl.H. mi@0: """ mi@0: mi@0: def __init__(self, data, num_bases=0, center_mean=True): mi@0: mi@0: NMF.__init__(self, data, num_bases=num_bases) mi@0: mi@0: # center the data around the mean first mi@0: self._center_mean = center_mean mi@0: mi@0: if self._center_mean: mi@0: # copy the data before centering it mi@0: self._data_orig = data mi@0: self._meanv = self._data_orig[:,:].mean(axis=1).reshape(data.shape[0],-1) mi@0: self.data = self._data_orig - self._meanv mi@0: else: mi@0: self.data = data mi@0: mi@0: def init_h(self): mi@0: pass mi@0: mi@0: def init_w(self): mi@0: pass mi@0: mi@0: def update_h(self): mi@0: self.H = np.dot(self.W.T, self.data[:,:]) mi@0: mi@0: def update_w(self): mi@0: # compute eigenvectors and eigenvalues using SVD mi@0: svd_mdl = SVD(self.data) mi@0: svd_mdl.factorize() mi@0: mi@0: # argsort sorts in ascending order -> do reverese indexing mi@0: # for accesing values in descending order mi@0: S = np.diag(svd_mdl.S) mi@0: order = np.argsort(S)[::-1] mi@0: mi@0: # select only a few eigenvectors ... mi@0: if self._num_bases >0: mi@0: order = order[:self._num_bases] mi@0: mi@0: self.W = svd_mdl.U[:,order] mi@0: self.eigenvalues = S[order] mi@0: mi@0: def factorize(self, show_progress=False, compute_w=True, compute_h=True, mi@0: compute_err=True, niter=1): mi@0: """ Factorize s.t. WH = data mi@0: mi@0: Parameters mi@0: ---------- mi@0: show_progress : bool mi@0: print some extra information to stdout. mi@0: compute_h : bool mi@0: iteratively update values for H. mi@0: compute_w : bool mi@0: iteratively update values for W. mi@0: compute_err : bool mi@0: compute Frobenius norm |data-WH| after each update and store mi@0: it to .ferr[k]. mi@0: mi@0: Updated Values mi@0: -------------- mi@0: .W : updated values for W. mi@0: .H : updated values for H. mi@0: .ferr : Frobenius norm |data-WH|. mi@0: """ mi@0: mi@0: NMF.factorize(self, niter=1, show_progress=show_progress, mi@0: compute_w=compute_w, compute_h=compute_h, mi@0: compute_err=compute_err) mi@0: mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()