#!/usr/bin/python
#
# Copyright (C) Christian Thurau, 2010.
# Licensed under the GNU General Public License (GPL).
# http://www.gnu.org/licenses/gpl.txt
"""
PyMF Archetypal Analysis [1]

AA: class for Archetypal Analysis

[1] Cutler, A., Breiman, L. (1994), "Archetypal Analysis", Technometrics 36(4),
338-347.
"""

import numpy as np
from dist import vq
from cvxopt import solvers, base

from svd import pinv
from nmf import NMF

__all__ = ["AA"]

class AA(NMF):
    """
    AA(data, num_bases=4)

    Archetypal Analysis. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | = | data - data*beta*H | is minimal. H and beta are
    restricted to convex combinations, i.e. all entries are >= 0, every
    column of H sums to 1, and every row of beta sums to 1. The factorization
    is solved via alternating least squares, using the quadratic programming
    solver from cvxopt for each constrained subproblem.

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases : int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num_bases x num_samples" matrix of coefficients
    beta : "num_bases x num_samples" matrix of basis vector coefficients
        (for constructing W s.t. W = (beta * data.T).T )
    ferr : Frobenius norm of the residual (available after calling .factorize())

    Example
    -------
    Applying AA to a small toy data set:

    >>> import numpy as np
    >>> from aa import AA
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])

    Use 2 basis vectors -> W has shape (data_dimension, 2).

    >>> aa_mdl = AA(data, num_bases=2)

    Set the number of iterations to 5 and start computing the factorization.

    >>> aa_mdl.factorize(niter=5)

    The basis vectors are now stored in aa_mdl.W, the coefficients in aa_mdl.H.
    To compute coefficients for an existing set of basis vectors, simply copy W
    to aa_mdl.W and set compute_w to False:

    >>> data = np.array([[1.5], [1.2]])
    >>> W = np.array([[1.0, 0.0], [0.0, 1.0]])
    >>> aa_mdl = AA(data, num_bases=2)
    >>> aa_mdl.W = W
    >>> aa_mdl.factorize(niter=5, compute_w=False)

    The result is a set of coefficients aa_mdl.H, s.t. data = W * aa_mdl.H.
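
    Notes
    -----
    A rough sketch of the per-column subproblem solved in update_h() (the
    variable names below are illustrative only, not part of the class API):
    each column h = H[:,i] is the solution of a convexity-constrained least
    squares problem, passed to cvxopt as a quadratic program::

        # minimize 0.5*h'(W'W)h - (W'x)'h  s.t.  h >= 0 and sum(h) == 1
        x = data[:, i]
        P = base.matrix(np.float64(np.dot(W.T, W)))
        q = base.matrix(np.float64(-np.dot(W.T, x)))
        G = base.matrix(-np.eye(num_bases))      # -h <= 0, i.e. h >= 0
        h_ub = base.matrix(0.0, (num_bases, 1))
        A = base.matrix(1.0, (1, num_bases))     # 1'h == 1
        b = base.matrix(1.0, (1, 1))
        h = np.array(solvers.qp(P, q, G, h_ub, A, b)['x']).ravel()

    update_w() solves the analogous problem for every row of beta, with the
    data matrix taking the role of W and a least squares estimate of the
    archetypes (data * pinv(H)) as the target.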
mi@0: """ mi@0: # set cvxopt options mi@0: solvers.options['show_progress'] = False mi@0: mi@0: def init_h(self): mi@0: self.H = np.random.random((self._num_bases, self._num_samples)) mi@0: self.H /= self.H.sum(axis=0) mi@0: mi@0: def init_w(self): mi@0: self.beta = np.random.random((self._num_bases, self._num_samples)) mi@0: self.beta /= self.beta.sum(axis=0) mi@0: self.W = np.dot(self.beta, self.data.T).T mi@0: self.W = np.random.random((self._data_dimension, self._num_bases)) mi@0: mi@0: def update_h(self): mi@0: """ alternating least squares step, update H under the convexity mi@0: constraint """ mi@0: def update_single_h(i): mi@0: """ compute single H[:,i] """ mi@0: # optimize alpha using qp solver from cvxopt mi@0: FA = base.matrix(np.float64(np.dot(-self.W.T, self.data[:,i]))) mi@0: al = solvers.qp(HA, FA, INQa, INQb, EQa, EQb) mi@0: self.H[:,i] = np.array(al['x']).reshape((1, self._num_bases)) mi@0: mi@0: EQb = base.matrix(1.0, (1,1)) mi@0: # float64 required for cvxopt mi@0: HA = base.matrix(np.float64(np.dot(self.W.T, self.W))) mi@0: INQa = base.matrix(-np.eye(self._num_bases)) mi@0: INQb = base.matrix(0.0, (self._num_bases,1)) mi@0: EQa = base.matrix(1.0, (1, self._num_bases)) mi@0: mi@0: for i in xrange(self._num_samples): mi@0: update_single_h(i) mi@0: mi@0: def update_w(self): mi@0: """ alternating least squares step, update W under the convexity mi@0: constraint """ mi@0: def update_single_w(i): mi@0: """ compute single W[:,i] """ mi@0: # optimize beta using qp solver from cvxopt mi@0: FB = base.matrix(np.float64(np.dot(-self.data.T, W_hat[:,i]))) mi@0: be = solvers.qp(HB, FB, INQa, INQb, EQa, EQb) mi@0: self.beta[i,:] = np.array(be['x']).reshape((1, self._num_samples)) mi@0: mi@0: # float64 required for cvxopt mi@0: HB = base.matrix(np.float64(np.dot(self.data[:,:].T, self.data[:,:]))) mi@0: EQb = base.matrix(1.0, (1, 1)) mi@0: W_hat = np.dot(self.data, pinv(self.H)) mi@0: INQa = base.matrix(-np.eye(self._num_samples)) mi@0: INQb = base.matrix(0.0, (self._num_samples, 1)) mi@0: EQa = base.matrix(1.0, (1, self._num_samples)) mi@0: mi@0: for i in xrange(self._num_bases): mi@0: update_single_w(i) mi@0: mi@0: self.W = np.dot(self.beta, self.data.T).T mi@0: mi@0: if __name__ == "__main__": mi@0: import doctest mi@0: doctest.testmod()