annotate pymf/kmeans.py @ 19:890cfe424f4a tip

added annotations
author mitian
date Fri, 11 Dec 2015 09:47:40 +0000
parents 26838b1f560f
children
rev   line source
mi@0 1 #!/usr/bin/python
mi@0 2 #
mi@0 3 # Copyright (C) Christian Thurau, 2010.
mi@0 4 # Licensed under the GNU General Public License (GPL).
mi@0 5 # http://www.gnu.org/licenses/gpl.txt
mi@0 6 """
mi@0 7 PyMF K-means clustering (unary-convex matrix factorization).
mi@0 8 """
mi@0 9
mi@0 10
mi@0 11 import numpy as np
mi@0 12 import random
mi@0 13
mi@0 14 import dist
mi@0 15 from nmf import NMF
mi@0 16
mi@0 17 __all__ = ["Kmeans"]
mi@0 18
class Kmeans(NMF):
    """
    Kmeans(data, num_bases=4)

    K-means clustering. Factorize a data matrix into two matrices s.t.
    F = | data - W*H | is minimal. H is restricted to unary vectors, W
    is simply the mean over the corresponding samples in "data".

    Parameters
    ----------
    data : array_like, shape (_data_dimension, _num_samples)
        the input data
    num_bases: int, optional
        Number of bases to compute (column rank of W and row rank of H).
        4 (default)

    Attributes
    ----------
    W : "data_dimension x num_bases" matrix of basis vectors
    H : "num bases x num_samples" matrix of coefficients
    ferr : frobenius norm (after calling .factorize())

    Example
    -------
    Applying K-means to some rather stupid data set:

    >>> import numpy as np
    >>> data = np.array([[1.0, 0.0, 2.0], [0.0, 1.0, 1.0]])
    >>> kmeans_mdl = Kmeans(data, num_bases=2)
    >>> kmeans_mdl.factorize(niter=10)

    The basis vectors are now stored in kmeans_mdl.W, the coefficients in kmeans_mdl.H.
    To compute coefficients for an existing set of basis vectors simply copy W
    to kmeans_mdl.W, and set compute_w to False:

    >>> data = np.array([[1.5], [1.2]])
    >>> W = [[1.0, 0.0], [0.0, 1.0]]
    >>> kmeans_mdl = Kmeans(data, num_bases=2)
    >>> kmeans_mdl.W = W
    >>> kmeans_mdl.factorize(niter=1, compute_w=False)

    The result is a set of coefficients kmeans_mdl.H, s.t. data = W * kmeans_mdl.H.
    """
    def init_h(self):
        """Initialize H by assigning every sample to the current centers.

        W has to be present before H can be initialized, since the
        initial assignment is computed against it via update_h().
        """
        self.H = np.zeros((self._num_bases, self._num_samples))
        self.update_h()

    def init_w(self):
        """Initialize W with randomly selected data samples (cluster seeds)."""
        # BUGFIX: the original used xrange(), which is Python 2 only and
        # raises NameError under Python 3; range() behaves identically for
        # random.sample() on both versions.
        sel = random.sample(range(self._num_samples), self._num_bases)

        # sort indices, otherwise h5py won't work
        self.W = self.data[:, np.sort(sel)]

    def update_h(self):
        """Assign each sample to its closest center and rebuild unary H."""
        # dist.vq yields, for every column of data, the index of the
        # best-matching column of W (vector quantization).
        self.assigned = dist.vq(self.W, self.data)
        self.H = np.zeros(self.H.shape)
        # one-hot columns: H[k, j] == 1.0 iff sample j belongs to center k
        self.H[self.assigned, range(self._num_samples)] = 1.0

    def update_w(self):
        """Move every non-empty cluster's center to the mean of its members."""
        for i in range(self._num_bases):
            # samples currently assigned to center i
            idx = np.where(self.assigned == i)[0]
            n = len(idx)
            # BUGFIX: was `n > 1`, which left the center of a singleton
            # cluster frozen at its previous position; the standard k-means
            # mean update applies to any non-empty cluster. Empty clusters
            # (n == 0) keep their center unchanged, as before.
            if n > 0:
                self.W[:, i] = np.sum(self.data[:, idx], axis=1) / n