#!/usr/bin/env python
"""Class that implements X-means (Pelleg & Moore, 2000)."""

import sys, os
import argparse
import numpy as np
import logging
import time
import scipy.cluster.vq as vq
from scipy.spatial import distance


class XMeans(object):
    """Estimates the number of clusters K in a data set, either with the
    X-means algorithm (Pelleg & Moore, 2000) or with a BIC knee-point sweep
    over plain K-means runs."""

    def __init__(self, X, init_K=2, plot=False):
        """
        Parameters
        ----------
        X : np.ndarray, shape (N, M)
            Data set, one observation per row.
        init_K : int
            Initial number of clusters for X-means.
        plot : bool
            If True, plot intermediate/final results (requires pylab).
        """
        self.X = X
        self.init_K = init_K
        self.plot = plot

    def estimate_K_xmeans(self, th=0.2, maxK=10):
        """Estimates K by running the X-means algorithm.

        Starting from ``init_K`` clusters, each cluster is tentatively split
        in two; the split is kept when the normalized BIC difference between
        the one- and two-cluster models exceeds ``th``.

        Parameters
        ----------
        th : float
            Normalized-BIC difference threshold above which a split is kept.
        maxK : int
            Maximum number of clusters to estimate.

        Returns
        -------
        int
            Estimated number of clusters.
        """
        # Run initial K-means
        means, labels = self.run_kmeans(self.X, self.init_K)

        # Run X-means: keep splitting until no cluster improves the BIC
        # enough, or until maxK is reached.
        stop = False
        curr_K = self.init_K
        while not stop:
            stop = True
            final_means = []
            for k in range(curr_K):
                # Find the data that corresponds to the k-th cluster
                D = self.get_clustered_data(self.X, labels, k)
                if len(D) == 0 or D.shape[0] == 1:
                    continue

                # Whiten and find whitened mean
                stdD = np.std(D, axis=0)
                # Guard against constant dimensions (std == 0): avoid a
                # division by zero when whitening a degenerate cluster.
                stdD[stdD == 0] = 1.
                D /= stdD  # Same as vq.whiten(D)
                mean = D.mean(axis=0)

                # Cluster this subspace by half (K=2)
                half_means, half_labels = self.run_kmeans(D, K=2)

                # Compute BICs of the one-cluster and two-cluster models
                bic1 = self.compute_bic(D, [mean], K=1,
                                        labels=np.zeros(D.shape[0]),
                                        R=D.shape[0])
                bic2 = self.compute_bic(D, half_means, K=2,
                                        labels=half_labels, R=D.shape[0])

                # Split or not: compare the BICs on a common [0, 1] scale
                max_bic = np.max([np.abs(bic1), np.abs(bic2)])
                norm_bic1 = bic1 / max_bic
                norm_bic2 = bic2 / max_bic
                diff_bic = np.abs(norm_bic1 - norm_bic2)

                # Split!
                print("diff_bic", diff_bic)
                if diff_bic > th:
                    # Un-whiten the two new centroids before keeping them
                    final_means.append(half_means[0] * stdD)
                    final_means.append(half_means[1] * stdD)
                    curr_K += 1
                    stop = False
                # Don't split
                else:
                    final_means.append(mean * stdD)

            final_means = np.asarray(final_means)

            print("Estimated K: ", curr_K)
            if self.plot:
                import pylab as plt  # Lazy import: only needed when plotting
                plt.scatter(self.X[:, 0], self.X[:, 1])
                plt.scatter(final_means[:, 0], final_means[:, 1], color="y")
                plt.show()

            if curr_K >= maxK or self.X.shape[-1] != final_means.shape[-1]:
                stop = True
            else:
                # Re-assign all points to the new centroids and iterate
                labels, dist = vq.vq(self.X, final_means)

        return curr_K

    def estimate_K_knee(self, th=.015, maxK=12):
        """Estimates K using K-means and BIC, by sweeping various K and
        choosing the knee point of the BIC curve.

        Parameters
        ----------
        th : float
            Threshold on the (min-shifted) BIC increment below which the
            sweep stops and the current K is returned.
        maxK : int
            Maximum K to try (clamped to the number of data points).

        Returns
        -------
        int
            Estimated number of clusters.
        """
        # Sweep K-means for K = 1 .. maxK-1
        if self.X.shape[0] < maxK:
            maxK = self.X.shape[0]
        if maxK < 2:
            maxK = 2
        K = np.arange(1, maxK)
        bics = []
        for k in K:
            means, labels = self.run_kmeans(self.X, k)
            print('means, labels', means.shape, len(labels))
            bic = self.compute_bic(self.X, means, labels, K=k,
                                   R=self.X.shape[0])
            bics.append(bic)
        diff_bics = np.diff(bics)
        finalK = K[-1]

        if len(bics) == 1:
            finalK = 2
        else:
            # Normalize: shift both curves so their minimum is at zero
            bics = np.asarray(bics)
            bics -= bics.min()
            diff_bics -= diff_bics.min()

            # Find optimum K: first K>1 whose BIC increment drops below th
            for i in range(len(K[:-1])):
                if diff_bics[i] < th and K[i] != 1:
                    finalK = K[i]
                    break

        logging.info("Estimated Unique Number of Segments: %d" % finalK)
        if self.plot:
            import pylab as plt  # Lazy import: only needed when plotting
            plt.subplot(2, 1, 1)
            plt.plot(K, bics, label="BIC")
            plt.plot(K[:-1], diff_bics, label="BIC diff")
            plt.legend(loc=2)
            plt.subplot(2, 1, 2)
            plt.scatter(self.X[:, 0], self.X[:, 1])
            plt.show()

        return finalK

    def get_clustered_data(self, X, labels, label_index):
        """Returns the rows of X assigned to cluster ``label_index``,
        using the previously learned labels."""
        D = X[np.argwhere(labels == label_index)]
        # argwhere indexing yields shape (n, 1, M); flatten the middle axis
        return D.reshape((D.shape[0], D.shape[-1]))

    def run_kmeans(self, X, K):
        """Runs k-means on whitened data.

        Returns
        -------
        (means, labels) : centroids in whitened space and per-row labels.
        """
        wX = vq.whiten(X)
        means, dist = vq.kmeans(wX, K, iter=100)
        labels, dist = vq.vq(wX, means)
        return means, labels

    def compute_bic(self, D, means, labels, K, R):
        """Computes the Bayesian Information Criterion of a clustering.

        Parameters
        ----------
        D : np.ndarray
            Data (whitened internally before scoring).
        means : sequence of centroids.
        labels : cluster assignment for each row of D.
        K : int, number of clusters.
        R : int, total number of data points.

        Returns
        -------
        float (or 1 in the degenerate case R == K).
        """
        D = vq.whiten(D)
        Rn = D.shape[0]
        M = D.shape[1]

        # Degenerate case: as many clusters as points
        if R == K:
            return 1

        # Maximum likelihood estimate (MLE) of the variance
        mle_var = 0
        for k in range(len(means)):
            X = D[np.argwhere(labels == k)]
            X = X.reshape((X.shape[0], X.shape[-1]))
            for x in X:
                mle_var += distance.euclidean(x, means[k])
        mle_var /= (float(R - K))

        # Log-likelihood of the data
        l_D = - Rn/2. * np.log(2*np.pi) - (Rn * M)/2. * np.log(mle_var) - \
            (Rn - K) / 2. + Rn * np.log(Rn) - Rn * np.log(R)

        # Params of BIC
        p = (K-1) + M * K + mle_var

        # Return the bic
        return l_D - p/2. * np.log(R)

    @classmethod
    def generate_2d_data(cls, N=100, K=5):
        """Generates N*K 2D data points with K means and N data points
        for each mean."""
        # Seed the random generator (time-based: data differs per run)
        np.random.seed(seed=int(time.time()))

        # Amount of spread of the centroids
        spread = 30

        # Generate K unit-variance Gaussian blobs at random centroids
        X = np.empty((0, 2))
        for i in range(K):
            mean = np.array([np.random.random()*spread,
                             np.random.random()*spread])
            x = np.random.normal(0.0, scale=1.0, size=(N, 2)) + mean
            X = np.append(X, x, axis=0)

        return X


def test_kmeans(K=5):
    """Test k-means with the synthetic data."""
    X = XMeans.generate_2d_data(K=4)
    wX = vq.whiten(X)
    dic, dist = vq.kmeans(wX, K, iter=100)

    import pylab as plt  # Lazy import: only needed when plotting
    plt.scatter(wX[:, 0], wX[:, 1])
    plt.scatter(dic[:, 0], dic[:, 1], color="m")
    plt.show()


def main(args):
    """Generates synthetic data and estimates K with both methods."""
    #test_kmeans(6)
    X = XMeans.generate_2d_data(K=args.k)
    xmeans = XMeans(X, init_K=2, plot=args.plot)
    est_K = xmeans.estimate_K_xmeans()
    est_K_knee = xmeans.estimate_K_knee()
    print("Estimated x-means K:", est_K)
    print("Estimated Knee Point Detection K:", est_K_knee)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Runs x-means")
    parser.add_argument("k",
                        metavar="k", type=int,
                        help="Number of clusters to estimate.")
    parser.add_argument("-p", action="store_true", default=False,
                        dest="plot", help="Plot the results")
    main(parser.parse_args())