import sys, os, optparse
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext

import numpy as np
from numpy import log, power, pi, exp, transpose, zeros, ones, dot, argmin, inf, zeros_like, argsort, finfo
from sklearn.mixture import GMM
from sklearn.metrics.pairwise import pairwise_distances
from copy import copy

from MutualInfo import print_progress
from ComputationCache import Meta, with_cache, with_pickle_dump
from gmmdist import skl_full, skl_gmm
from GmmMetrics import GmmDistance


class Node(object):
    '''A data point (one GMM) together with its divergences to all other points.'''

    def __init__(self, model, distances):
        self.model = model
        self.distances = distances
        self.neighborhood = []

    def __str__(self):
        return 'Node(%i): %i, %3.2f' % (self.sorted_distances[0], self.neighborhood_size, self.average_div)

    def rankDistances(self):
        '''Store the indices that sort the distance array in ascending order.'''
        self.sorted_distances = np.argsort(self.distances)

    def findNeighborhood(self, eps):
        '''Collect all nodes whose divergence from this node is within eps times
        the distance to the nearest neighbour.'''
        # sorted_distances[0] is the node itself (distance 0), so the nearest
        # neighbour distance is the second-smallest entry.
        d = self.distances[self.sorted_distances][1]
        b = eps * d
        if np.isinf(b):
            return None

        for i in self.sorted_distances:
            if self.distances[i] <= b:
                self.neighborhood.append(i)

        # Brute-force hack: exclude nodes whose neighbourhood is larger than 80%
        # of all frames. A proper solution would use the delta asymmetric KL
        # divergence to identify near singularities.
        if len(self.neighborhood) > 0.8 * len(self.sorted_distances):
            self.neighborhood = []

    def getStatistics(self):
        self.neighborhood_size = len(self.neighborhood)
        # Exclude the node itself (first entry) from the average divergence.
        self.average_div = np.mean(self.distances[self.neighborhood[1:]])
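
# A minimal sketch of the Node workflow on a toy distance row (illustrative
# only: the values below are made up, and the real distances come from
# pairwise symmetrised KL divergences between GMMs).
def _node_example():
    distances = np.array([0.0, 1.0, 1.5, 4.0, 9.0])
    node = Node(model=None, distances=distances)
    node.rankDistances()           # sorted_distances -> [0, 1, 2, 3, 4]
    node.findNeighborhood(eps=2)   # bound = 2 * 1.0, so neighborhood = [0, 1, 2]
    node.getStatistics()
    print node                     # Node(0): 3, 1.25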
class rClustering(object):
    '''Rank-based clustering of GMMs around divergence hubs.'''

    meta = Meta()

    @classmethod
    def set_cache_filename(cls, filename, cache=True, cache_location=""):
        cls.meta.cache = cache
        cls.meta.cache_file_base = filename
        cls.meta.cache_location = cache_location

    def __init__(self, eps, thresh=15, k=5, rank='max_neighbors', centering=True):
        self.eps = eps
        self.thresh = thresh
        self.rank = rank
        self.node_list = []
        self.methods = {'max_neighbors': 'neighborhood_size', 'min_avg_div': 'average_div'}
        self.classification = []
        self.centering = centering
        self.k = k

    def fit(self, data):
        '''Cluster the input data.

        Arguments:
            data: Input list of GMMs.'''
        D = self.getPairwiseDistances(data)
        if self.centering:
            D[D >= finfo(np.float32).max] = inf
            print "hubness before centering:", self.hubness_measure(D)
            D = self.center_divergence_matrix(D)
            print "hubness after centering:", self.hubness_measure(D)
        for i, model in enumerate(data):
            self.node_list.append(Node(model, D[i]))
        for node in self.node_list:
            node.rankDistances()
            node.findNeighborhood(self.eps)
            node.getStatistics()

        print "finding centroids"

        hub_candidates = self.findCentroids()
        print "\n\n hubs:", len(hub_candidates)
        if len(hub_candidates) > self.k:
            hub_candidates = hub_candidates[:self.k]
        self.classification = []

        # Assign each node to its nearest hub; the label is the hub's index
        # in node_list.
        for i, node in enumerate(self.node_list):
            dists = zeros(len(hub_candidates))
            for k, hub in enumerate(hub_candidates):
                j = self.node_list.index(hub)
                dists[k] = D[i, j]
            di = argmin(dists)
            sel_hub = hub_candidates[di]
            self.classification.append(self.node_list.index(sel_hub))

        return self

    def classify(self):
        return self.classification

    def findCentroids(self):
        '''Greedy hub selection: repeatedly promote the highest-ranked remaining
        node to hub and drop its entire neighbourhood from the candidate pool.'''
        sorted_nodelist = sorted(self.node_list, key=lambda x: getattr(x, self.methods[self.rank]), reverse=True)
        hub_candidates = []
        while sorted_nodelist:
            hub = sorted_nodelist.pop(0)
            hub_candidates.append(hub)
            for n in hub.neighborhood:
                if self.node_list[n] in sorted_nodelist:
                    sorted_nodelist.remove(self.node_list[n])
        return hub_candidates

    def getNodeRank(self):
        average_div = []
        neighborhood_size = []
        node_rank = []
        sorted_nodelist = sorted(self.node_list, key=lambda x: getattr(x, self.methods[self.rank]))

        for node in self.node_list:
            neighborhood_size.append(node.neighborhood_size)
            average_div.append(node.average_div)
            node_rank.append(sorted_nodelist.index(node))

        return neighborhood_size, average_div, node_rank

    def test(self):
        for n in self.node_list:
            print n

    # @with_cache(meta)
    def getPairwiseDistances(self, gmm_list):
        '''Calculate the pairwise distances of the input GMM data.'''
        n_GMMs = len(gmm_list)
        distance_matrix = np.zeros((n_GMMs, n_GMMs))
        print "Creating divergence matrix."
        for i in xrange(n_GMMs):
            print_progress(i, "gmm node processed")
            for j in xrange(i, n_GMMs):
                distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
                distance_matrix[j][i] = distance_matrix[i][j]
        np.fill_diagonal(distance_matrix, 0.0)
        distance_matrix[np.isinf(distance_matrix)] = np.finfo(np.float64).max

        return distance_matrix

    def hubness_measure(self, D, r=0.45):
        '''Measure hubness as the normalised gap between the mean divergence of
        the five strongest anti-hubs and that of the five strongest hubs.'''
        n = D.shape[0]
        Dmean = zeros(n)
        Dstd = zeros(n)
        for i in xrange(n):
            # Row statistics over finite, strictly positive divergences.
            valid = (D[i] > 0) & (D[i] != inf)
            Dmean[i] = np.mean(D[i][valid])
            Dstd[i] = np.std(D[i][valid])

        m = Dmean.mean()
        r = r * m

        # Count, per node, how many divergences fall within the radius r.
        N = zeros(n)
        for i in xrange(n):
            s = (D[i] > 0) & (D[i] < r)
            N[i] = len(D[i][s])
        sortindex = argsort(N)
        hubs_mean = np.mean(Dmean[sortindex][-5:])
        anti_hubs_mean = np.mean(Dmean[sortindex][:5])
        return (anti_hubs_mean - hubs_mean) / Dmean.mean()

    def center_divergence_matrix(self, D):
        '''Reduce hubness by z-score centering: normalise each divergence by the
        row statistics of both endpoints, then shift the matrix to be
        non-negative.'''
        n = D.shape[0]
        Dmean = zeros(n)
        Dstd = zeros(n)
        for i in xrange(n):
            valid = (D[i] > 0) & (D[i] != inf)
            Dmean[i] = np.mean(D[i][valid])
            Dstd[i] = np.std(D[i][valid])
        B = zeros_like(D)
        for i in xrange(n):
            for j in xrange(n):
                d = D[i, j]
                B[i, j] = B[j, i] = 0.5 * ((d - Dmean[i]) / Dstd[i] + (d - Dmean[j]) / Dstd[j])
        B += abs(np.min(B))
        np.fill_diagonal(B, 0.0)
        return B
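
# A minimal smoke test of the centering step on a synthetic symmetric
# divergence matrix (random values for illustration; the real input comes
# from getPairwiseDistances on a list of GMMs).
def _centering_example():
    np.random.seed(0)
    A = np.random.rand(20, 20) * 10.0
    D = 0.5 * (A + A.T)       # symmetrise
    np.fill_diagonal(D, 0.0)  # a node has zero divergence from itself
    rc = rClustering(eps=3)
    print "hubness before centering:", rc.hubness_measure(D)
    print "hubness after centering:", rc.hubness_measure(rc.center_divergence_matrix(D))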
class FeatureObj(object):
    __slots__ = ['key', 'audio', 'timestamps', 'features']


def getGMMs(feature, gmmWindow=10, stepsize=1):
    '''Fit one GMM per sliding window over the rows of the feature matrix.'''
    gmm_list = []
    steps = int((feature.shape[0] - gmmWindow + stepsize) / stepsize)
    for i in xrange(steps):
        gmm_list.append(GmmDistance(feature[i * stepsize:(i * stepsize + gmmWindow), :]))
    return gmm_list


def pairwiseSKL(gmm_list):
    '''Compute the pairwise symmetrised KL divergence of a list of GMMs.'''
    n_GMMs = len(gmm_list)
    distance_matrix = np.zeros((n_GMMs, n_GMMs))
    for i in xrange(n_GMMs):
        for j in xrange(i, n_GMMs):
            distance_matrix[i][j] = gmm_list[i].skl_distance_full(gmm_list[j])
            distance_matrix[j][i] = distance_matrix[i][j]
    # X = np.array(gmm_list)
    # distance_matrix = pairwise_distances(X, metric=lambda x, y: x.skl_distance_full(y))
    # Replace undefined or unbounded divergences with a fixed finite value.
    distance_matrix[np.isnan(distance_matrix)] = 10.0
    distance_matrix[np.isinf(distance_matrix)] = 10.0
    return distance_matrix
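
# End-to-end sketch: windowed GMMs from a random feature matrix, then
# rank-based clustering. All shapes and parameters here are made up, and
# GmmDistance is assumed to accept a (frames x dimensions) feature array,
# as in getGMMs above.
def _clustering_example():
    np.random.seed(0)
    feature_matrix = np.abs(np.random.rand(100, 4))  # 100 frames, 4 features
    gmm_list = getGMMs(feature_matrix, gmmWindow=5, stepsize=2)
    rc = rClustering(eps=3, rank='min_avg_div').fit(gmm_list)
    print "cluster labels:", rc.classify()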
") mi@0: # mi@0: # return op.parse_args() mi@0: # mi@0: # options, args = parse_args() mi@0: # mi@0: # def main(): mi@0: # mi@0: # feature_list = [i for i in os.listdir(options.INPUT) if not i.startswith('.')] mi@0: # feature_list.sort() mi@0: # fobj_list = [] mi@0: # mi@0: # for feature in feature_list: mi@0: # data = np.genfromtxt(join(options.INPUT, feature), delimiter=',',filling_values=0.0) mi@0: # dim = data.shape[1] - 1 mi@0: # if dim == 1 : mi@0: # fo = FeatureObj() mi@0: # fo.audio = feature[:feature.find('_vamp')] mi@0: # fo.key = splitext(feature.strip(fo.audio + '_'))[0] mi@0: # fo.timestamps = data[:, 0] # the first column is the timestamps mi@0: # fo.features = data[:, 1] mi@0: # fobj_list.append(fo) mi@0: # mi@0: # else : mi@0: # for col in xrange(dim): mi@0: # fo = FeatureObj() mi@0: # fo.audio = feature[:feature.find('_vamp')] mi@0: # fo.key = splitext(feature.strip(fo.audio + '_'))[0] + '_' + '%d' %col mi@0: # fo.timestamps = data[:, 0] # the first column records the timestamps mi@0: # fo.features = data[:, col+1][:,np.newaxis] mi@0: # fobj_list.append(fo) mi@0: # mi@0: # timestamps = fobj_list[0].timestamps mi@0: # features = map(lambda x: "%i:%s" %(x[0],x[1].key), enumerate(fobj_list)) mi@0: # print "Loading %d features:\n", len(fobj_list) mi@0: # mi@0: # # find the feature with the fewer number of frames mi@0: # n_frames = np.array(map(lambda x: x.features.shape[0], fobj_list)).min() mi@0: # n_features = len(fobj_list) mi@0: # mi@0: # feature_matrix = np.zeros((n_frames, n_features)) mi@0: # print 'feature_matrix', feature_matrix.shape mi@0: # mi@0: # # take the arrays from the feature objects and add them to a matrix mi@0: # for i,f in enumerate(fobj_list) : mi@0: # feature_matrix[:,i] = f.features[:n_frames,0] mi@0: # mi@0: # # normalise the feature matrix, get rid of negative features, ensure numerical stability by adding a small constant mi@0: # feature_matrix = abs(feature_matrix) / (abs(feature_matrix.max(0))+0.0005) mi@0: # feature_matrix[np.isnan(feature_matrix)] = 0.0 mi@0: # mi@0: # winlength = 5 mi@0: # stepsize = 2 mi@0: # mi@0: # gmm_list = getGMMs(feature_matrix, gmmWindow=winlength, stepsize=stepsize) mi@0: # print 'number of GMMs:', len(gmm_list) mi@0: # skl_matrix = pairwiseSKL(gmm_list) mi@0: # mi@0: # rc = rClustering(eps=3, rank='min_avg_div').fit(gmm_list) mi@0: # rc.test() mi@0: # classification = rc.classification mi@0: # neighborhood_size, average_div, node_rank = rc.getNodeRank() mi@0: # mi@0: # f1 = np.array(zip(timestamps[:len(classification)], labelclassifications)) mi@0: # f2 = np.array(zip(timestamps[:len(neighborhood_size)], neighborhood_size)) mi@0: # f3 = np.array(zip(timestamps[:len(average_div)], average_div)) mi@0: # f4 = np.array(zip(timestamps[:len(node_rank)], node_rank)) mi@0: # mi@0: # np.savetxt(join(options.OUTPUT, 'classification')+'.csv', f1, delimiter=',') mi@0: # np.savetxt(join(options.OUTPUT, 'neighborhood_size')+'.csv', f2, delimiter=',') mi@0: # np.savetxt(join(options.OUTPUT, 'average_div')+'.csv', f3, delimiter=',') mi@0: # np.savetxt(join(options.OUTPUT, 'node_rank')+'.csv', f4, delimiter=',') mi@0: # mi@0: # if __name__ == '__main__': mi@0: # main() mi@0: