e@0: #!/usr/bin/python2
e@0: # -*- coding: utf-8 -*-
e@0: """
e@0: Created on Thu Apr 23 11:53:17 2015
e@0: 
e@0: @author: mmxgn
e@0: """
e@0: 
e@0: # This file does the cluster estimation and the removal of outliers
e@0: 
e@0: from sys import argv, exit
e@0: from essentia.standard import YamlInput, YamlOutput
e@0: from essentia import Pool
e@0: from pca import *
e@0: 
e@0: from numpy import *
e@0: from sklearn import cluster
e@0: from sklearn.metrics import pairwise_distances
e@0: 
def mse(A, B):
    """Return the mean squared error between two array-likes.

    Both arguments are converted with numpy's ``array`` before the
    element-wise difference is squared and averaged over every element.
    """
    difference = array(A) - array(B)
    squared_error = difference ** 2
    return squared_error.mean()
e@0: 
def _format_caption_list(captions):
    """Return the captions as log text, one "[II]" + tab + name entry per line.

    The text is built directly from the names rather than by rewriting the
    ``str()`` of the list, so names that contain quotes or brackets are
    printed unchanged. An empty sequence yields an empty string.
    """
    return "\n".join("[II]\t %s" % caption for caption in captions)


if __name__ == "__main__":
    # Script entry point: load a features file, list its non-metadata
    # descriptors, run the PCA configuration extraction on them and report
    # which features were selected.
    #
    # Every print below is a single parenthesised expression, which prints
    # the same text under the Python 2 print statement and Python 3's print().
    if len(argv) != 2:
        print("[EE] Wrong number of arguments")
        print("[II] Correct syntax is:")
        # The placeholder used to be printed literally; fill in the script name.
        print("[II] \t%s <training_file>" % argv[0])
        print("[II] where <training_file> is a .yaml file containing the")
        print("[II] features of the dataset (try output2_stage/fulltraining-last.yaml)")
        exit(-1)

    infile = argv[1]

    features_pool = YamlInput(filename=infile)()

    # Keep every descriptor whose first dotted component is not 'metadata'.
    # Filtering in one pass over a single descriptorNames() result avoids
    # relying on that call handing back a fresh list each time.
    feature_captions = [
        c for c in features_pool.descriptorNames()
        if c.split('.')[0] != 'metadata'
    ]

    print("[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0]))
    print("[II] %d Features Available: " % len(feature_captions))

    if not feature_captions:
        # Without this guard the feature_captions[0] lookup below fails with
        # an IndexError that says nothing about the input file.
        print("[EE] No non-metadata features found in %s" % infile)
        exit(-1)

    print(_format_caption_list(feature_captions))

    # One row per feature; the column count is taken from the first feature,
    # so every descriptor is expected to hold the same number of values.
    nfeatures_in = len(feature_captions)
    features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))

    for i, caption in enumerate(feature_captions):
        features_vector[i, :] = features_pool[caption].T

    print("[II] Extracting PCA configuration ")

    # extract_pca_configuration_from_data is not defined in this file
    # (presumably it comes from the star import of the local pca module).
    # Its three results are used below as: kernel (not used further yet),
    # q (reported as the number of PCs to keep) and featurelist (an index
    # into the caption array).
    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)

    print("[II] Optimal number of PCs to keep: %d" % q)

    # A numpy array allows selecting the kept captions with featurelist directly.
    feature_captions_array = array(feature_captions)

    features_to_keep = list(feature_captions_array[featurelist])
    print("[II] Decided to keep %d features:" % len(features_to_keep))
    print(_format_caption_list(features_to_keep))

    # TODO: finish writing kernel, q and the feature file to a yaml output and
    # incorporate labelling using the csvs.
e@0: