e@0
|
1 #!/usr/bin/python2
|
e@0
|
2 # -*- coding: utf-8 -*-
|
e@0
|
3 """
|
e@0
|
4 Created on Thu Apr 23 11:53:17 2015
|
e@0
|
5
|
e@0
|
6 @author: mmxgn
|
e@0
|
7 """
|
e@0
|
8
|
e@0
|
9 # This file does the cluster estimation and the removal of outliers
|
e@0
|
10
|
e@0
|
11 from sys import argv, exit
|
e@0
|
12 from essentia.standard import YamlInput, YamlOutput
|
e@0
|
13 from essentia import Pool
|
e@0
|
14 from pca import *
|
e@0
|
15
|
e@0
|
16 from numpy import *
|
e@0
|
17 from sklearn import cluster
|
e@0
|
18 from sklearn.metrics import pairwise_distances
|
e@0
|
19
|
e@0
|
def mse(A, B):
    """Return the mean squared error between A and B.

    Both arguments are converted to float arrays before subtracting, so
    unsigned or narrow integer inputs cannot wrap around during the
    difference or the squaring.

    Parameters:
        A, B: array-likes accepted by numpy's ``array`` whose shapes are
            equal or broadcastable against each other.

    Returns:
        The mean, over all elements, of the squared element-wise
        difference between A and B.
    """
    diff = array(A, dtype=float) - array(B, dtype=float)
    return (diff ** 2).mean()
|
e@0
|
21
|
e@0
|
if __name__ == "__main__":
    # Script entry point: load a feature pool from the YAML file named on the
    # command line, stack every non-metadata descriptor into a matrix (one row
    # per descriptor), run extract_pca_configuration_from_data on it and
    # report which descriptors were selected.
    #
    # Every print below is a single-argument parenthesised print, which
    # behaves the same as the Python 2 print statement used by this script.
    if len(argv) != 2:
        print("[EE] Wrong number of arguments")
        print("[II] Correct syntax is:")
        # Interpolate the script name; the placeholder used to be printed raw.
        print("[II] \t%s <training_file>" % argv[0])
        print("[II] where <training_file> is a .yaml file containing the")
        print("[II] features of the dataset (try output2_stage/fulltraining-last.yaml)")
        exit(-1)

    infile = argv[1]

    features_pool = YamlInput(filename=infile)()

    # Keep every descriptor whose top-level namespace is not 'metadata'.
    # Building a fresh list avoids removing entries from a list that may be
    # the same object as the one being iterated.
    feature_captions = [
        name
        for name in features_pool.descriptorNames()
        if name.split('.')[0] != 'metadata'
    ]

    print("[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0]))
    print("[II] %d Features Available: " % len(feature_captions))

    # One "[II]<TAB> <name>" line per descriptor.
    print("\n".join("[II]\t %s" % name for name in feature_captions))

    nfeatures_in = len(feature_captions)
    if nfeatures_in == 0:
        # Without this guard the indexing below fails with an IndexError.
        print("[EE] No non-metadata features found in %s" % infile)
        exit(-1)

    # Rows are descriptors; the column count is taken from the first
    # descriptor, so all descriptors are assumed to have that same length.
    features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))

    for row, name in enumerate(feature_captions):
        features_vector[row, :] = features_pool[name].T

    print("[II] Extracting PCA configuration ")

    # NOTE(review): kernel is not used yet; see the TODO at the end.
    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)

    print("[II] Optimal number of PCs to keep: %d" % q)

    # featurelist is used as an index into the descriptor names, so it is
    # presumably a list/array of row indices (or a mask) -- confirm in pca.
    feature_captions_array = array(feature_captions)

    features_to_keep = list(feature_captions_array[featurelist])
    print("[II] Decided to keep %d features:" % len(features_to_keep))
    print("\n".join("[II]\t %s" % name for name in features_to_keep))

    # TODO: finish writing kernel, q and the list of kept features to a YAML
    # output file, and incorporate labelling using the CSVs.
|
e@0
|
75
|