annotate experiment-reverb/code/supervised_training.py @ 2:c87a9505f294 tip

Added LICENSE for code, removed .wav files
author Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date Sat, 30 Sep 2017 13:25:50 +0100
parents 246d5546657c
children
rev   line source
e@0 1 #!/usr/bin/python2
e@0 2 # -*- coding: utf-8 -*-
e@0 3 """
e@0 4 Created on Thu Apr 23 11:53:17 2015
e@0 5
e@0 6 @author: mmxgn
e@0 7 """
e@0 8
e@0 9 # This file does the cluster estimation and the removal of outliers
e@0 10
e@0 11 from sys import argv, exit
e@0 12 from essentia.standard import YamlInput, YamlOutput
e@0 13 from essentia import Pool
e@0 14 from pca import *
e@0 15
e@0 16 from numpy import *
e@0 17 from sklearn import cluster
e@0 18 from sklearn.metrics import pairwise_distances
e@0 19
def mse(A, B):
    """Return the mean squared error between A and B.

    A and B are array-likes of real numbers; they are combined
    element-wise under the usual numpy broadcasting rules and the squared
    differences are averaged over every element.

    Both inputs are converted to float before subtracting: taking the
    difference in the original dtype silently wraps around for unsigned or
    small integer arrays and yields a wrong error value.
    """
    difference = array(A, dtype=float) - array(B, dtype=float)
    return (difference ** 2).mean()
e@0 21
def format_caption_lines(captions):
    """Return captions as log text, one '[II]<tab> <name>' entry per line.

    Joins the names directly instead of post-processing the repr of the
    list, so names containing brackets or quotes are printed unchanged and
    an empty sequence yields an empty string.
    """
    return "\n".join("[II]\t %s" % (caption,) for caption in captions)


if __name__ == "__main__":
    # Single-argument print(...) is used throughout: it behaves identically
    # under the Python 2 interpreter named in the shebang.
    if len(argv) != 2:
        print("[EE] Wrong number of arguments")
        print("[II] Correct syntax is:")
        # The program name has to be interpolated here, otherwise a literal
        # "%s" is shown to the user.
        print("[II] \t%s <training_file>" % argv[0])
        print("[II] where <training_file> is a .yaml file containing the")
        print("[II] features of the dataset (try output2_stage/fulltraining-last.yaml)")
        exit(-1)

    infile = argv[1]

    # Load the whole descriptor pool from the YAML training file.
    features_pool = YamlInput(filename=infile)()

    # Every descriptor outside the "metadata" namespace counts as a feature.
    feature_captions = [
        c for c in features_pool.descriptorNames()
        if c.split('.')[0] != 'metadata'
    ]

    print("[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0]))
    print("[II] %d Features Available: " % len(feature_captions))

    # Without at least one feature the matrix below cannot be sized; stop
    # with a readable message instead of an IndexError.
    if not feature_captions:
        print("[EE] No features found in %s" % infile)
        exit(-1)

    print(format_caption_lines(feature_captions))

    # One row per feature; the row length is taken from the first feature,
    # so all features are assumed to hold the same number of values.
    nfeatures_in = len(feature_captions)
    features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))

    for i, caption in enumerate(feature_captions):
        features_vector[i, :] = features_pool[caption].T

    print("[II] Extracting PCA configuration ")

    # NOTE(review): extract_pca_configuration_from_data comes from the local
    # pca module; featurelist is presumably the indices of the features to
    # retain -- confirm against pca.py.
    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)

    print("[II] Optimal number of PCs to keep: %d" % q)

    # Index through a numpy array so featurelist can select several names
    # at once.
    feature_captions_array = array(feature_captions)

    features_to_keep = list(feature_captions_array[featurelist])
    print("[II] Decided to keep %d features:" % len(features_to_keep))
    print(format_caption_lines(features_to_keep))

    # TODO: finish writing kernel,q and feature file to a yaml output and incorporate
    # labelling using the csvs.
e@0 75