Mercurial > hg > chourdakisreiss2016
view experiment-reverb/code/supervised_training_pca-bak.py @ 2:c87a9505f294 tip
Added LICENSE for code, removed .wav files
author | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk> |
---|---|
date | Sat, 30 Sep 2017 13:25:50 +0100 |
parents | 246d5546657c |
children |
line wrap: on
line source
#!/usr/bin/python2 # -*- coding: utf-8 -*- """ Created on Thu Apr 23 11:53:17 2015 @author: mmxgn """ # This file does the cluster estimation and the removal of outliers from sys import argv, exit from essentia.standard import YamlInput, YamlOutput from essentia import Pool from pca import * from numpy import * from sklearn import cluster from sklearn.metrics import pairwise_distances from sklearn.cluster import KMeans, MiniBatchKMeans import matplotlib.pyplot as plt from sklearn.mixture import GMM from sklearn.naive_bayes import GaussianNB, MultinomialNB from scipy.signal import decimate #from hmmlearn import hmm from sklearn import hmm #from adpcm import adm, adm_reconstruct mse = lambda A,B: ((array(A)-array(B)) ** 2).mean() def adm_reconstruct(codeword, h, dmin=.01, dmax=.28): x = zeros((1, codeword.shape[1])) delta1 = dmin delta2 = dmin Sum = h x[0] = h for i in range(0, codeword.shape[1]): if codeword[0,i] == 0: delta1 = dmin delta2 = dmin elif codeword[0,i] == 1: delta2 = dmin Sum += delta1 delta1 *= 2 if delta1 > dmax: delta1 = dmax elif codeword[0,i] == -1: delta1 = dmin Sum -= delta2 delta2 *= 2 if delta2 > dmax: delta2 = dmax x[0,i] = Sum return x def adm(x, dmin=.01, dmax=.28, tol=0.0001): # Adaptive delta modulation adapted by code: # (adeltamod.m) # # Adaptive Delta Modulator # by Gandhar Desai (gdesai) # BITS Pilani Goa Campus # Date: 28 Sept, 2013 xsig = x Lx = len(x) ADMout = zeros((1, Lx)) codevec = zeros((1, Lx)) Sum = x[0] delta1 = dmin delta2 = dmin mult1 = 2 mult2 = 2 for i in range(0, Lx): #print abs(xsig[i] - Sum) if (abs(xsig[i] - Sum) < tol): bit = 0 delta2 = dmin delta1 = dmin elif (xsig[i] >= Sum): bit = 1 delta2 = dmin Sum += delta1 delta1 *= mult1 if delta1 > dmax: delta1 = dmax else: bit = -1 delta1 = dmin Sum -= delta2 delta2 *= mult2 if delta2 > dmax: delta2 = dmax ADMout[0, i] = Sum codevec[0, i]= bit return ADMout,codevec, x[0] if __name__=="__main__": if len(argv) != 2: print "[EE] Wrong number of arguments" print "[II] Correct syntax is:" print "[II] \t%s <training_file>" print "[II] where <training_file> is a .yaml file containing the" print "[II] features of the dataset (try output2_stage/fulltraining-last.yaml)" exit(-1) infile = argv[1] features_pool = YamlInput(filename = infile)() feature_captions = features_pool.descriptorNames() parameter_captions = [] for c in features_pool.descriptorNames(): if c.split('.')[0] == 'parameter': parameter_captions.append(c) if c.split('.')[0] == 'metadata' or c.split('.')[0] == 'parameter': feature_captions.remove(c) close('all') print "[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0]) print "[II] %d Features Available: " % len(feature_captions) print str(feature_captions).replace("', ","\n").replace('[','').replace("'","[II]\t ")[:-7] nfeatures_in = len(feature_captions) nparameters_in = len(parameter_captions) features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]]))) parameters_vector = zeros((nparameters_in, len(features_pool[parameter_captions[0]]))) for i in range(0, nfeatures_in): features_vector[i, :] = features_pool[feature_captions[i]].T for i in range(0, nparameters_in): parameters_vector[i, :] = features_pool[parameter_captions[0]].T print "[II] %d parameters used:" % len(parameter_captions) print str(parameter_captions).replace("', ","\n").replace('[','').replace("'","[II]\t ")[:-7].replace('parameter.','') print "[II] Marking silent parts" silent_parts = zeros((1, len(features_pool[feature_captions[i]].T))) rms = features_vector[feature_captions.index('rms'), :] # Implementing Hysteresis Gate -- High threshold is halfway between # the mean and the max and Low is halfway between the mean dn the min rms_threshold_mean = mean(rms) rms_threshold_max = max(rms) rms_threshold_min = min(rms) rms_threshold_high = 0.1 * rms_threshold_mean rms_threshold_low = 0.01 * rms_threshold_mean for n in range(1, len(rms)): prev = rms[n-1] curr = rms[n] if prev >= rms_threshold_high: if curr < rms_threshold_low: silent_parts[0,n] = 1 else: silent_parts[0,n] = 0 elif prev <= rms_threshold_low: if curr > rms_threshold_high: silent_parts[0,n] = 0 else: silent_parts[0,n] = 1 else: silent_parts[0,n] = silent_parts[0,n-1] if silent_parts[0,1] == 1: silent_parts[0, 0] = 1 # plot(rms) # plot(silent_parts.T) # plot(ones((len(rms), 1))*rms_threshold_high) # plot(ones((len(rms), 1))*rms_threshold_low) active_index = invert(silent_parts.flatten().astype(bool)) # Keep only active parts # Uncomment this #features_vector = features_vector[:, active_index] # parameters_vector = parameters_vector[:, active_index] moments_vector = zeros((features_vector.shape[0], 2)) print "[II] Storing moments vector" for i in range(0, features_vector.shape[0]): mean_ = mean(features_vector[i,:]) std_ = std(features_vector[i,:], ddof=1) moments_vector[i,0] = mean_ moments_vector[i,1] = std_ features_vector[i,:] = (features_vector[i,:] - mean_)/std_ print "[II] Extracting PCA configuration " kernel, q, featurelist = extract_pca_configuration_from_data(features_vector) print "[II] Optimal number of PCs to keep: %d" % q feature_captions_array = array(feature_captions) features_to_keep = features_vector features_to_keep = list(feature_captions_array[featurelist]) print "[II] Decided to keep %d features:" % len(features_to_keep) print str(features_to_keep).replace("', ","\n").replace('[','').replace("'","[II]\t ")[:-7] # Keep the desired features #Uncomment this #features_kept_data = features_vector[featurelist,:] features_lept_data = features_vector # Generate the parameter clusters using k-means # Uncomment this features_vector = (kernel.T*features_kept_data)[0:q,:] # parameters_k_means = KMeans(n_clusters = parameters_k, init='k-means++', max_iter=300, tol=0.0000001, verbose = 1) parameters_k_means = KMeans(init='k-means++', n_init=10, max_iter=300, tol=0.0000001, verbose = 0) # # parameters_k_means = MiniBatchKMeans(init='k-means++', max_iter=300, tol=0.00001, verbose = 1) # # # Quantize the differences of the parameters instead of the parameters themselves # parameters_vector_diff = concatenate((zeros((shape(parameters_vector)[0],1)),diff(parameters_vector, axis=1)),axis=1) # features_vector_diff = concatenate((zeros((shape(features_vector)[0],1)),diff(features_vector,axis=1)),axis=1) # # # Delete this afterwards # # features_vector = features_vector_diff # parameters_k_means.fit(parameters_vector_diff.T) parameters_k_means.fit(parameters_vector.T) # parameters_k_means_centers = parameters_k_means.cluster_centers_ parameters_k_means_labels = parameters_k_means.labels_ # parameters_vector_estimated = zeros(shape(parameters_vector)) # for n in range(0, len(parameters_vector_estimated[0])): parameters_vector_estimated[:,n] = parameters_k_means_centers[parameters_k_means_labels[n]] # ## plot(parameters_vector[0]) # # plot(parameters_vector_estimated[0]) # # # PROvLIMA EDW print "[II] Parameters MSE for %d clusters: %.3f" % (len(parameters_k_means.cluster_centers_), mse(parameters_vector, parameters_vector_estimated)) # # print "[II] Clustering features." # # n_clusters = 100 # features_clustering = GMM(n_components = n_clusters, covariance_type='diag') # # features_clustering.fit( features_vector.T, y=parameters_k_means_labels) # # features_clustering_means = features_clustering.means_ # features_clustering_labels = features_clustering.predict(features_vector.T) # features_clustering_sigmas = features_clustering.covars_ # # features_vector_estimated = zeros(shape(features_vector)) # # # for n in range(0, len(features_vector_estimated[0])): # features_vector_estimated[:,n] = features_clustering_means[features_clustering_labels[n]] # # # for n in range(0,features_vector.shape[0]): # # hist(features_vector[1]-features_vector_estimated[1], 256) # std(features_vector[1]-features_vector_estimated[1], ddof=1) # mean(features_vector[1]-features_vector_estimated[1]) # # print "[II] Features MSE for %d clusters: %.3f" % (n_clusters, mse(features_vector, features_vector_estimated)) # # # print "[II] Testing Gaussian Naive Bayes Classifier" # # gnb = GaussianNB() # gnb.fit(features_vector.T, parameters_k_means_labels) # parameter_labels_estimated = gnb.predict(features_vector.T) # # print "[II] Raw Parameters - Gaussian Naive Bayes classifier testing ratio: %.4f" % (float(sum((parameter_labels_estimated == parameters_k_means_labels).astype(float)))/float(len(parameters_k_means_labels))) # # # print "[II] Trying ADM-coded parameters" # UpsamplingFactor = 100 # print "[II] Upsampling features and parameters by a factor of %d" % UpsamplingFactor # # # # Upsampled features and parameters # features_vector_upsampled = repeat(features_vector,UpsamplingFactor, axis=1) # feature_labels_upsampled = repeat(features_clustering_labels,UpsamplingFactor, axis=0) # parameters_vector_upsampled = repeat(parameters_vector,UpsamplingFactor, axis=1) # # parameters_vector_upsampled_adm = matrix(zeros(shape(parameters_vector_upsampled))) # parameters_vector_upsampled_code = matrix(zeros(shape(parameters_vector_upsampled))) # parameters_vector_upsampled_firstval = matrix(zeros((parameters_vector_upsampled.shape[0],1))) # # # Reconstructed parameters # # parameters_vector_upsampled_reconstructed = matrix(zeros(shape(parameters_vector_upsampled))) # # dmin = 0.001 # dmax = 0.28 # tol = 0.001 # for i in range(0, parameters_vector_upsampled.shape[0]): # out, code, h = adm(parameters_vector_upsampled[i,:],dmin=dmin,dmax=dmax,tol=tol) # parameters_vector_upsampled_adm[i,:] = out # parameters_vector_upsampled_code[i,:] = code # parameters_vector_upsampled_firstval[i, 0] = h # # # # Reconstruct-ADM # parameters_vector_upsampled_reconstructed[i,:] = adm_reconstruct(parameters_vector_upsampled_code[i,:],parameters_vector_upsampled_firstval[i,0], dmin=dmin,dmax=dmax) # # # # plot(parameters_vector_upsampled_adm.T, 'r--') # # # # plot(parameters_vector_upsampled.T) # # plot(parameters_vector_upsampled_reconstructed.T, 'g.') # # # # parameters_vector_reconstructed = zeros(shape(parameters_vector)) # for n in range(0, parameters_vector.shape[1]): # parameters_vector_reconstructed[:,n] = parameters_vector_upsampled_reconstructed[:,n*UpsamplingFactor] # # # mse_adm = mse(parameters_vector_reconstructed, parameters_vector) # # print "[II] Expected ADM reconstruction MSE: %.4f" % mse_adm # # # figure() # #plot(parameters_vector.T) # # plot(parameters_vector_reconstructed.T) # # print "[II] Building HMM transition, emission matrices and priors" # # transmat = zeros((3,3)) # startprob = zeros((3,)) # emissionmat = zeros((3, n_clusters)) # # # state_labels = parameters_vector_upsampled_code + 1 # stateseq = state_labels.T # # for n in range(0,shape(parameters_vector_upsampled_code)[1]): # if n>0: # transmat[state_labels[0,n-1],state_labels[0,n]] += 1 # startprob[state_labels[0,n]] +=1 # emissionmat[state_labels[0,n],feature_labels_upsampled[n]] += 1 # # # for n in range(0, transmat.shape[0]): # transmat[n,:]/=sum(transmat[n,:]) # emissionmat[n,:]/=sum(emissionmat[n,:]) # # # transmat = matrix(transmat) # emissionmat = matrix(emissionmat) # # Prior # startprob = startprob/sum(startprob) # startprob = ravel(startprob) # # # Vocabulary # # model = hmm.GMMHMM(n_mix=n_clusters, n_components=3, covariance_type="diag") # model.means_ = features_clustering.means_ # model.covars_ = features_clustering.covars_ # # features_vector_array = array(features_vector) # features_vector_upsampled_array=array(features_vector_upsampled) # # model.fit([features_vector_array.T]) # stateseq_estimated = model.predict(features_vector_upsampled_array.T) # # parameters_vector_upsampled_reconstructed_decoded = matrix(zeros(shape(parameters_vector_upsampled))) # # # plot(stateseq_estimated) # plot(stateseq) # # code_estimated = matrix(zeros(shape(parameters_vector_upsampled))) # code_estimated[0,:] = stateseq_estimated - 1 # # # # parameters_vector_upsampled_reconstructed_estimated = matrix(zeros(shape(parameters_vector_upsampled))) # # for i in range(0, parameters_vector_upsampled.shape[0]): # parameters_vector_upsampled_reconstructed_estimated[i,:] = adm_reconstruct(code_estimated,parameters_vector_upsampled_firstval[i,0], dmin=dmin,dmax=dmax) # figure() # plot(parameters_vector_upsampled_reconstructed_estimated.T) # plot(parameters_vector_upsampled.T)