diff experiment-reverb/code/supervised_training_pca-bak.py @ 0:246d5546657c
initial commit, needs cleanup
author   | Emmanouil Theofanis Chourdakis <e.t.chourdakis@qmul.ac.uk>
date     | Wed, 14 Dec 2016 13:15:48 +0000
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/experiment-reverb/code/supervised_training_pca-bak.py	Wed Dec 14 13:15:48 2016 +0000
@@ -0,0 +1,440 @@
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 23 11:53:17 2015
+
+@author: mmxgn
+"""
+
+# This file does the cluster estimation and the removal of outliers
+
+from sys import argv, exit
+from essentia.standard import YamlInput, YamlOutput
+from essentia import Pool
+from pca import *
+
+from numpy import *
+from sklearn import cluster
+from sklearn.metrics import pairwise_distances
+from sklearn.cluster import KMeans, MiniBatchKMeans
+import matplotlib.pyplot as plt
+from sklearn.mixture import GMM
+from sklearn.naive_bayes import GaussianNB, MultinomialNB
+from scipy.signal import decimate
+#from hmmlearn import hmm
+from sklearn import hmm
+#from adpcm import adm, adm_reconstruct
+
+
+# Mean squared error between two equally-shaped arrays
+mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()
+
+
+def adm_reconstruct(codeword, h, dmin=.01, dmax=.28):
+    # Reconstruct a signal from an ADM codeword (one of +1/0/-1 per
+    # sample), starting from the initial value h and adapting the step
+    # size between dmin and dmax.
+    x = zeros((1, codeword.shape[1]))
+
+    delta1 = dmin
+    delta2 = dmin
+    Sum = h
+
+    for i in range(0, codeword.shape[1]):
+        if codeword[0, i] == 0:
+            delta1 = dmin
+            delta2 = dmin
+
+        elif codeword[0, i] == 1:
+            delta2 = dmin
+            Sum += delta1
+            delta1 *= 2
+            if delta1 > dmax:
+                delta1 = dmax
+
+        elif codeword[0, i] == -1:
+            delta1 = dmin
+            Sum -= delta2
+            delta2 *= 2
+            if delta2 > dmax:
+                delta2 = dmax
+        x[0, i] = Sum
+    return x
+
+
+def adm(x, dmin=.01, dmax=.28, tol=0.0001):
+    # Adaptive delta modulation, adapted from MATLAB code (adeltamod.m)
+    # by Gandhar Desai (gdesai), BITS Pilani Goa Campus, 28 Sept 2013.
+    # Returns the staircase approximation, the codeword, and the initial
+    # value needed for reconstruction.
+
+    xsig = x
+    Lx = len(x)
+
+    ADMout = zeros((1, Lx))
+    codevec = zeros((1, Lx))
+
+    Sum = x[0]
+    delta1 = dmin
+    delta2 = dmin
+    mult1 = 2
+    mult2 = 2
+
+    for i in range(0, Lx):
+        if (abs(xsig[i] - Sum) < tol):
+            # Within tolerance: hold the current level, reset both steps
+            bit = 0
+            delta2 = dmin
+            delta1 = dmin
+
+        elif (xsig[i] >= Sum):
+            # Signal above the staircase: step up and grow the up-step
+            bit = 1
+            delta2 = dmin
+            Sum += delta1
+            delta1 *= mult1
+            if delta1 > dmax:
+                delta1 = dmax
+
+        else:
+            # Signal below the staircase: step down and grow the down-step
+            bit = -1
+            delta1 = dmin
+            Sum -= delta2
+            delta2 *= mult2
+            if delta2 > dmax:
+                delta2 = dmax
+
+        ADMout[0, i] = Sum
+        codevec[0, i] = bit
+
+    return ADMout, codevec, x[0]
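+
+# A minimal round-trip sketch of the two functions above; the test signal,
+# step sizes and tolerance are illustrative values, not from the original
+# experiment. It is not called anywhere in the script.
+def _adm_roundtrip_demo():
+    t = linspace(0, 1, 500)
+    sig = 0.5 + 0.3 * sin(2 * pi * 3 * t)
+    out, code, h = adm(sig, dmin=.01, dmax=.28, tol=.0001)
+    rec = adm_reconstruct(code, h, dmin=.01, dmax=.28)
+    print "[II] ADM round-trip MSE: %.6f" % mse(sig, rec[0])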
+
+if __name__ == "__main__":
+    if len(argv) != 2:
+        print "[EE] Wrong number of arguments"
+        print "[II] Correct syntax is:"
+        print "[II] \t%s <training_file>" % argv[0]
+        print "[II] where <training_file> is a .yaml file containing the"
+        print "[II] features of the dataset (try output2_stage/fulltraining-last.yaml)"
+        exit(-1)
+
+    infile = argv[1]
+
+    features_pool = YamlInput(filename=infile)()
+
+    feature_captions = features_pool.descriptorNames()
+    parameter_captions = []
+
+    # Split the descriptors into features and parameters; metadata is dropped
+    for c in features_pool.descriptorNames():
+        if c.split('.')[0] == 'parameter':
+            parameter_captions.append(c)
+        if c.split('.')[0] == 'metadata' or c.split('.')[0] == 'parameter':
+            feature_captions.remove(c)
+
+    plt.close('all')
+
+    print "[II] Loaded training data from %s (%s) " % (infile, features_pool['metadata.date'][0])
+    print "[II] %d Features Available: " % len(feature_captions)
+    print str(feature_captions).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7]
+
+    nfeatures_in = len(feature_captions)
+    nparameters_in = len(parameter_captions)
+    features_vector = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))
+    parameters_vector = zeros((nparameters_in, len(features_pool[parameter_captions[0]])))
+
+    for i in range(0, nfeatures_in):
+        features_vector[i, :] = features_pool[feature_captions[i]].T
+    for i in range(0, nparameters_in):
+        parameters_vector[i, :] = features_pool[parameter_captions[i]].T
+
+    print "[II] %d parameters used:" % len(parameter_captions)
+    print str(parameter_captions).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7].replace('parameter.', '')
+
+    print "[II] Marking silent parts"
+
+    silent_parts = zeros((1, features_vector.shape[1]))
+
+    rms = features_vector[feature_captions.index('rms'), :]
+
+    # Hysteresis gate on the RMS envelope: the high threshold is 10% of
+    # the mean RMS and the low threshold is 1% of it. A frame only changes
+    # state when it crosses the far threshold, which keeps the gate from
+    # chattering around a single cutoff.
+    rms_threshold_mean = mean(rms)
+
+    rms_threshold_max = max(rms)
+    rms_threshold_min = min(rms)
+
+    rms_threshold_high = 0.1 * rms_threshold_mean
+    rms_threshold_low = 0.01 * rms_threshold_mean
+
+    for n in range(1, len(rms)):
+        prev = rms[n-1]
+        curr = rms[n]
+
+        if prev >= rms_threshold_high:
+            if curr < rms_threshold_low:
+                silent_parts[0, n] = 1
+            else:
+                silent_parts[0, n] = 0
+        elif prev <= rms_threshold_low:
+            if curr > rms_threshold_high:
+                silent_parts[0, n] = 0
+            else:
+                silent_parts[0, n] = 1
+        else:
+            silent_parts[0, n] = silent_parts[0, n-1]
+
+    if silent_parts[0, 1] == 1:
+        silent_parts[0, 0] = 1
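+
+    # Worked example with assumed numbers: for a mean RMS of 0.2 the gate
+    # thresholds are high = 0.02 and low = 0.002. A frame is marked silent
+    # once the RMS drops below 0.002 and stays marked silent until the RMS
+    # climbs back above 0.02; frames between the two thresholds inherit
+    # the previous frame's state.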
+
+    # plot(rms)
+    # plot(silent_parts.T)
+    # plot(ones((len(rms), 1))*rms_threshold_high)
+    # plot(ones((len(rms), 1))*rms_threshold_low)
+
+    active_index = invert(silent_parts.flatten().astype(bool))
+
+    # Keep only active parts
+
+    # Uncomment this
+    #features_vector = features_vector[:, active_index]
+    #parameters_vector = parameters_vector[:, active_index]
+
+    # Standardize each feature to zero mean and unit variance, keeping the
+    # moments so the scaling can be undone later
+    moments_vector = zeros((features_vector.shape[0], 2))
+
+    print "[II] Storing moments vector"
+    for i in range(0, features_vector.shape[0]):
+        mean_ = mean(features_vector[i, :])
+        std_ = std(features_vector[i, :], ddof=1)
+        moments_vector[i, 0] = mean_
+        moments_vector[i, 1] = std_
+
+        features_vector[i, :] = (features_vector[i, :] - mean_) / std_
+
+    print "[II] Extracting PCA configuration "
+
+    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)
+
+    print "[II] Optimal number of PCs to keep: %d" % q
+
+    feature_captions_array = array(feature_captions)
+
+    features_to_keep = list(feature_captions_array[featurelist])
+    print "[II] Decided to keep %d features:" % len(features_to_keep)
+    print str(features_to_keep).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7]
+
+    # Keep the desired features
+    # Uncomment this
+    #features_kept_data = features_vector[featurelist,:]
+    features_kept_data = features_vector
+
+    # Project onto the first q principal components
+    features_vector = (kernel.T * features_kept_data)[0:q, :]
+
+    # Generate the parameter clusters using k-means
+    # parameters_k_means = KMeans(n_clusters = parameters_k, init='k-means++', max_iter=300, tol=0.0000001, verbose = 1)
+    parameters_k_means = KMeans(init='k-means++', n_init=10, max_iter=300, tol=0.0000001, verbose=0)
+#    # parameters_k_means = MiniBatchKMeans(init='k-means++', max_iter=300, tol=0.00001, verbose = 1)
+#
+#    # Quantize the differences of the parameters instead of the parameters themselves
+#    parameters_vector_diff = concatenate((zeros((shape(parameters_vector)[0],1)),diff(parameters_vector, axis=1)),axis=1)
+#    features_vector_diff = concatenate((zeros((shape(features_vector)[0],1)),diff(features_vector,axis=1)),axis=1)
+#
+#    # Delete this afterwards
+#    # features_vector = features_vector_diff
+#    parameters_k_means.fit(parameters_vector_diff.T)
+    parameters_k_means.fit(parameters_vector.T)
+
+    parameters_k_means_centers = parameters_k_means.cluster_centers_
+    parameters_k_means_labels = parameters_k_means.labels_
+
+    # Replace every parameter frame with its cluster centre (vector quantization)
+    parameters_vector_estimated = zeros(shape(parameters_vector))
+
+    for n in range(0, len(parameters_vector_estimated[0])):
+        parameters_vector_estimated[:, n] = parameters_k_means_centers[parameters_k_means_labels[n]]
+
+#    # plot(parameters_vector[0])
+#    # plot(parameters_vector_estimated[0])
+#
+#    # PROBLEM HERE
+    print "[II] Parameters MSE for %d clusters: %.3f" % (len(parameters_k_means.cluster_centers_), mse(parameters_vector, parameters_vector_estimated))
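+
+    # A hedged sketch, not in the original flow: n_clusters is left at its
+    # sklearn default above, so a sweep like the one below (arbitrary
+    # cluster counts) shows how the quantization error falls as clusters
+    # are added.
+    # for k in (2, 4, 8, 16, 32):
+    #     km = KMeans(n_clusters=k, init='k-means++', n_init=10).fit(parameters_vector.T)
+    #     est = km.cluster_centers_[km.labels_].T
+    #     print "[II] k=%d MSE: %.3f" % (k, mse(parameters_vector, est))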
+
+#    print "[II] Clustering features."
+#
+#    n_clusters = 100
+#    features_clustering = GMM(n_components = n_clusters, covariance_type='diag')
+#
+#    features_clustering.fit( features_vector.T, y=parameters_k_means_labels)
+#
+#    features_clustering_means = features_clustering.means_
+#    features_clustering_labels = features_clustering.predict(features_vector.T)
+#    features_clustering_sigmas = features_clustering.covars_
+#
+#    features_vector_estimated = zeros(shape(features_vector))
+#
+#    for n in range(0, len(features_vector_estimated[0])):
+#        features_vector_estimated[:,n] = features_clustering_means[features_clustering_labels[n]]
+#
+#    # for n in range(0,features_vector.shape[0]):
+#    #     hist(features_vector[1]-features_vector_estimated[1], 256)
+#    std(features_vector[1]-features_vector_estimated[1], ddof=1)
+#    mean(features_vector[1]-features_vector_estimated[1])
+#
+#    print "[II] Features MSE for %d clusters: %.3f" % (n_clusters, mse(features_vector, features_vector_estimated))
+#
+#    print "[II] Testing Gaussian Naive Bayes Classifier"
+#
+#    gnb = GaussianNB()
+#    gnb.fit(features_vector.T, parameters_k_means_labels)
+#    parameter_labels_estimated = gnb.predict(features_vector.T)
+#
+#    print "[II] Raw Parameters - Gaussian Naive Bayes classifier testing ratio: %.4f" % (float(sum((parameter_labels_estimated == parameters_k_means_labels).astype(float)))/float(len(parameters_k_means_labels)))
+#
+#    print "[II] Trying ADM-coded parameters"
+#    UpsamplingFactor = 100
+#    print "[II] Upsampling features and parameters by a factor of %d" % UpsamplingFactor
+#
+#    # Upsampled features and parameters
+#    features_vector_upsampled = repeat(features_vector, UpsamplingFactor, axis=1)
+#    feature_labels_upsampled = repeat(features_clustering_labels, UpsamplingFactor, axis=0)
+#    parameters_vector_upsampled = repeat(parameters_vector, UpsamplingFactor, axis=1)
+#
+#    parameters_vector_upsampled_adm = matrix(zeros(shape(parameters_vector_upsampled)))
+#    parameters_vector_upsampled_code = matrix(zeros(shape(parameters_vector_upsampled)))
+#    parameters_vector_upsampled_firstval = matrix(zeros((parameters_vector_upsampled.shape[0],1)))
+#
+#    # Reconstructed parameters
+#    parameters_vector_upsampled_reconstructed = matrix(zeros(shape(parameters_vector_upsampled)))
+#
+#    dmin = 0.001
+#    dmax = 0.28
+#    tol = 0.001
+#    for i in range(0, parameters_vector_upsampled.shape[0]):
+#        out, code, h = adm(parameters_vector_upsampled[i,:], dmin=dmin, dmax=dmax, tol=tol)
+#        parameters_vector_upsampled_adm[i,:] = out
+#        parameters_vector_upsampled_code[i,:] = code
+#        parameters_vector_upsampled_firstval[i, 0] = h
+#
+#        # Reconstruct-ADM
+#        parameters_vector_upsampled_reconstructed[i,:] = adm_reconstruct(parameters_vector_upsampled_code[i,:], parameters_vector_upsampled_firstval[i,0], dmin=dmin, dmax=dmax)
+#
+#    # plot(parameters_vector_upsampled_adm.T, 'r--')
+#    # plot(parameters_vector_upsampled.T)
+#    # plot(parameters_vector_upsampled_reconstructed.T, 'g.')
+#
+#    parameters_vector_reconstructed = zeros(shape(parameters_vector))
+#    for n in range(0, parameters_vector.shape[1]):
+#        parameters_vector_reconstructed[:,n] = parameters_vector_upsampled_reconstructed[:,n*UpsamplingFactor]
+#
+#    mse_adm = mse(parameters_vector_reconstructed, parameters_vector)
+#
+#    print "[II] Expected ADM reconstruction MSE: %.4f" % mse_adm
+#
+#    # figure()
+#    # plot(parameters_vector.T)
+#    # plot(parameters_vector_reconstructed.T)
+#
+#    print "[II] Building HMM transition, emission matrices and priors"
+#
+#    transmat = zeros((3,3))
+#    startprob = zeros((3,))
+#    emissionmat = zeros((3, n_clusters))
+#
+#    # Map the ADM code {-1, 0, +1} to HMM states {0, 1, 2}
+#    state_labels = (parameters_vector_upsampled_code + 1).astype(int)
+#    stateseq = state_labels.T
+#
+#    for n in range(0, shape(parameters_vector_upsampled_code)[1]):
+#        if n > 0:
+#            transmat[state_labels[0,n-1], state_labels[0,n]] += 1
+#        startprob[state_labels[0,n]] += 1
+#        emissionmat[state_labels[0,n], feature_labels_upsampled[n]] += 1
+#
+#    for n in range(0, transmat.shape[0]):
+#        transmat[n,:] /= sum(transmat[n,:])
+#        emissionmat[n,:] /= sum(emissionmat[n,:])
+#
+#    transmat = matrix(transmat)
+#    emissionmat = matrix(emissionmat)
+#
+#    # Prior
+#    startprob = startprob/sum(startprob)
+#    startprob = ravel(startprob)
+#
+#    # Vocabulary
+#    model = hmm.GMMHMM(n_mix=n_clusters, n_components=3, covariance_type="diag")
+#    model.means_ = features_clustering.means_
+#    model.covars_ = features_clustering.covars_
+#
+#    features_vector_array = array(features_vector)
+#    features_vector_upsampled_array = array(features_vector_upsampled)
+#
+#    model.fit([features_vector_array.T])
+#    stateseq_estimated = model.predict(features_vector_upsampled_array.T)
+#
+#    parameters_vector_upsampled_reconstructed_decoded = matrix(zeros(shape(parameters_vector_upsampled)))
+#
+#    plot(stateseq_estimated)
+#    plot(stateseq)
+#
+#    code_estimated = matrix(zeros(shape(parameters_vector_upsampled)))
+#    code_estimated[0,:] = stateseq_estimated - 1
+#
+#    parameters_vector_upsampled_reconstructed_estimated = matrix(zeros(shape(parameters_vector_upsampled)))
+#
+#    for i in range(0, parameters_vector_upsampled.shape[0]):
+#        parameters_vector_upsampled_reconstructed_estimated[i,:] = adm_reconstruct(code_estimated[i,:], parameters_vector_upsampled_firstval[i,0], dmin=dmin, dmax=dmax)
+#    figure()
+#    plot(parameters_vector_upsampled_reconstructed_estimated.T)
+#    plot(parameters_vector_upsampled.T)
\ No newline at end of file