# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 15:29:06 2016

@author: Emmanouil Theofanis Chourdakis
"""

from glob import glob
from sys import argv
from essentia.standard import YamlInput
#from sklearn.covariance import EllipticEnvelope
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from joblib import Parallel, delayed
from collections import Counter
from models import ReverbModel
from sklearn.cluster import KMeans
from time import time
import tempfile
import os
import shutil
from scipy.signal import gaussian

from sklearn.metrics import accuracy_score, f1_score
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.covariance import EllipticEnvelope
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.svm import LinearSVC
from hmmlearn import hmm

from mapping import *
from numpy import *
from pca import *
from numpy.random import randint

from warnings import filterwarnings
filterwarnings("ignore")

from sklearn.base import BaseEstimator

# Mean squared error between two equally-shaped arrays.
mse = lambda A, B: ((array(A) - array(B)) ** 2).mean()


def compress_parameters_vector(parameters_vector, tol=0.000001):
    """ Quantize each parameter track with KMeans: grow the number of
        clusters until the clustering score improves by less than `tol',
        then replace every value with its cluster centroid. """
    new_parameters_vector = zeros(parameters_vector.shape)

    for i in range(0, parameters_vector.shape[0]):

        p = parameters_vector[i, :]
        p_set = matrix(unique(p)).T

        scores = []
        score = -1000

        for n in range(1, len(p_set)):
            old_score = score

            k_means = KMeans(n_clusters=n)
            k_means.fit(p_set)
            score = k_means.score(p_set)

            scores.append(score)

            if abs(score - old_score) < tol:
                n_opt = n
                break

        predicted_p = k_means.predict(matrix(p).T)
        new_parameters_vector[i, :] = array(ravel(k_means.cluster_centers_[predicted_p]))

    return new_parameters_vector


def smooth_matrix_1D(X, W=100):
    """ Smooth each row of X with a normalized gaussian window of length
        W+1, zero-padding W samples on each side so the output keeps the
        shape of the input. """
    window = gaussian(W + 1, 4)
    window = window / sum(window)
    intermx = zeros((X.shape[0], X.shape[1] + 2 * W))
    intermx[:, W:-W] = X

    for m in range(0, X.shape[0]):
        intermx[m, :] = convolve(ravel(intermx[m, :]), window, 'same')

    return intermx[:, W:-W]
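# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the pipeline; the data is made up):
# smooth_matrix_1D() keeps the matrix shape while removing frame-to-frame
# jitter from each feature track.
def _example_smoothing():
    X = random.randn(5, 1000)            # 5 feature tracks, 1000 frames
    Xs = smooth_matrix_1D(X, W=100)      # gaussian-smoothed copy
    return Xs.shape == X.shape           # True: shape is preserved
# ---------------------------------------------------------------------------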

class HMMsvm:
    """ A hidden Markov model whose emission probabilities are derived
        from the posteriors of an all-vs-all SVM (MyAVAClassifier). """

    def __init__(self, svm_):
        self.svm = svm_

    def fit(self, features_list, states, flr=1e-13):

        # Concatenate the per-example feature matrices and state
        # sequences; the SVM itself is assumed to be already trained.

        features = None
        for f in features_list:
            if features is None:
                features = f
            else:
                features = append(features, f, 0)

        fullstates = None
        for s in states:
            if fullstates is None:
                fullstates = s
            else:
                fullstates = append(fullstates, s)

        self.n_classes = self.svm.n_classes
        n_classes = self.n_classes

        h = histogram(fullstates, n_classes)[0].astype(float)
        self.prior = h / sum(h)

        # Add a very small probability for a random jump
        self.prior += flr
        self.prior = self.prior / sum(self.prior)

        transmat = zeros((n_classes, n_classes))
        transitions = zeros((n_classes,))
        for s in states:
            for i in range(1, len(s)):
                prev = s[i - 1]
                cur = s[i]
                transmat[prev, cur] += 1
                transitions[prev] += 1

        transitions[transitions == 0] = 1

        for i in range(0, transmat.shape[0]):
            transmat[i, :] = transmat[i, :] / transitions[i]

        self.transmat = transmat

        # Add a very small probability for a random jump to avoid zeros
        self.transmat += flr

        for i in range(0, self.transmat.shape[0]):
            self.transmat[i, :] = self.transmat[i, :] / sum(self.transmat[i, :])

        # Iteratively re-estimate the transition matrix from the Viterbi
        # paths of the training sequences until it stops changing.
        A = zeros((n_classes, n_classes))
        Aold = self.transmat

        while mse(A, Aold) > 10 * size(A) * flr:
            Aold = A.copy()
            A = zeros((n_classes, n_classes))
            transitions = zeros((n_classes,))

            for i in range(0, len(features_list)):
                f = features_list[i]
                s, p = self.logviterbi(f)
                for j in range(1, len(s)):
                    prev = int(s[j - 1])
                    cur = int(s[j])
                    A[prev, cur] += 1
                    transitions[prev] += 1

            transitions[transitions == 0] = 1

            for i in range(0, A.shape[0]):
                A[i, :] = A[i, :] / transitions[i]

            A += flr

            self.transmat = A

            for i in range(0, A.shape[0]):
                if sum(A[i, :]) > 0:
                    A[i, :] = A[i, :] / sum(A[i, :])
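    # ------------------------------------------------------------------
    # How this hybrid works: fit() estimates the prior and transition
    # matrix by counting state transitions, then repeatedly re-decodes
    # the training sequences with logviterbi() and re-counts transitions
    # until the matrix stops changing (hard, Viterbi-style re-estimation).
    # Emissions are not modelled directly: estimate_emission_probability()
    # below turns the SVM posterior P(q|x) into the scaled likelihood
    # P(q|x)/P(q), which is proportional to P(x|q) by Bayes' rule.
    # ------------------------------------------------------------------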
    def estimate_emission_probability(self, x, q):
        # Scaled likelihood: SVM posterior divided by the class prior
        post = self.svm.estimate_posterior_probability_vector(x)
        prior = self.prior
        p = post / prior
        p = p / sum(p)

        return p[q]

    def predict(self, X):
        return self.logviterbi(X)[0]

    def logviterbi(self, X):
        # Number of states
        N = self.n_classes

        # Number of observations
        T = X.shape[0]

        transmat = self.transmat

        # 1. Initialization
        a1 = self.prior

        delta = zeros((T, N))
        psi = zeros((T, N), dtype=int)

        for i in range(0, N):
            delta[0, i] = log(a1[i]) + log(self.estimate_emission_probability(X[0, :], i))

        # 2. Recursion
        for t in range(1, T):
            x = X[t, :]
            for j in range(0, N):
                delta[t, j] = max(delta[t - 1, :] + log(transmat[:, j])) + log(self.estimate_emission_probability(x, j))
                psi[t, j] = argmax(delta[t - 1, :] + log(transmat[:, j]))

        # 3. Termination
        p_star = max(delta[T - 1, :])
        q_star = argmax(delta[T - 1, :])

        # 4. Backtracking
        q = zeros((T,), dtype=int)
        p = zeros((T,))

        q[-1] = q_star
        p[-1] = p_star
        for t in range(1, T):
            q[-t - 1] = psi[-t, q[-t]]
            p[-t - 1] = delta[-t, q[-t]]

        return q, p

    def viterbi(self, X):
        # Linear-domain variant of logviterbi()

        # Number of states
        N = self.n_classes

        # Number of observations
        T = X.shape[0]

        transmat = self.transmat

        # 1. Initialization
        a1 = self.prior

        delta = zeros((N,))
        psi = zeros((T, N), dtype=int)

        for i in range(0, N):
            delta[i] = a1[i] * self.estimate_emission_probability(X[0, :], i)

        # 2. Recursion
        for t in range(1, T):
            delta_old = delta.copy()
            x = X[t, :]
            for j in range(0, N):
                delta[j] = max(delta_old * transmat[:, j]) * self.estimate_emission_probability(x, j)
                psi[t, j] = argmax(delta_old * transmat[:, j])

        # 3. Termination
        p_star = max(delta)
        q_star = argmax(delta)

        # 4. Backtracking
        q = zeros((T,), dtype=int)
        q[-1] = q_star

        for t in range(1, T):
            q[-t - 1] = psi[-t, q[-t]]

        return q

    def _log_likelihood_matrix(self, X):
        N = self.n_classes
        ll = zeros((X.shape[0], N))

        for i in range(0, X.shape[0]):
            ll[i, :] = self._forward(i, X)

        return ll

    def compute_ksus(self, X):
        # NOTE: left unfinished in this version; only the gammas are
        # computed, the ksi quantities are not.
        N = self.n_classes
        T = X.shape[0]
        print "[II] Computing gammas... "

        gammas = self._forward_backward(X)

        # Initialize
        aij = self.transmat

    def _not_quite_ksu(self, t, i, j, X):
        alpha = exp(self.forward(X[0:t + 1, :]))[i]
        beta = exp(self.backward(X[t + 1:, :]))[j]

        aij = self.transmat[i, j]
        bj = self.estimate_emission_probability(X[t + 1, :], j)

        return alpha * beta * aij * bj

    def _ksu(self, t, i, j, X):
        # NOTE: alphaT is computed but the normalization by it was never
        # applied in this version.
        alphaT = exp(self.forward(X[0:t + 1, :]))[0]

        return self._not_quite_ksu(t, i, j, X)

    def _forward_backward(self, X):
        T = X.shape[0]
        N = self.n_classes
        fv = zeros((T, N))
        sv = zeros((T, N))

        b = None
        for t in range(1, T + 1):
            fv[t - 1, :] = self._forward_message(fv[t - 2, :], X[0:t, :])

        for t in range(1, T + 1):
            b = self._backward_message(b, X[-t:, :])
            # gamma: forward and backward log-messages are added, then
            # renormalized in the log domain
            sv[-t, :] = lognorm(fv[-t, :] + b)

        return sv

    def _backward(self, t, X):
        N = self.n_classes
        a = ones((N,)) / N

        beta = zeros((N,))
        transmat = self.transmat
        T = X.shape[0]
        x = X[t, :]
        if t == T - 1:
            beta = log(a)

            return beta
        else:
            tosum = zeros((N,))
            bt = self._backward(t + 1, X)
            btnew = zeros((N,))
            for j in range(0, N):
                for i in range(0, N):
                    tosum[i] = log(self.estimate_emission_probability(x, j)) + bt[i] + log(transmat[i, j])

                btnew[j] = logsumexp(tosum)
            btnew = lognorm(btnew)
            return btnew

    def score(self, X):
        return self.forward(X)

    def forward(self, X):
        T = X.shape[0]
        f = None
        for t in range(1, T + 1):
            f = self._forward_message(f, X[0:t, :])

        return f

    def backward(self, X):
        T = X.shape[0]
        b = None
        for t in range(1, T + 1):
            b = self._backward_message(b, X[-t:, :])

        return b

    def _backward_message(self, b, X):
        N = self.n_classes

        beta = zeros((N,))
        transmat = self.transmat

        x = X[0, :]

        if X.shape[0] == 1:
            beta = log(ones((N,)) / N)
            return beta
        else:
            tosum = zeros((N,))
            btnew = zeros((N,))
            for j in range(0, N):
                for i in range(0, N):
                    tosum[i] = log(self.estimate_emission_probability(x, j)) + b[i] + log(transmat[i, j])

                btnew[j] = logsumexp(tosum)
            return btnew

    def _forward_message(self, f, X):
        N = self.n_classes
        a = self.prior

        alpha = zeros((N,))
        transmat = self.transmat

        x = X[-1, :]

        if X.shape[0] == 1:
            for i in range(0, N):
                alpha[i] = log(a[i]) + log(self.estimate_emission_probability(x, i))
            return alpha
        else:
            tosum = zeros((N,))
            ftnew = zeros((N,))
            for j in range(0, N):
                for i in range(0, N):
                    tosum[i] = f[i] + log(transmat[i, j])
                Sum = logsumexp(tosum)
                bj = self.estimate_emission_probability(x, j)
                ftnew[j] = Sum + log(bj)

            return ftnew

    def _forward(self, t, X):
        N = self.n_classes
        a = self.prior
        alpha = zeros((N,))
        transmat = self.transmat
        x = X[t, :]

        if t == 0:
            for i in range(0, N):
                alpha[i] = log(a[i]) + log(self.estimate_emission_probability(x, i))
            alpha = lognorm(alpha)

            return alpha
        else:
            tosum = zeros((N,))
            ft = self._forward(t - 1, X)
            ftnew = zeros((N,))
            for j in range(0, N):
                for i in range(0, N):
                    tosum[i] = ft[i] + log(transmat[i, j])

                Sum = logsumexp(tosum)
                bj = self.estimate_emission_probability(x, j)
                ftnew[j] = Sum + log(bj)
            ftnew = lognorm(ftnew)

            return ftnew
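
# ---------------------------------------------------------------------------
# HMMsvm relies on logsumexp() and lognorm(), which are not defined in this
# file; they are assumed to come from the star imports above (mapping/pca).
# A minimal sketch of what they are assumed to compute:
#
#     def logsumexp(v):
#         m = max(v)                       # factor out the max for stability
#         return m + log(sum(exp(v - m)))  # log(sum(exp(v)))
#
#     def lognorm(v):
#         return v - logsumexp(v)          # normalize log-probabilities
# ---------------------------------------------------------------------------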

class NBClassifier:
    def __init__(self):
        print "[II] Gaussian Naive Bayes Classifier"
        self.name = "Naive Bayes"
        self.smallname = "gnbc"
        self.gnb = GaussianNB()

    def getName(self):
        return self.name

    def score(self, features, parameters):
        predicted_states = self.predict(features)
        return accuracy_score(parameters, predicted_states)

    def fit(self, X, states):
        self.gnb.fit(X, states)

    def predict(self, X):
        return self.gnb.predict(X)


class SVMClassifier(LinearSVC):
    def __init__(self, **arguments):
        LinearSVC.__init__(self, dual=False)
        self.smallname = "svmc"


class SinkHoleClassifier(BaseEstimator):
    """ Ensemble that lets four classifiers vote on every frame. """

    def __init__(self, name="SinkholeClassifier", N=2, n_components=1):
        self.chain_size = N
        self.n_components = n_components
        self.classifierNB = NBClassifier()
        # self.classifierSVM = MyAVAClassifier()
        self.classifierSVM = LinearSVC(dual=False)
        self.classifierHMM = HmmClassifier(N=N, n_components=n_components)
        self.classifierHMMSVM = HMMsvmClassifier(N=N)
        self.name = name
        self.smallname = "sinkholec"

    def score(self, features, parameters):
        predicted_states = self.predict(features)
        return accuracy_score(parameters, predicted_states)

    def getName(self):
        return self.name

    def get_params(self, deep=True):
        return {"N": self.chain_size, "n_components": self.n_components}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def fit(self, X, y):
        self.n_classes = max(unique(y)) + 1

        self.classifierNB.fit(X, y)
        self.classifierSVM.fit(X, y)
        self.classifierHMM.fit(X, y)
        self.classifierHMMSVM.fit(X, y)

    def predict(self, X):
        predictedNB = self.classifierNB.predict(X)
        predictedSVM = self.classifierSVM.predict(X)
        predictedHMM = self.classifierHMM.predict(X)
        predictedHMMSVM = self.classifierHMMSVM.predict(X)

        predicted = zeros((X.shape[0],))

        for i in range(0, len(predicted)):
            candidates = [predictedNB[i], predictedSVM[i], predictedHMM[i], predictedHMMSVM[i]]

            c = Counter(candidates)

            most_common = c.most_common()

            # If the vote is tied, select one candidate at random
            if len(unique([k[1] for k in most_common])) == 1:
                predicted[i] = most_common[randint(len(most_common))][0]
            else:
                predicted[i] = most_common[0][0]

        return predicted
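
# ---------------------------------------------------------------------------
# Illustrative sketch of the voting rule in SinkHoleClassifier.predict():
# majority vote over the four estimators, ties broken uniformly at random.
# The candidate labels below are made up.
def _example_majority_vote(candidates=(0, 1, 1, 2)):
    c = Counter(candidates)
    most_common = c.most_common()
    # all counts equal -> tie: pick one of the voted labels at random
    if len(unique([k[1] for k in most_common])) == 1:
        return most_common[randint(len(most_common))][0]
    # otherwise the most frequent label wins
    return most_common[0][0]
# ---------------------------------------------------------------------------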

class MyAVAClassifier:
    """ All-vs-all (one-vs-one) SVM with pairwise probability coupling. """

    def __init__(self):
        self.classifiers = {}
        self.name = "Linear SVM Classifier"
        self.smallname = "svc-ava"

    def getName(self):
        return self.name

    def fit(self, X, y, flr=0, C=0.7):

        n_classes = max(unique(y)) + 1

        if len(unique(y)) == 1:
            # Degenerate case: a single class in the training data
            self.only_one_class = True
            self.n_classes = 1
            self.one_class_label = y[0]
            return
        elif len(unique(y)) == 2:
            # Two classes: a single binary SVM is enough
            self.n_classes = n_classes
            self.svm = svm.SVC(degree=2, probability=True, kernel='rbf', gamma=2, C=C)
            self.svm.fit(X, y)
            classes_ = unique(y)
            self.classifiers[(classes_[0], classes_[1])] = self.svm
            self.only_two_classes = True
            self.only_one_class = False

            return
        else:
            self.only_two_classes = False
            self.only_one_class = False

        classes = arange(0, n_classes)
        self.n_classes = n_classes

        h = histogram(y, n_classes)[0].astype(float)
        self.prior = h / sum(h)

        transmat = zeros((n_classes, n_classes))

        for i in range(1, len(y)):
            prev = y[i - 1]
            cur = y[i]
            transmat[prev, cur] += 1

        transmat = transmat / sum(transmat)

        self.transmat = transmat

        # Add a very small probability for a random jump to avoid zeros
        self.transmat += flr
        self.transmat = self.transmat / sum(self.transmat)

        # Train one binary SVM per (unordered) pair of classes
        for i in range(0, n_classes):
            for j in range(0, n_classes):
                if i != j and (i, j) not in self.classifiers and (j, i) not in self.classifiers:

                    idx_ = bitwise_or(y == classes[i], y == classes[j])

                    X_ = X[idx_, :]
                    y_ = y[idx_]

                    if len(unique(y_)) > 1:
                        svm_ = svm.SVC(probability=True, kernel='poly', gamma=2, C=C)

                        svm_.fit(X_, y_)
                        self.classifiers[(i, j)] = svm_

    def estimate_pairwise_class_probability(self, i, j, x):

        if (i, j) not in self.classifiers and (j, i) in self.classifiers:
            return self.classifiers[(j, i)].predict_proba(x)[0, 1]
        elif (i, j) not in self.classifiers and (j, i) not in self.classifiers:
            return 0.0
        else:
            return self.classifiers[(i, j)].predict_proba(x)[0, 0]

    def estimate_posterior_probability(self, i, x):
        mus = zeros((self.n_classes,))
        for j in range(0, self.n_classes):
            if i != j:
                pcp = self.estimate_pairwise_class_probability(i, j, x)
                pcp += 1e-18  # floor to avoid division by zero
                mus[j] = 1 / pcp
        S = sum(mus) - (self.n_classes - 2)
        return 1 / S

    def estimate_posterior_probability_vector(self, x):
        posterior = zeros((self.n_classes,))
        for i in range(0, len(posterior)):
            posterior[i] = self.estimate_posterior_probability(i, x)

        return posterior

    def predict(self, X):
        predicted = zeros((X.shape[0],))

        if self.only_one_class == True:
            return ones((X.shape[0],)) * self.one_class_label
        elif self.only_two_classes == True:
            return self.svm.predict(X)

        for i in range(0, X.shape[0]):
            x = X[i, :]
            P = zeros((self.n_classes,))

            for c in range(0, len(P)):
                P[c] = self.estimate_posterior_probability(c, x)

            pred = argmax(P)
            predicted[i] = pred

        return predicted
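
# ---------------------------------------------------------------------------
# estimate_posterior_probability() above implements a pairwise-coupling
# rule (in the spirit of Price et al.): with K classes and pairwise
# probabilities p_ij = P(class i | class i or j, x),
#
#     P(i | x) ~= 1 / ( sum_{j != i} 1/p_ij  -  (K - 2) )
#
# where each p_ij is floored at 1e-18 to avoid division by zero.
# ---------------------------------------------------------------------------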

class HMMsvmClassifier(BaseEstimator):
    def __init__(self, N=2):
        self.classifiers = {}
        self.name = "HMM-SVM Classifier"
        self.smallname = "svmhmm"
        self.obs = MyAVAClassifier()
        self.chain_size = N

    def get_params(self, deep=True):
        return {"N": self.chain_size}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def getName(self):
        return self.name

    def score(self, features, parameters):
        predicted_states = self.predict(features)
        return accuracy_score(parameters, predicted_states)

    def fit(self, X, y):
        self.n_classes = max(unique(y)) + 1

        self.obs.fit(X, y)
        self.hmm = HMMsvm(self.obs)
        self.hmm.fit([X], [y])

    def predict(self, X):
        return self.hmm.predict(X)

    def confidence(self, x, q):
        return self.hmm.estimate_emission_probability(x, q)


class HmmClassifier(BaseEstimator):
    def __init__(self, N=2, n_components=1):
        self.name = "HMM (%d time steps, %d components)" % (N, n_components)
        self.smallname = "ghmm"
        self.n_components = n_components
        self.chain_size = N

    def get_params(self, deep=True):
        return {"N": self.chain_size, "n_components": self.n_components}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def getName(self):
        return self.name

    def score(self, features, parameters):
        predicted_states = self.predict(features)
        return accuracy_score(parameters, predicted_states)

    def fit(self, features, parameters):

        n_classes = max(unique(parameters)) + 1

        if n_classes == 1:
            self.only_one_class = True
            return
        else:
            self.only_one_class = False

        hmms = [None] * n_classes

        chain_size = self.chain_size
        obs = [None] * n_classes

        # Collect, for every class, the windows of `chain_size' frames
        # that precede each frame labelled with that class
        for i in range(chain_size, len(parameters)):
            class_ = parameters[i]
            seq = features[i - chain_size:i, :]

            if obs[class_] is None:
                obs[class_] = [seq]
            else:
                obs[class_].append(seq)

        # Fit one GaussianHMM per class on its collected windows
        for i in range(0, len(obs)):

            if obs[i] is not None and len(obs[i]) != 0:
                hmm_ = hmm.GaussianHMM(n_components=self.n_components, covariance_type='diag')
                obs_ = concatenate(obs[i])
                hmm_.fit(obs_, [self.chain_size] * len(obs[i]))

                hmms[i] = hmm_

        self.hmms = hmms

        return obs

    def predict(self, features):

        if self.only_one_class == True:
            return zeros((features.shape[0],))

        chain_size = self.chain_size
        hmms = self.hmms
        predicted_classes = zeros((features.shape[0],))

        # The first `chain_size' frames have no full window and keep the
        # default class 0
        for i in range(chain_size, features.shape[0]):
            scores = zeros((len(hmms),))

            seq = features[i - chain_size:i, :]

            for j in range(0, len(hmms)):
                if hmms[j] is not None:
                    scores[j] = hmms[j].score(seq)
                else:
                    scores[j] = -infty

            predicted_classes[i] = argmax(scores)

        return predicted_classes
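
# ---------------------------------------------------------------------------
# Illustrative sketch (toy, randomly generated data; not part of the
# pipeline): HmmClassifier trains one GaussianHMM per parameter state on
# sliding windows of N consecutive feature frames, then labels each frame
# with the class whose model scores its preceding window highest.
def _example_hmm_classifier():
    X = random.randn(200, 3)                # 200 frames, 3 features
    y = (arange(200) > 100).astype(int)     # two artificial states
    clf = HmmClassifier(N=2, n_components=1)
    clf.fit(X, y)
    return clf.predict(X)
# ---------------------------------------------------------------------------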

def vector_to_states(X):
    """ Input: an MxN matrix with N samples (columns) of M variables.
        Output: the state sequence `state_seq', a codeword dictionary
        `parameters_alphabet' and its inverse `parameters_alphabet_inv'. """

    parameters_alphabet = {}
    n = 0

    for i in range(0, X.shape[1]):
        vec = tuple(ravel(X[:, i]))
        if vec not in parameters_alphabet:
            parameters_alphabet[vec] = n
            n += 1

    parameters_alphabet_inv = dict([(parameters_alphabet[m], m) for m in parameters_alphabet])

    state_seq = array([parameters_alphabet[tuple(ravel(X[:, m]))] for m in range(0, X.shape[1])])

    return state_seq, parameters_alphabet, parameters_alphabet_inv


def states_to_vector(predicted, parameters_alphabet_inv):
    estimated = matrix(zeros((len(parameters_alphabet_inv[0]), len(predicted))))
    for i in range(0, len(predicted)):
        estimated[:, i] = matrix(parameters_alphabet_inv[predicted[i]]).T

    return estimated
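
# ---------------------------------------------------------------------------
# Illustrative round trip (toy matrix): vector_to_states() encodes each
# distinct parameter column as a discrete state; states_to_vector() is its
# exact inverse, so the coding is lossless.
def _example_state_coding():
    P = matrix([[0.1, 0.1, 0.5],
                [0.2, 0.2, 0.7]])           # 2 parameters, 3 frames
    states, alphabet, alphabet_inv = vector_to_states(P)
    P_back = states_to_vector(states, alphabet_inv)
    return (P == P_back).all()              # True
# ---------------------------------------------------------------------------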

if __name__ == "__main__":
    if len(argv) != 2:
        print "[EE] Wrong number of arguments"
        print "[II] Correct syntax is:"
        print "[II] \t%s <training_dir>" % argv[0]
        print "[II] where <training_dir> is the directory containing the features files in the format *_features.yaml"
        print "[II] and the parameters in the format *_parameters.yaml"
        exit(-1)

    totalstart = time()

    traindir = argv[1]

    songs_in_dir = glob("%s/*_features.yaml" % traindir)

    # Shuffle to avoid using the same songs in the same sets in cross-validation
    #random.shuffle(songs_in_dir)

    features_vector = None
    parameters_vector = None
    index_vector = None

    idx = 0

    for f_ in songs_in_dir:
        infile = f_
        paramfile = "%s_parameters.yaml" % f_.split('_features.yaml')[0]

        print "[II] Using features file: %s" % infile
        print "[II] and parameters file: %s" % paramfile

        features_pool = YamlInput(filename=infile)()
        parameters_pool = YamlInput(filename=paramfile)()

        d1 = parameters_pool['parameters.d1']
        da = parameters_pool['parameters.da']
        g1 = parameters_pool['parameters.g1']
        gc = parameters_pool['parameters.gc']
        G = parameters_pool['parameters.G']

        feature_captions = features_pool.descriptorNames()
        parameter_captions = ['d1', 'da', 'g1', 'gc', 'G']

        # Drop metadata and parameter descriptors, keep the real features
        for c in features_pool.descriptorNames():
            if c.split('.')[0] == 'metadata' or c.split('.')[0] == 'parameter':
                feature_captions.remove(c)

        print "[II] %d Features Available: " % len(feature_captions)
        print str(feature_captions).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7]

        nfeatures_in = len(feature_captions)
        nparameters_in = 5
        features_vector_one = zeros((nfeatures_in, len(features_pool[feature_captions[0]])))
        index_vector_one = ones((len(features_pool[feature_captions[0]]),))
        parameters_vector_one = ones((nparameters_in, len(features_pool[feature_captions[0]])))

        for i in range(0, nfeatures_in):
            features_vector_one[i, :] = features_pool[feature_captions[i]].T

        # Each parameter is constant for a song, so tile it across all
        # frames; row order is d1, g1, da, gc, G
        parameters_vector_one[0, :] = d1 * parameters_vector_one[0, :]
        parameters_vector_one[1, :] = g1 * parameters_vector_one[1, :]
        parameters_vector_one[2, :] = da * parameters_vector_one[2, :]
        parameters_vector_one[3, :] = gc * parameters_vector_one[3, :]
        parameters_vector_one[4, :] = G * parameters_vector_one[4, :]

        # Stores the example index number
        index_vector_one *= idx

        print "[II] %d parameters used:" % len(parameter_captions)
        print str(parameter_captions).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7].replace('parameter.', '')

        if features_vector is None:
            features_vector = features_vector_one
        else:
            features_vector = append(features_vector, features_vector_one, 1)

        if parameters_vector is None:
            parameters_vector = parameters_vector_one
        else:
            parameters_vector = append(parameters_vector, parameters_vector_one, 1)

        if index_vector is None:
            index_vector = index_vector_one
        else:
            index_vector = append(index_vector, index_vector_one)

        idx += 1

    print "[II] Marking silent parts"
    #features_full_nontransformed_train = features_vector.copy()

    silent_parts = zeros((1, features_vector.shape[1]))

    rms = features_vector[feature_captions.index('rms'), :]

    # Hysteresis gate: a frame is marked silent when the RMS falls below
    # the low threshold and stays silent until it rises back above the
    # high threshold; both thresholds are fractions of the mean RMS.
    rms_threshold_mean = mean(rms)

    rms_threshold_max = max(rms)
    rms_threshold_min = min(rms)

    rms_threshold_high = 0.1 * rms_threshold_mean
    rms_threshold_low = 0.01 * rms_threshold_mean

    for n in range(1, len(rms)):
        prev = rms[n - 1]
        curr = rms[n]

        if prev >= rms_threshold_high:
            if curr < rms_threshold_low:
                silent_parts[0, n] = 1
            else:
                silent_parts[0, n] = 0
        elif prev <= rms_threshold_low:
            if curr > rms_threshold_high:
                silent_parts[0, n] = 0
            else:
                silent_parts[0, n] = 1
        else:
            silent_parts[0, n] = silent_parts[0, n - 1]

    if silent_parts[0, 1] == 1:
        silent_parts[0, 0] = 1

    ## START COMMENTING HERE FOR KEEPING ACTIVE PARTS ONLY

    active_index = invert(silent_parts.flatten().astype(bool))

    # Keep only the active (non-silent) parts
    features_vector = features_vector[:, active_index]

    # Keep this for debugging purposes
    parameters_vector = parameters_vector[:, active_index]

    ## UP TO HERE
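    # Normalize each parameter to [0, 1].  The *_min and *_max bounds
    # (d1_min/d1_max, g1_min/g1_max, da_min/da_max, gc_min/gc_max,
    # G_min/G_max) are assumed to be exported by the star import from
    # mapping above: the calibrated range of each reverberator parameter.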
    parameters_vector[0, :] = (parameters_vector[0, :] - d1_min) / d1_max  # d1
    parameters_vector[1, :] = (parameters_vector[1, :] - g1_min) / g1_max  # g1
    parameters_vector[2, :] = (parameters_vector[2, :] - da_min) / da_max  # da
    parameters_vector[3, :] = (parameters_vector[3, :] - gc_min) / gc_max  # gc
    parameters_vector[4, :] = (parameters_vector[4, :] - G_min) / G_max    # G

    # Store the moments and standardize every feature to zero mean and
    # unit variance
    moments_vector = zeros((features_vector.shape[0], 2))
    features_vector_original = features_vector.copy()

    print "[II] Storing moments vector"
    for i in range(0, features_vector.shape[0]):
        mean_ = mean(features_vector[i, :])
        std_ = std(features_vector[i, :], ddof=1)
        moments_vector[i, 0] = mean_
        moments_vector[i, 1] = std_

        features_vector[i, :] = (features_vector[i, :] - mean_) / std_

    features_vector_old_train = features_vector.copy()

    print "[II] Extracting PCA configuration "
    kernel, q, featurelist = extract_pca_configuration_from_data(features_vector)

    q = kernel.shape[0]

    print "[II] Optimal number of PCs to keep: %d" % q

    feature_captions_array = array(feature_captions)
    features_to_keep = list(feature_captions_array[featurelist])

    print "[II] Decided to keep %d features:" % len(features_to_keep)
    print str(features_to_keep).replace("', ", "\n").replace('[', '').replace("'", "[II]\t ")[:-7]

    featurelist_bak = featurelist
    features_kept_data = features_vector[featurelist, :]
    features_kept_data_bak = features_kept_data.copy()

    # Project the kept features onto the first q principal components
    features_vector = (kernel.T * features_kept_data)[0:q, :]
    features_vector_new_train = features_vector.copy()

    features_vector_unsmoothed = features_vector
    features_vector = smooth_matrix_1D(features_vector)

    # Construct the parameters alphabet: each symbol is a different column
    # vector in the parameter code matrix
    parameters_state, parameter_state_alphabet, parameter_state_alphabet_inv = vector_to_states(parameters_vector)

    CV = 10
    print "[II] %d-fold cross validation" % CV

    print "[II] GaussianNB..."
    gnbc = GaussianNB()
    predictsGNBC = cross_val_predict(gnbc, features_vector.T, parameters_state, cv=CV)
    scoresGNBC = f1_score(parameters_state, predictsGNBC, average='weighted')
    parametersGNBC = states_to_vector(predictsGNBC, parameter_state_alphabet_inv)
    msesGNBC = mse(parametersGNBC, parameters_vector)
    print "[II] f1-weighted: %s" % scoresGNBC
    print "[II] mse: %s" % msesGNBC

    print "[II] o-v-r SVM..."
    ovrc = LinearSVC(dual=False)
    predictsOVRC = cross_val_predict(ovrc, features_vector.T, parameters_state, cv=CV)
    scoresOVRC = f1_score(parameters_state, predictsOVRC, average='weighted')
    parametersOVRC = states_to_vector(predictsOVRC, parameter_state_alphabet_inv)
    msesOVRC = mse(parametersOVRC, parameters_vector)
    print "[II] f1-weighted: %s" % scoresOVRC
    print "[II] mse: %s" % msesOVRC

    print "[II] GaussianHMM..."
    start = time()

    # Memory-mapped scratch arrays for the (disabled) calibration sweeps;
    # the sweeps cover chain sizes / component counts 2..99, hence shape 98
    tempfolder = tempfile.mkdtemp()

    _scores_cs = memmap(os.path.join(tempfolder, 'scores'), dtype=float, shape=98, mode='w+')
    _mses_cs = memmap(os.path.join(tempfolder, 'mses'), dtype=float, shape=98, mode='w+')

    _chain_sizes = memmap(os.path.join(tempfolder, 'cs'), dtype=int, shape=98, mode='w+')

    print "[II] Calibrating GaussianHMM, please wait..."

    # First sweep: chain size N (one call per candidate value)
    def iter_(scores, cs, n):
        _n = n

        try:
            hmmc = HmmClassifier(N=_n, n_components=1)
            predictsHMMC = cross_val_predict(hmmc, features_vector.T, parameters_state, cv=CV)
            scores[n - 2] = f1_score(parameters_state, predictsHMMC, average='weighted')
            parametersHMMC = states_to_vector(predictsHMMC, parameter_state_alphabet_inv)
            _mses_cs[n - 2] = mse(parametersHMMC, parameters_vector)
        except:
            scores[n - 2] = 0
        cs[n - 2] = _n

    # Second sweep: number of components at the optimal chain size opt_N
    # found by the first sweep (this redefinition shadows the iter_ above;
    # opt_N and _mses_nc would be defined by the sweep drivers, which are
    # disabled -- a sketch of them follows below)
    def iter_(scores, nc, n):
        try:
            hmmc = HmmClassifier(N=opt_N, n_components=n)
            predictsHMMC = cross_val_predict(hmmc, features_vector.T, parameters_state, cv=CV)
            scores[n - 1] = f1_score(parameters_state, predictsHMMC, average='weighted')
            parametersHMMC = states_to_vector(predictsHMMC, parameter_state_alphabet_inv)
            _mses_nc[n - 1] = mse(parametersHMMC, parameters_vector)
        except:
            scores[n - 1] = 0
        nc[n - 1] = n

    print "[II] Calibration via cross-validation is disabled for performance reasons; ignore its results or uncomment the relevant code."
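    # ------------------------------------------------------------------
    # The Parallel/delayed imports at the top of the file suggest the
    # disabled sweeps were meant to be driven roughly like this (a sketch,
    # not executed here; the second sweep would reuse the second iter_):
    #
    #     Parallel(n_jobs=-1)(delayed(iter_)(_scores_cs, _chain_sizes, n)
    #                         for n in range(2, 100))
    #     opt_N = _chain_sizes[argmax(_scores_cs)]
    # ------------------------------------------------------------------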

    finish = time()

    outp = ""

    outp += "[II] Results:\n"
    outp += "[II] Estimator\tF1 (weighted)\tMSE\n"
    outp += "[II] %s\t%.2f\t%.5f\n" % ("GaussianNB", scoresGNBC, msesGNBC)
    outp += "[II] %s\t%.2f\t%.5f\n" % ("O-V-R SVM", scoresOVRC, msesOVRC)

    totalfinish = time()
    totalduration = totalfinish - totalstart

    outp += "[II] Total time: %dm:%ds\n" % (int(totalduration) / 60, int(totalduration) % 60)
    print outp
    shutil.rmtree(tempfolder)

    # Refit on the full data set and report the (optimistic) training score
    gnbc = NBClassifier()
    gnbc.fit(features_vector.T, parameters_state)

    outp += "[GNBC] Testing against full data:\n"
    predictsGNBC = gnbc.predict(features_vector.T)
    scoresGNBC = f1_score(parameters_state, predictsGNBC, average='weighted')
    parametersGNBC = states_to_vector(predictsGNBC, parameter_state_alphabet_inv)
    msesGNBC = mse(parametersGNBC, parameters_vector)
    outp += "[II] f1-weighted: %s\n" % scoresGNBC
    outp += "[II] mse: %s\n" % msesGNBC

    ovrc = SVMClassifier()
    ovrc.fit(features_vector.T, parameters_state)

    outp += "[OVRC] Testing against full data:\n"
    predictsOVRC = ovrc.predict(features_vector.T)
    scoresOVRC = f1_score(parameters_state, predictsOVRC, average='weighted')
    parametersOVRC = states_to_vector(predictsOVRC, parameter_state_alphabet_inv)
    msesOVRC = mse(parametersOVRC, parameters_vector)
    outp += "[II] f1-weighted: %s\n" % scoresOVRC
    outp += "[II] mse: %s\n" % msesOVRC

    # Bundle everything a later stage needs to map features to parameters
    model_gnb = ReverbModel("Naive Bayes Classifier Model", kernel, q, features_to_keep, gnbc, parameter_dictionary=parameter_state_alphabet_inv, moments_vector=moments_vector)
    model_svm = ReverbModel("OVR LinearSVM Classifier Model", kernel, q, features_to_keep, ovrc, parameter_dictionary=parameter_state_alphabet_inv, moments_vector=moments_vector)

    model_gnb.save("model_gnb.dat")
    model_svm.save("model_svm.dat")

    print outp
    f = open('results.txt', 'w')
    f.write(outp)
    f.close()
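    # The serialized models bundle the PCA kernel, the retained feature
    # list, the feature moments and the trained classifier; a companion
    # script is presumably able to load them back through ReverbModel
    # (models.py) and map features of unseen audio to reverb parameters.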