changeset 4:e50c63cf96be branch-tests

rearranging folders
author Maria Panteli
date Mon, 11 Sep 2017 11:51:50 +0100
parents 230a0cf17de0
children 543744ed1ae7
files load_dataset.py load_features.py results.py scripts/MFCC.py scripts/OPMellin.py scripts/PitchBihist.py scripts/load_dataset.py scripts/load_features.py scripts/results.py scripts/smoothiecore.py scripts/util_dataset.py scripts/utils.py scripts/utils_spatial.py util_dataset.py util_filter_dataset.py utils.py utils_spatial.py
diffstat 17 files changed, 1643 insertions(+), 916 deletions(-)
--- a/load_dataset.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 15 22:52:57 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-import pandas as pd
-import pickle
-
-import load_features
-import util_dataset
-import util_filter_dataset
-
-
-#METADATA_FILE = 'sample_dataset/metadata.csv'
-#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
-WIN_SIZE = 2
-METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']
-OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', 
-                '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', 
-                '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
-
-def extract_features(df, win2sec=8.0):
-    """Extract features from melspec and chroma.
-    
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Metadata including class label and path to audio, melspec, chroma
-    win2sec : float
-        The window size for the second frame decomposition of the features
-        
-    Returns
-    -------
-    X : np.array
-        The features for every frame x every audio file in the dataset
-    Y : np.array
-        The class labels for every frame in the dataset
-    Y_audio : np.array
-        The audio labels
-    """
-    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
-    frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df)
-    print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape
-    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
-    Y = Y_df.get_values()
-    Y_audio = Y_audio_df.get_values()
-    return X, Y, Y_audio
-
-
-if __name__ == '__main__':
-    # load dataset
-    df = pd.read_csv(METADATA_FILE)
-    df = util_filter_dataset.remove_missing_data(df)
-    subset_idx = util_dataset.subset_labels(df['Country'].get_values())
-    df = df.iloc[subset_idx, :]
-    X, Y = np.arange(len(df)), df['Country'].get_values()
-    
-    # split in train, val, test set
-    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y) 
-    
-    # extract features and write output
-    X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[0], 'wb') as f:
-        pickle.dump([X_train, Y_train, Y_audio_train], f)
-        
-    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[1], 'wb') as f:
-        pickle.dump([X_val, Y_val, Y_audio_val], f)
-        
-    X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[2], 'wb') as f:
-        pickle.dump([X_test, Y_test, Y_audio_test], f)
-
-#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
-#    pickle.dump([X_test, Y_test, Y_audio_test], f)
-#with open(out_file, 'wb') as f:
--- a/load_features.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,287 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Mar 16 01:50:57 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-import pandas as pd
-import os
-from sklearn.decomposition import NMF
-import OPMellin as opm
-import MFCC as mfc
-import PitchBihist as pbi
-
-
-class FeatureLoader:
-    def __init__(self, win2sec=8):
-        self.win2sec = float(win2sec)
-        self.sr = 44100.
-        self.win1 = int(round(0.04*self.sr))
-        self.hop1 = int(round(self.win1/8.))
-        self.framessr = self.sr/float(self.hop1)
-        self.win2 = int(round(self.win2sec*self.framessr))
-        self.hop2 = int(round(0.5*self.framessr))
-        self.framessr2 = self.framessr/float(self.hop2)
-    
-    
-    def get_op_mfcc_for_file(self, melspec_file=None, scale=True):
-        op = []
-        mfc = []
-        if not os.path.exists(melspec_file):
-            return op, mfc
-        print 'extracting onset patterns and mfccs...'
-        songframes = pd.read_csv(melspec_file, engine="c", header=None)
-        songframes.iloc[np.where(np.isnan(songframes))] = 0
-        songframes = songframes.iloc[0:min(len(songframes), 18000), :]  # only first 1.5 minutes
-        melspec = songframes.get_values().T
-        op = self.get_op_from_melspec(melspec, K=2)
-        mfc = self.get_mfcc_from_melspec(melspec)
-        if scale:
-            # scale all frames by mean and std of recording
-            op = (op - np.nanmean(op)) / np.nanstd(op) 
-            mfc = (mfc - np.nanmean(mfc)) / np.nanstd(mfc)
-        return op, mfc
-    
-    
-    def get_chroma_for_file(self, chroma_file=None, scale=True):
-        ch = []
-        if not os.path.exists(chroma_file):
-            return ch
-        print 'extracting chroma...'
-        songframes = pd.read_csv(chroma_file, engine="c", header=None)
-        songframes.iloc[np.where(np.isnan(songframes))] = 0
-        songframes = songframes.iloc[0:min(len(songframes), 18000), :]  # only first 1.5 minutes
-        chroma = songframes.get_values().T
-        ch = self.get_ave_chroma(chroma)
-        if scale:
-            # scale all frames by mean and std of recording
-            ch = (ch - np.nanmean(ch)) / np.nanstd(ch)                        
-        return ch
-    
-        
-    def get_music_idx_from_bounds(self, bounds, sr=None):
-        music_idx = []
-        if len(bounds) == 0:  
-            # bounds is empty list
-            return music_idx
-        nbounds = bounds.shape[0]
-        if len(np.where(bounds[:,2]=='m')[0])==0:
-            # no music segments
-            return music_idx
-        elif len(np.where(bounds[:,2]=='s')[0])==nbounds:
-            # all segments are speech
-            return music_idx
-        else:
-            half_win_hop = int(round(0.5 * self.win2 / float(self.hop2)))
-            music_bounds = np.where(bounds[:, 2] == 'm')[0]
-            bounds_in_frames = np.round(np.array(bounds[:, 0], dtype=float) * sr)
-            duration_in_frames = np.round(np.array(bounds[:, 1], dtype=float) * sr)
-            for music_bound in music_bounds:
-                lower_bound = np.max([0, bounds_in_frames[music_bound] - half_win_hop])
-                upper_bound = lower_bound + duration_in_frames[music_bound] - half_win_hop
-                music_idx.append(np.arange(lower_bound, upper_bound, dtype=int))
-            if len(music_idx)>0:
-                music_idx = np.sort(np.concatenate(music_idx))  # should already be sorted; sort again in case music segments overlap
-        return music_idx
-    
-    
-    def get_music_idx_for_file(self, segmenter_file=None):
-        music_idx = []
-        if os.path.exists(segmenter_file) and os.path.getsize(segmenter_file)>0:
-            print 'loading speech/music segments...'
-            bounds = pd.read_csv(segmenter_file, header=None, delimiter='\t').get_values()
-            if bounds.shape[1] == 1:  # depends on the computer platform
-                bounds = pd.read_csv(segmenter_file, header=None, delimiter=',').get_values()    
-            music_idx = self.get_music_idx_from_bounds(bounds, sr=self.framessr2)
-        return music_idx
-    
-    
-    def get_features(self, df, class_label='Country'):
-        oplist = []
-        mflist = []
-        chlist = []
-        pblist = []        
-        clabels = []
-        aulabels = []
-        n_files = len(df)
-        for i in range(n_files):
-            if not (os.path.exists(df['Melspec'].iloc[i]) and os.path.exists(df['Chroma'].iloc[i]) and os.path.exists(df['Melodia'].iloc[i])):
-                continue
-            print 'file ' + str(i) + ' of ' + str(n_files)
-            music_idx = self.get_music_idx_for_file(df['Speech'].iloc[i])
-            if len(music_idx)==0:
-                # no music segments -> skip this file
-                continue
-            try:
-                op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i])
-                ch = self.get_chroma_for_file(df['Chroma'].iloc[i])
-                #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i])
-                pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i])
-                #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i])
-            except:
-                continue
-            min_n_frames = np.min([len(op), len(mfcc), len(ch), len(pb)])  # ideally, features should have the same number of frames
-            if min_n_frames==0:
-                # no features extracted -> skip this file
-                continue
-            music_idx = music_idx[music_idx<min_n_frames]
-            n_frames = len(music_idx)
-            oplist.append(op.iloc[music_idx, :])
-            mflist.append(mfcc.iloc[music_idx, :])
-            chlist.append(ch.iloc[music_idx, :])
-            pblist.append(pb.iloc[music_idx, :])
-            clabels.append(pd.DataFrame(np.repeat(df[class_label].iloc[i], n_frames)))
-            aulabels.append(pd.DataFrame(np.repeat(df['Audio'].iloc[i], n_frames)))
-        print len(oplist), len(mflist), len(chlist), len(pblist), len(clabels), len(aulabels)
-        return pd.concat(oplist), pd.concat(mflist), pd.concat(chlist), pd.concat(pblist), pd.concat(clabels), pd.concat(aulabels)
-
-            
-    def get_op_from_melspec(self, melspec, K=None):
-        op = opm.OPMellin(win2sec=self.win2sec)
-        opmellin = op.get_opmellin_from_melspec(melspec=melspec, melsr=self.framessr)
-        if K is not None:
-            opmel = self.mean_K_bands(opmellin.T, K)
-        else:
-            opmel = opmellin.T  # avoid NameError when no band averaging is requested
-        opmel = pd.DataFrame(opmel)
-        return opmel
-
-
-    def get_mfcc_from_melspec(self, melspec, deltamfcc=True, avelocalframes=True, stdlocalframes=True):
-        mf = mfc.MFCCs()        
-        mfcc = mf.get_mfccs_from_melspec(melspec=melspec, melsr=self.framessr)
-        if deltamfcc:
-            ff = mfcc
-            ffdiff = np.diff(ff, axis=1)
-            ffdelta = np.concatenate((ffdiff, ffdiff[:,-1,None]), axis=1)
-            frames = np.concatenate([ff,ffdelta], axis=0)                
-            mfcc = frames
-        if avelocalframes:
-            mfcc = self.average_local_frames(mfcc, getstd=stdlocalframes)
-        mfcc = pd.DataFrame(mfcc.T)
-        return mfcc
-
-
-    def get_ave_chroma(self, chroma, avelocalframes=True, stdlocalframes=True, alignchroma=True):
-        chroma[np.where(np.isnan(chroma))] = 0
-        if alignchroma:
-            maxind = np.argmax(np.sum(chroma, axis=1))
-            chroma = np.roll(chroma, -maxind, axis=0)
-        if avelocalframes:
-            chroma = self.average_local_frames(chroma, getstd=stdlocalframes)
-        chroma = pd.DataFrame(chroma.T)
-        return chroma
-  
-    
-    def average_local_frames(self, frames, getstd=False):
-        nbins, norigframes = frames.shape
-        if norigframes<self.win2:
-            nframes = 1
-        else:
-            nframes = int(1+np.floor((norigframes-self.win2)/float(self.hop2)))
-        if getstd:
-            aveframes = np.empty((nbins+nbins, nframes))
-            for i in range(nframes):  # loop over all 8-sec frames
-                meanf = np.nanmean(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1)
-                stdf = np.nanstd(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1)
-                aveframes[:,i] = np.concatenate((meanf,stdf))
-        else:
-            aveframes = np.empty((nbins, nframes))
-            for i in range(nframes):  # loop over all 8-sec frames
-                aveframes[:,i] = np.nanmean(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1)
-        return aveframes
-    
-    
-    def mean_K_bands(self, songframes, K=40, nmels=40):
-        [F, P] = songframes.shape
-        Pproc = int((P/nmels)*K)
-        procframes = np.zeros([F, Pproc])
-        niters = int(nmels/K)
-        nbins = P/nmels  # must be 200 bins
-        for k in range(K):
-            for j in range(k*niters, (k+1)*niters):
-                procframes[:, (k*nbins):((k+1)*nbins)] += songframes[:, (j*nbins):((j+1)*nbins)]
-        procframes /= float(niters)
-        return procframes
-        
-        
-    def nmfpitchbihist(self, frames):
-        nbins, nfr = frames.shape
-        npc = 2
-        nb = int(np.sqrt(nbins))  # assume structure of pitch bihist is nbins*nbins
-        newframes = np.empty(((nb+nb)*npc, nfr))
-        for fr in range(nfr):
-            pb = np.reshape(frames[:, fr], (nb, nb))
-            try:
-                nmfmodel = NMF(n_components=npc).fit(pb)
-                W = nmfmodel.transform(pb)
-                H = nmfmodel.components_.T
-                newframes[:, fr, None] = np.concatenate((W, H)).flatten()[:, None]
-            except:
-                newframes[:, fr, None] = np.zeros(((nb+nb)*npc, 1))
-        return newframes    
-
-
-    def get_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True):
-        pb = []
-        if not os.path.exists(melodia_file):
-            return pb
-        print 'extracting pitch bihist from melodia...'
-        pb = pbi.PitchBihist(win2sec=self.win2sec)
-        pbihist = pb.bihist_from_melodia(filename=melodia_file, stop_sec=90.0)
-        if nmfpb is True:
-            pbihist = self.nmfpitchbihist(pbihist)
-        pbihist = pd.DataFrame(pbihist.T)
-        if scale:
-            # scale all frames by mean and std of recording
-            pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
-        return pbihist
-
-
-    def load_precomputed_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True):
-        base = os.path.basename(melodia_file)    
-        root = '/import/c4dm-05/mariap/Melodia-melody-'+str(int(self.win2sec))+'sec/'
-        print 'load precomputed pitch bihist', root
-        if self.win2sec == 8:
-            pbihist = pd.read_csv(os.path.join(root, base)).iloc[1:,1:]
-        else:
-            pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T
-            if nmfpb is True:
-                pbihist = self.nmfpitchbihist(pbihist)
-            pbihist = pd.DataFrame(pbihist.T)
-        print pbihist.shape
-        if scale:
-            # scale all frames by mean and std of recording
-            pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
-        return pbihist
-        
-        
-#    def get_pb_chroma_for_file(self, chroma_file=None, scale=True):
-#        ch = []
-#        pb = []
-#        if not os.path.exists(chroma_file):
-#            return ch, pb
-#        songframes = pd.read_csv(chroma_file, engine="c", header=None)
-#        songframes.iloc[np.where(np.isnan(songframes))] = 0
-#        songframes = songframes.iloc[0:min(len(songframes), 18000), :]  # only first 1.5 minutes
-#        chroma = songframes.get_values().T
-#        ch = self.get_ave_chroma(chroma)
-#        pb = self.get_pbihist_from_chroma(chroma)
-#        if scale:
-#            # scale all frames by mean and std of recording
-#            ch = (ch - np.nanmean(ch)) / np.nanstd(ch)                        
-#            pb = (pb - np.nanmean(pb)) / np.nanstd(pb)
-#        return ch, pb
-
-
-#    def get_pbihist_from_chroma(self, chroma, alignchroma=True, nmfpb=True):
-#        pb = pbi.PitchBihist(win2sec=self.win2sec)        
-#        chroma[np.where(np.isnan(chroma))] = 0
-#        if alignchroma:
-#            maxind = np.argmax(np.sum(chroma, axis=1))
-#            chroma = np.roll(chroma, -maxind, axis=0)
-#        pbihist = pb.get_pitchbihist_from_chroma(chroma=chroma, chromasr=self.framessr)
-#        if nmfpb is True:
-#            pbihist = self.nmfpitchbihist(pbihist)
-#        pbihist = pd.DataFrame(pbihist.T)
-#        return pbihist
\ No newline at end of file
--- a/results.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,131 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Jul 12 20:49:48 2016
-
-@author: mariapanteli
-"""
-
-import numpy as np
-import pandas as pd
-import pickle
-from collections import Counter
-from sklearn.cluster import KMeans
-
-import utils
-import utils_spatial
-
-
-def country_outlier_df(counts, labels, out_file=None, normalize=False):
-    if len(counts.keys()) < len(np.unique(labels)):
-        for label in np.unique(labels):
-            if not counts.has_key(label):
-                counts.update({label:0})
-    if normalize is True:
-        counts = normalize_outlier_counts(counts, Counter(labels))
-    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
-    df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
-    if out_file is not None:   
-        df.to_csv(out_file, index=False)
-    return df
-
-
-def normalize_outlier_counts(outlier_counts, country_counts):
-    '''Normalize a dictionary of outlier counts per country by 
-        the total number of recordings per country
-    '''
-    for key in outlier_counts.keys():
-        # dictionaries should have the same keys
-        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
-    return outlier_counts
-
-
-def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
-    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
-    global_counts = Counter(Y[y_pred])
-    df = country_outlier_df(global_counts, Y, normalize=True)
-    if out_file is not None:
-        df.to_csv(out_file, index=False)
-    return df, threshold, MD
-
-
-def print_most_least_outliers_topN(df, N=10):
-    sort_inds = df['Outliers'].argsort()  # ascending order
-    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
-    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
-    print "most outliers " 
-    print df_most
-    print "least outliers " 
-    print df_least
-    
-
-def load_metadata(Yaudio, metadata_file):
-    df = pd.read_csv(metadata_file)
-    df_audio = pd.DataFrame({'Audio':Yaudio})
-    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio
-    return ddf
-
-
-def clusters_metadata(df, cl_pred, out_file=None):
-    def get_top_N_counts(labels, N=3):
-        ulab, ucount = np.unique(labels, return_counts=True)
-        inds = np.argsort(ucount)
-        return zip(ulab[inds[-N:]],ucount[inds[-N:]])
-    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
-    styles_description = []
-    uniq_cl = np.unique(cl_pred)
-    for ccl in uniq_cl:
-        inds = np.where(cl_pred==ccl)[0]
-        styles_description.append(get_top_N_counts(info[inds], N=3))
-    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
-    print df_styles.to_latex()
-    if out_file is not None:
-        df_styles.to_csv(out_file, index=False)
-
-
-# load LDA-transformed frames
-X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
-ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
-w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
-w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
-Xrhy, Xmel, Xmfc, Xchr = X_list
-X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
-
-# global outliers
-df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
-print_most_least_outliers_topN(df_global, N=10)
-
-spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
-spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
-df_local = country_outlier_df(spatial_counts, Y, normalize=True)
-print_most_least_outliers_topN(df_local, N=10)
-
-feat = [Xrhy, Xmel, Xmfc, Xchr]
-feat_labels = ['rhy', 'mel', 'mfc', 'chr']
-tabs_feat = []
-for i in range(len(feat)):
-    XX = feat[i]
-    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
-    print_most_least_outliers_topN(df_feat, N=5)
-
-# how many styles are there
-#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
-bestncl = 13
-
-# get cluster predictions and metadata for each cluster
-cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
-centroids = cluster_model.cluster_centers_
-cl_pred = cluster_model.predict(X)
-ddf['Clusters'] = cl_pred
-clusters_metadata(ddf, cl_pred)
-
-# how similar are the cultures and which ones seem to be global outliers
-cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
-
-# Moran on Mahalanobis distances
-data = cluster_freq.get_values()
-data_countries = cluster_freq.index
-#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
-threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
-y = np.sqrt(MD)
-utils_spatial.print_Moran_outliers(y, w, data_countries)
-utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/MFCC.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 19 12:58:07 2015
+
+@author: mariapanteli
+"""
+import librosa
+import scipy.signal
+import numpy
+
+class MFCCs:
+    def __init__(self):
+        self.y = None
+        self.sr = None
+        self.melspec = None
+        self.melsr = None
+        self.win1 = None
+        self.hop1 = None
+        self.mfccs = None
+
+    def load_audiofile(self, filename='test.wav', sr=None, segment=True):
+        self.y, self.sr = librosa.load(filename, sr=sr)
+        if segment:
+            tracklength = self.y.shape[0]/float(self.sr)
+            startSample = 0
+            endSample = None
+            if tracklength > 90:
+                startPointSec = (tracklength/2.)-20
+                startSample = round(startPointSec*self.sr)
+                endSample = startSample+45*self.sr
+            self.y = self.y[startSample:endSample]
+    
+    def mel_spectrogram(self, y=None, sr=None):
+        if self.y is None:
+            self.y = y
+        if self.sr is None:
+            self.sr = sr
+        win1 = int(round(0.04*self.sr))
+        hop1 = int(round(win1/8.))
+        nfft1 = int(2**numpy.ceil(numpy.log2(win1)))
+        nmels = 40
+        D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming))**2
+        #melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels)  
+        melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels, fmax=8000)  
+        melsr = self.sr/float(hop1)
+        self.melspec = melspec
+        self.melsr = melsr
+        
+    def calc_mfccs(self, y=None, sr=None):
+        if self.y is None:
+            self.y = y
+        if self.sr is None:
+            self.sr = sr
+        # require log-amplitude
+        self.mfccs = librosa.feature.mfcc(S=librosa.logamplitude(self.melspec), n_mfcc=21)
+        # remove the 0th (DC / energy) coefficient
+        self.mfccs = self.mfccs[1:,:]
+        
+    def get_mfccs(self, filename='test.wav', secondframedecomp=False):
+        self.load_audiofile(filename=filename)
+        self.mel_spectrogram()
+        self.calc_mfccs()
+        
+        if secondframedecomp:
+            win2 = int(round(8*self.melsr))
+            hop2 = int(round(0.5*self.melsr))
+            nbins, norigframes = self.mfccs.shape  # average the MFCC frames, not the mel spectrogram
+            nframes = int(1+numpy.floor((norigframes-win2)/float(hop2)))
+            avemfccs = numpy.empty((nbins, nframes))
+            for i in range(nframes):  # loop over all 8-sec frames
+                avemfccs[:,i] = numpy.mean(self.mfccs[:, (i*hop2):(i*hop2+win2)], axis=1)
+            self.mfccs = avemfccs
+        return self.mfccs
+    
+    def get_mfccs_from_melspec(self, melspec=[], melsr=[]):
+        self.melspec = melspec
+        self.melsr = melsr
+        self.calc_mfccs()
+        return self.mfccs
+
+if __name__ == '__main__':
+    mfs = MFCCs()
+    mfs.get_mfccs()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/OPMellin.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Aug 17 13:37:00 2015
+
+@author: mariapanteli
+"""
+
+import librosa
+import scipy.signal
+# import matplotlib.pyplot as plt
+import numpy
+
+class OPMellin:
+    def __init__(self, win2sec=8):
+        self.y = None
+        self.sr = None
+        self.melspec = None
+        self.melsr = None
+        self.op = None
+        self.opmellin = None
+        self.win2sec = win2sec
+        
+    def load_audiofile(self, filename='test.wav', sr=None, segment=True):
+        self.y, self.sr = librosa.load(filename, sr=sr)
+        #self.y, self.sr = librosa.load(filename) # default sr=22050
+        if segment:
+            tracklength = self.y.shape[0]/float(self.sr)
+            startSample = 0
+            endSample = None
+            if tracklength > 90:
+                startPointSec = (tracklength/2.)-20
+                startSample = round(startPointSec*self.sr)
+                endSample = startSample+45*self.sr
+            self.y = self.y[startSample:endSample]
+
+
+    # def mel_spectrogram(self, y=None, sr=None, filename='1_21.wav'):
+    def mel_spectrogram(self, y=None, sr=None):
+        if self.y is None:
+            self.y = y
+        if self.sr is None:
+            self.sr = sr
+        win1 = int(round(0.04*self.sr))
+        hop1 = int(round(win1/8.))
+        nfft1 = int(2**numpy.ceil(numpy.log2(win1)))
+        nmels = 40
+        # y, sr = librosa.load(filename, sr=fs)        
+        # melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=nfft, win_length=win1, window=scipy.signal.hamming, hop_length=hop1, n_mels=nmels)
+        # melspec = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_fft=nfft, hop_length=hop1, n_mels=nmels)
+        D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming))**2
+        #melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels)
+        melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels, fmax=8000)
+        melsr = self.sr/float(hop1)
+        # librosa.display.specshow(melspec)
+        # return melspec, melsr
+        self.melspec = melspec
+        self.melsr = melsr
+ 
+    # post process spectrogram
+    def post_process_spec(self, melspec=None, log=True, medianfilt=False, sqrt=False, smooth=False, diff=True, subtractmean=True, halfwave=True, maxnormal=True):
+        if self.melspec is None:
+            self.melspec = melspec
+        if log:
+            self.melspec = librosa.logamplitude(self.melspec)
+        if medianfilt:
+            ks = int(0.1 * self.melsr) # 100ms kernel size
+            if ks % 2 == 0: # ks must be odd
+                ks += 1
+            nmels = self.melspec.shape[0]
+            for i in range(nmels):
+                self.melspec[i,:] = scipy.signal.medfilt(self.melspec[i,:],kernel_size = ks)
+        if sqrt:
+            self.melspec = self.melspec**.5
+        if smooth:
+            step = 50.0
+            nmels = self.melspec.shape[0]
+            mm = self.melspec
+            for i in range(nmels):
+                mm[i,:] = numpy.convolve(mm[i,:], numpy.ones(step)/step, mode='same')
+            self.melspec = mm
+        if diff:
+            self.melspec = numpy.concatenate((self.melspec,self.melspec[:,-1,None]),axis=1)  # append one frame before diff to keep number of frames the same
+            self.melspec = numpy.diff(self.melspec, n=1, axis=1)
+        if subtractmean:
+            mean = self.melspec.mean(axis=1)
+            mean.shape = (mean.shape[0], 1)
+            self.melspec = self.melspec - mean
+        if halfwave:
+            self.melspec[numpy.where(self.melspec < 0)] = 0
+        if maxnormal:
+            if self.melspec.max() != 0:  # avoid division by 0
+                self.melspec = self.melspec/self.melspec.max()
+        # return melspec
+
+
+    def onset_patterns(self, melspec=None, melsr=None, fft=True, logfilter=False, center=True, bpmrange=True):
+        if self.melspec is None:
+            self.melspec = melspec
+        if self.melsr is None:
+            self.melsr = melsr
+        win2 = int(round(self.win2sec*self.melsr))
+        hop2 = int(round(0.5*self.melsr))
+        nmels, nmelframes = self.melspec.shape
+        if fft:  # periodicity via fft
+            #nfft2 = int(2**numpy.ceil(numpy.log2(win2)))
+            nfft2 = 2048  # nfft2 does not depend on win2??
+            melspectemp = self.melspec
+            if ((nfft2 > win2) and (center is False)):
+                # pad the signal by nfft2-win2 so that frame decomposition (n_frames) 
+                # is still based on win2 and not nfft2
+                melspectemp = numpy.concatenate([numpy.zeros((nmels,int((nfft2 - win2) // 2))),self.melspec, numpy.zeros((nmels,int((nfft2 - win2) // 2)))],axis=1)
+            if melspectemp.shape[1]<nfft2:
+                # pad with zeros to have at least one 8-sec window
+                nzeros = nfft2 - melspectemp.shape[1]
+                melspectemp = numpy.concatenate([numpy.zeros((nmels,int(numpy.ceil(nzeros / 2.)))),melspectemp, numpy.zeros((nmels,int(numpy.ceil(nzeros / 2.))))],axis=1)
+                #melspectemp = numpy.concatenate([melspectemp, numpy.zeros((nmels,nzeros))],axis=1)
+            # nframes = int(round(numpy.ceil(round(nmelframes/hop2))))
+            ff0 = numpy.abs(librosa.stft(y=melspectemp[0, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center))
+            nframes = ff0.shape[1]
+            # nmags, nframes = ff0.shape
+            if logfilter:  # log filterbank
+                nbins = 25
+                binsperoct = 5
+                fft2 = numpy.zeros((nmels, nbins, nframes))     
+                logfb = librosa.filters.logfrequency(self.melsr, nfft2, n_bins=nbins, bins_per_octave=binsperoct, fmin=0.5)
+                # logfb = librosa.filters.logfrequency(self.melsr, nfft2, n_bins=25, bins_per_octave=5, fmin=0.5)
+                for i in range(nmels):
+                    fftmags = numpy.abs(librosa.stft(y=melspectemp[i, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center))
+                    fftmags = numpy.dot(logfb, fftmags)  # log
+                    fft2[i, :, :] = fftmags
+            else:
+                # nmags = min(round(nfft2/2), 200)
+                if bpmrange:
+                    # freqresinbpm = float(self.melsr)/float(nfft2)*60.
+                    freqresinbpm = float(self.melsr)/float(nfft2/2.)*60.
+                    minmag = int(numpy.floor(30./freqresinbpm))  # min tempo 30bpm
+                    maxmag = int(numpy.ceil(960./freqresinbpm))  # max tempo 960 bpm
+                    magsinds = range(minmag, maxmag)
+                else:
+                    magsinds = range(0, 200)
+                nmags = len(magsinds)
+                fft2 = numpy.zeros((nmels, nmags, nframes))
+                for i in range(nmels):
+                    fftmags = numpy.abs(librosa.stft(y=melspectemp[i, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center))
+                    fftmags = fftmags[magsinds, :]
+                    fft2[i, :, :] = fftmags
+            op = fft2
+        else:  # periodicity via autocorrelation
+            nmags = min(win2, 1000)
+            if nmelframes < win2:
+                nframes = 1
+            else:
+                nframes = int(nmelframes/float(win2))
+            autocor = numpy.zeros((nmels, nmags, nframes))
+            for i in range(nmels):
+                ac = numpy.zeros((nmags, nframes))
+                for j in range(nframes):
+                    ac[:, j] = librosa.autocorrelate(self.melspec[i, (j*win2):min((j+1)*win2, nmelframes)], max_size=nmags)
+                autocor[i, :, :] = ac
+            op = autocor
+        self.op = op
+
+    def post_process_op(self, medianfiltOP=False):
+        if medianfiltOP:
+            hop2 = int(round(0.5*self.melsr))
+            ssr = self.melsr/float(hop2)
+            ks = int(0.5 * ssr)  # 0.5-sec kernel size
+            if ks % 2 == 0:  # ks must be odd
+                ks += 1
+            nmels, nmags, nframes = self.op.shape
+            for i in range(nmels):
+                for j in range(nframes):
+                    #self.op[i,:,j] = scipy.signal.medfilt(self.op[i,:,j],kernel_size = ks)
+                    self.op[i,:,j] = numpy.convolve(self.op[i,:,j], numpy.ones(ks)/ks, mode='same')
+
+    def mellin_transform(self, op=None):
+        if self.op is None:
+            self.op = op
+        nmels, nmags, nframes = self.op.shape
+        #nmagsout = min(200, nmags)
+        nmagsout = 200
+        u_max = numpy.log(nmags)
+        delta_c = numpy.pi / u_max
+        c_max = nmagsout
+        c = numpy.arange(delta_c, c_max, delta_c)
+        k = range(1, nmags)
+        exponent = 0.5 - c*1j
+
+        normMat = 1./(exponent * numpy.sqrt(2*numpy.pi))
+        normMat.shape = (normMat.shape[0], 1)
+        normMat = numpy.repeat(normMat.T, nmels, axis=0)
+        kernelMat = numpy.asarray([numpy.power(ki, exponent) for ki in k])
+        opmellin = numpy.zeros((nmels, nmagsout, nframes))
+        for i in range(nframes):
+            self.op[:, -1, i] = 0
+            deltaMat = - numpy.diff(self.op[:, :, i])
+            mellin = numpy.abs(numpy.dot(deltaMat, kernelMat) * normMat)
+            opmellin[:, :, i] = mellin[:, :nmagsout]
+        self.opmellin = opmellin
+
+
+    def post_process_opmellin(self, opmellin=None, aveFreq=False, normFrame=True):
+        if self.opmellin is None:
+            self.opmellin = opmellin
+        if aveFreq:
+            self.opmellin = numpy.mean(self.opmellin, axis=0)
+        else:  # just reshape
+            nmels, nmags, nframes = self.opmellin.shape
+            self.opmellin = self.opmellin.reshape((nmels*nmags, nframes))
+        if normFrame:
+            min_opmellin = numpy.amin(self.opmellin, axis=0, keepdims=True)
+            max_opmellin = numpy.amax(self.opmellin, axis=0, keepdims=True)
+            denom = max_opmellin - min_opmellin
+            denom[denom==0] = 1 # avoid division by 0 if frame is all 0s-silent
+            self.opmellin = (self.opmellin - min_opmellin) / denom
+
+
+    def get_opmellin(self, filename='test.wav', logfilter=False, center=True, medianfiltspec=False, medianfiltOP=False):
+        self.load_audiofile(filename=filename)        
+        self.mel_spectrogram()
+        #self.post_process_spec(medianfilt=medianfiltspec)
+        self.post_process_spec(log=False, sqrt=True, medianfilt=medianfiltspec)  # sqrt seems to work better
+        self.onset_patterns(logfilter=logfilter, center=center)
+        self.post_process_op(medianfiltOP=medianfiltOP)
+        self.mellin_transform()
+        self.post_process_opmellin()
+        return self.opmellin
+
+    def get_opmellin_from_melspec(self, melspec=[], melsr=[]):
+        self.melspec = melspec
+        self.melsr = melsr
+        self.post_process_spec(log=False, sqrt=True, medianfilt=True)  # sqrt seems to work better
+        self.onset_patterns(logfilter=False, center=False)
+        self.post_process_op(medianfiltOP=True)
+        self.mellin_transform()
+        self.post_process_opmellin()
+        return self.opmellin
+
+if __name__ == '__main__':
+    op = OPMellin()
+    op.get_opmellin()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/PitchBihist.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb  2 22:26:10 2016
+
+@author: mariapanteli
+"""
+import smoothiecore as s
+import numpy
+import scipy.signal
+# import librosa
+
+
+class PitchBihist:
+    def __init__(self, win2sec=8):
+        self.y = None
+        self.sr = None
+        self.chroma = None
+        self.chromasr = None
+        self.bihist = None
+        self.win2sec = win2sec
+
+    def bihist_from_chroma(self, filename='test.wav', secondframedecomp=True):
+        self.chroma, self.chromasr = s.get_smoothie_for_bihist(filename=filename, hopinsec=0.005)
+        if secondframedecomp:
+            win2 = int(round(8*self.chromasr))
+            hop2 = int(round(0.5*self.chromasr))
+            nbins, norigframes = self.chroma.shape
+            if norigframes<win2:
+                nframes = 1
+            else:
+                nframes = int(1+numpy.floor((norigframes-win2)/float(hop2)))
+            bihistframes = numpy.empty((nbins*nbins, nframes))
+            for i in range(nframes):  # loop over all 8-sec frames
+                frame = self.chroma[:, (i*hop2):min((i*hop2+win2),norigframes)]
+                bihist = self.bihistogram(frame)
+                bihist = numpy.reshape(bihist, -1)
+                bihistframes[:, i] = bihist
+            self.bihist = bihistframes
+        else:
+            self.bihist = numpy.reshape(self.bihistogram(), -1)
+
+    def bihist_from_melodia(self, filename='sample_melodia.csv', secondframedecomp=True, stop_sec=None):
+        def hz_to_cents(freq_Hz, ref_Hz=32.703, n_cents=1200):
+            """ convert frequency values from Hz to cents
+                reference frequency at C1 
+            """        
+            freq_cents = numpy.round(n_cents * numpy.log2(freq_Hz/ref_Hz))
+            return freq_cents
+        def wrap_to_octave(cents, octave_length=1200):
+            """ wrap to a single octave 0-1200
+            """
+            octave_cents = cents % octave_length
+            return octave_cents
+        
+        n_bins = 60
+        data = numpy.loadtxt(filename, delimiter=',')
+        times, freqs = (data[:, 0], data[:, 1])
+        self.chromasr = 1. / (times[1] - times[0])
+        #self.chromasr = 128.
+        if stop_sec is not None:
+            stop_idx = numpy.where(times < stop_sec)[0]
+            times, freqs = times[stop_idx], freqs[stop_idx]
+        freqs[freqs<=0] = numpy.nan
+        #melody = freqs[freqs>0]
+        melody = freqs
+        n_frames = len(melody)
+        melody_cents = hz_to_cents(melody, n_cents=n_bins)
+        melody_octave = wrap_to_octave(melody_cents, octave_length=n_bins)
+        melody_matrix = numpy.zeros((n_bins, n_frames))
+        for time, pitch in enumerate(melody_octave):
+            if not numpy.isnan(pitch):
+                melody_matrix[int(pitch), time] = 1
+        if secondframedecomp:
+            win2 = int(round(self.win2sec*self.chromasr))
+            hop2 = int(round(0.5*self.chromasr))
+            nbins, norigframes = melody_matrix.shape
+            if norigframes<win2:
+                nframes = 1
+                win2 = norigframes
+            else:
+                nframes = int(1+numpy.floor((norigframes-win2)/float(hop2)))
+            bihistframes = numpy.empty((nbins*nbins, nframes))
+            for i in range(nframes):  # loop over all 8-sec frames
+                frame = melody_matrix[:, (i*hop2):(i*hop2+win2)]
+                bihist = self.bihistogram(frame)
+                bihist = numpy.reshape(bihist, -1)
+                bihistframes[:, i] = bihist
+            self.bihist = bihistframes
+        else:
+            self.bihist = self.bihistogram(melody_matrix)
+        return self.bihist
+
+    def bihistogram(self, spec, winsec=0.5, align=True):
+        win = int(round(winsec*self.chromasr))
+        ker = numpy.concatenate([numpy.zeros((win, 1)), numpy.ones((win+1, 1))], axis=0)
+        spec = spec.T  # transpose to have frames as rows in convolution
+
+        # energy threshold
+        thr = 0.3*numpy.max(spec)
+        spec[spec < max(thr, 0)] = 0
+
+        # transitions via convolution
+        tra = scipy.signal.convolve2d(spec, ker, mode='same')
+        tra[spec > 0] = 0
+
+        # multiply with original
+        B = numpy.dot(tra.T, spec)
+
+        # normalize
+        mxB = numpy.max(B)
+        mnB = numpy.min(B)
+        if mxB != mnB:
+            B = (B - mnB)/float(mxB-mnB)
+
+        # circshift to highest?
+        if align:
+            ref = numpy.argmax(numpy.sum(spec, axis=0))
+            B = numpy.roll(B, -ref, axis=0)
+            B = numpy.roll(B, -ref, axis=1)
+        return B
+
+    def bihist_from_precomp_chroma(self, align=False):
+        win2 = int(round(self.win2sec*self.chromasr))
+        hop2 = int(round(0.5*self.chromasr))
+        nbins, norigframes = self.chroma.shape
+        if norigframes<win2:
+            nframes = 1
+        else:
+            nframes = int(1+numpy.floor((norigframes-win2)/float(hop2)))
+        bihistframes = numpy.empty((nbins*nbins, nframes))
+        for i in range(nframes):  # loop over all 8-sec frames
+            frame = self.chroma[:, (i*hop2):min((i*hop2+win2),norigframes)]
+            bihist = self.bihistogram(frame, align=align)
+            bihist = numpy.reshape(bihist, -1)
+            bihistframes[:, i] = bihist
+        self.bihist = bihistframes
+
+    def get_pitchbihist(self, filename='test.wav'):
+        self.bihist_from_chroma(filename=filename)
+        return self.bihist
+        
+    def get_pitchbihist_from_chroma(self, chroma=[], chromasr=[]):
+        self.chroma = chroma
+        self.chromasr = chromasr
+        self.bihist_from_precomp_chroma(align=False)
+        return self.bihist
+
+
+if __name__ == '__main__':
+    pb = PitchBihist()
+    pb.get_pitchbihist()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/load_dataset.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar 15 22:52:57 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+
+import load_features
+import util_dataset
+import util_filter_dataset
+
+
+#METADATA_FILE = 'sample_dataset/metadata.csv'
+#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
+WIN_SIZE = 2
+METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
+#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
+#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']
+OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', 
+                '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', 
+                '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
+
+def extract_features(df, win2sec=8.0):
+    """Extract features from melspec and chroma.
+    
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Metadata including class label and path to audio, melspec, chroma
+    win2sec : float
+        The window size for the second frame decomposition of the features
+        
+    Returns
+    -------
+    X : np.array
+        The features for every frame x every audio file in the dataset
+    Y : np.array
+        The class labels for every frame in the dataset
+    Y_audio : np.array
+        The audio labels
+    """
+    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
+    frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df)
+    print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape
+    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
+    Y = Y_df.get_values()
+    Y_audio = Y_audio_df.get_values()
+    return X, Y, Y_audio
+
+
+if __name__ == '__main__':
+    # load dataset
+    df = pd.read_csv(METADATA_FILE)
+    df = util_filter_dataset.remove_missing_data(df)
+    subset_idx = util_dataset.subset_labels(df['Country'].get_values())
+    df = df.iloc[subset_idx, :]
+    X, Y = np.arange(len(df)), df['Country'].get_values()
+    
+    # split in train, val, test set
+    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y) 
+    
+    # extract features and write output
+    X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
+    with open(OUTPUT_FILES[0], 'wb') as f:
+        pickle.dump([X_train, Y_train, Y_audio_train], f)
+        
+    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
+    with open(OUTPUT_FILES[1], 'wb') as f:
+        pickle.dump([X_val, Y_val, Y_audio_val], f)
+        
+    X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
+    with open(OUTPUT_FILES[2], 'wb') as f:
+        pickle.dump([X_test, Y_test, Y_audio_test], f)
+
+#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
+#    pickle.dump([X_test, Y_test, Y_audio_test], f)
+#with open(out_file, 'wb') as f:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/load_features.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,287 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 16 01:50:57 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import os
+from sklearn.decomposition import NMF
+import OPMellin as opm
+import MFCC as mfc
+import PitchBihist as pbi
+
+
+class FeatureLoader:
+    def __init__(self, win2sec=8):
+        self.win2sec = float(win2sec)
+        self.sr = 44100.
+        self.win1 = int(round(0.04*self.sr))
+        self.hop1 = int(round(self.win1/8.))
+        self.framessr = self.sr/float(self.hop1)
+        self.win2 = int(round(self.win2sec*self.framessr))
+        self.hop2 = int(round(0.5*self.framessr))
+        self.framessr2 = self.framessr/float(self.hop2)
+    
+    
+    def get_op_mfcc_for_file(self, melspec_file=None, scale=True):
+        op = []
+        mfc = []
+        if not os.path.exists(melspec_file):
+            return op, mfc
+        print 'extracting onset patterns and mfccs...'
+        songframes = pd.read_csv(melspec_file, engine="c", header=None)
+        songframes.iloc[np.where(np.isnan(songframes))] = 0
+        songframes = songframes.iloc[0:min(len(songframes), 18000), :]  # only first 1.5 minutes
+        melspec = songframes.get_values().T
+        op = self.get_op_from_melspec(melspec, K=2)
+        mfc = self.get_mfcc_from_melspec(melspec)
+        if scale:
+            # scale all frames by mean and std of recording
+            op = (op - np.nanmean(op)) / np.nanstd(op) 
+            mfc = (mfc - np.nanmean(mfc)) / np.nanstd(mfc)
+        return op, mfc
+    
+    
+    def get_chroma_for_file(self, chroma_file=None, scale=True):
+        ch = []
+        if not os.path.exists(chroma_file):
+            return ch
+        print 'extracting chroma...'
+        songframes = pd.read_csv(chroma_file, engine="c", header=None)
+        songframes.iloc[np.where(np.isnan(songframes))] = 0
+        songframes = songframes.iloc[0:min(len(songframes), 18000), :]  # only first 1.5 minutes
+        chroma = songframes.get_values().T
+        ch = self.get_ave_chroma(chroma)
+        if scale:
+            # scale all frames by mean and std of recording
+            ch = (ch - np.nanmean(ch)) / np.nanstd(ch)                        
+        return ch
+    
+        
+    def get_music_idx_from_bounds(self, bounds, sr=None):
+        music_idx = []
+        if len(bounds) == 0:  
+            # bounds is empty list
+            return music_idx
+        nbounds = bounds.shape[0]
+        if len(np.where(bounds[:,2]=='m')[0])==0:
+            # no music segments
+            return music_idx
+        elif len(np.where(bounds[:,2]=='s')[0])==nbounds:
+            # all segments are speech
+            return music_idx
+        else:
+            half_win_hop = int(round(0.5 * self.win2 / float(self.hop2)))
+            music_bounds = np.where(bounds[:, 2] == 'm')[0]
+            bounds_in_frames = np.round(np.array(bounds[:, 0], dtype=float) * sr)
+            duration_in_frames = np.round(np.array(bounds[:, 1], dtype=float) * sr)
+            for music_bound in music_bounds:
+                lower_bound = np.max([0, bounds_in_frames[music_bound] - half_win_hop])
+                upper_bound = lower_bound + duration_in_frames[music_bound] - half_win_hop
+                music_idx.append(np.arange(lower_bound, upper_bound, dtype=int))
+            if len(music_idx)>0:
+                music_idx = np.sort(np.concatenate(music_idx))  # should already be sorted; sort again in case music segments overlap
+        return music_idx
+    
+    
+    def get_music_idx_for_file(self, segmenter_file=None):
+        music_idx = []
+        if os.path.exists(segmenter_file) and os.path.getsize(segmenter_file)>0:
+            print 'loading speech/music segments...'
+            bounds = pd.read_csv(segmenter_file, header=None, delimiter='\t').get_values()
+            if bounds.shape[1] == 1:  # depends on the computer platform
+                bounds = pd.read_csv(segmenter_file, header=None, delimiter=',').get_values()    
+            music_idx = self.get_music_idx_from_bounds(bounds, sr=self.framessr2)
+        return music_idx
+    
+    
+    def get_features(self, df, class_label='Country'):
+        oplist = []
+        mflist = []
+        chlist = []
+        pblist = []        
+        clabels = []
+        aulabels = []
+        n_files = len(df)
+        for i in range(n_files):
+            if not (os.path.exists(df['Melspec'].iloc[i]) and os.path.exists(df['Chroma'].iloc[i]) and os.path.exists(df['Melodia'].iloc[i])):
+                continue
+            print 'file ' + str(i) + ' of ' + str(n_files)
+            music_idx = self.get_music_idx_for_file(df['Speech'].iloc[i])
+            if len(music_idx)==0:
+                # no music segments -> skip this file
+                continue
+            try:
+                op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i])
+                ch = self.get_chroma_for_file(df['Chroma'].iloc[i])
+                #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i])
+                pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i])
+                #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i])
+            except:
+                continue
+            min_n_frames = np.min([len(op), len(mfcc), len(ch), len(pb)])  # ideally, features should have the same number of frames
+            if min_n_frames==0:
+                # no features extracted -> skip this file
+                continue
+            music_idx = music_idx[music_idx<min_n_frames]
+            n_frames = len(music_idx)
+            oplist.append(op.iloc[music_idx, :])
+            mflist.append(mfcc.iloc[music_idx, :])
+            chlist.append(ch.iloc[music_idx, :])
+            pblist.append(pb.iloc[music_idx, :])
+            clabels.append(pd.DataFrame(np.repeat(df[class_label].iloc[i], n_frames)))
+            aulabels.append(pd.DataFrame(np.repeat(df['Audio'].iloc[i], n_frames)))
+        print len(oplist), len(mflist), len(chlist), len(pblist), len(clabels), len(aulabels)
+        return pd.concat(oplist), pd.concat(mflist), pd.concat(chlist), pd.concat(pblist), pd.concat(clabels), pd.concat(aulabels)
+
+            
+    def get_op_from_melspec(self, melspec, K=None):
+        op = opm.OPMellin(win2sec=self.win2sec)
+        opmellin = op.get_opmellin_from_melspec(melspec=melspec, melsr=self.framessr)
+        if K is not None:
+            opmel = self.mean_K_bands(opmellin.T, K)
+        else:
+            opmel = opmellin.T  # avoid NameError when no band averaging is requested
+        opmel = pd.DataFrame(opmel)
+        return opmel
+
+
+    def get_mfcc_from_melspec(self, melspec, deltamfcc=True, avelocalframes=True, stdlocalframes=True):
+        mf = mfc.MFCCs()        
+        mfcc = mf.get_mfccs_from_melspec(melspec=melspec, melsr=self.framessr)
+        if deltamfcc:
+            ff = mfcc
+            ffdiff = np.diff(ff, axis=1)
+            ffdelta = np.concatenate((ffdiff, ffdiff[:,-1,None]), axis=1)
+            frames = np.concatenate([ff,ffdelta], axis=0)                
+            mfcc = frames
+        if avelocalframes:
+            mfcc = self.average_local_frames(mfcc, getstd=stdlocalframes)
+        mfcc = pd.DataFrame(mfcc.T)
+        return mfcc
+
+
+    def get_ave_chroma(self, chroma, avelocalframes=True, stdlocalframes=True, alignchroma=True):
+        chroma[np.where(np.isnan(chroma))] = 0
+        if alignchroma:
+            maxind = np.argmax(np.sum(chroma, axis=1))
+            chroma = np.roll(chroma, -maxind, axis=0)
+        if avelocalframes:
+            chroma = self.average_local_frames(chroma, getstd=stdlocalframes)
+        chroma = pd.DataFrame(chroma.T)
+        return chroma
+  
+    
+    def average_local_frames(self, frames, getstd=False):
+        nbins, norigframes = frames.shape
+        if norigframes<self.win2:
+            nframes = 1
+        else:
+            nframes = int(1+np.floor((norigframes-self.win2)/float(self.hop2)))
+        if getstd:
+            aveframes = np.empty((nbins+nbins, nframes))
+            for i in range(nframes):  # loop over all 8-sec frames
+                meanf = np.nanmean(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1)
+                stdf = np.nanstd(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1)
+                aveframes[:,i] = np.concatenate((meanf,stdf))
+        else:
+            aveframes = np.empty((nbins, nframes))
+            for i in range(nframes):  # loop over all 8-sec frames
+                aveframes[:,i] = np.nanmean(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1)
+        return aveframes
+    
+    
+    def mean_K_bands(self, songframes, K=40, nmels=40):
+        [F, P] = songframes.shape
+        Pproc = int((P/nmels)*K)
+        procframes = np.zeros([F, Pproc])
+        niters = int(nmels/K)
+        nbins = P/nmels  # must be 200 bins
+        for k in range(K):
+            for j in range(k*niters, (k+1)*niters):
+                procframes[:, (k*nbins):((k+1)*nbins)] += songframes[:, (j*nbins):((j+1)*nbins)]
+        procframes /= float(niters)
+        return procframes
+        
+        
+    def nmfpitchbihist(self, frames):
+        nbins, nfr = frames.shape
+        npc = 2
+        nb = int(np.sqrt(nbins))  # assume structure of pitch bihist is nbins*nbins
+        newframes = np.empty(((nb+nb)*npc, nfr))
+        for fr in range(nfr):
+            pb = np.reshape(frames[:, fr], (nb, nb))
+            try:
+                nmfmodel = NMF(n_components=npc).fit(pb)
+                W = nmfmodel.transform(pb)
+                H = nmfmodel.components_.T
+                newframes[:, fr, None] = np.concatenate((W, H)).flatten()[:, None]
+            except:
+                newframes[:, fr, None] = np.zeros(((nb+nb)*npc, 1))
+        return newframes    
+
+
+    def get_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True):
+        pb = []
+        if not os.path.exists(melodia_file):
+            return pb
+        print 'extracting pitch bihist from melodia...'
+        pb = pbi.PitchBihist(win2sec=self.win2sec)
+        pbihist = pb.bihist_from_melodia(filename=melodia_file, stop_sec=90.0)
+        if nmfpb is True:
+            pbihist = self.nmfpitchbihist(pbihist)
+        pbihist = pd.DataFrame(pbihist.T)
+        if scale:
+            # scale all frames by mean and std of recording
+            pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
+        return pbihist
+
+
+    def load_precomputed_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True):
+        base = os.path.basename(melodia_file)    
+        root = '/import/c4dm-05/mariap/Melodia-melody-'+str(int(self.win2sec))+'sec/'
+        print 'load precomputed pitch bihist', root
+        if self.win2sec == 8:
+            pbihist = pd.read_csv(os.path.join(root, base)).iloc[1:,1:]
+        else:
+            pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T
+            if nmfpb is True:
+                pbihist = self.nmfpitchbihist(pbihist)
+            pbihist = pd.DataFrame(pbihist.T)
+        print pbihist.shape
+        if scale:
+            # scale all frames by mean and std of recording
+            pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist)
+        return pbihist
+        
+        
+#    def get_pb_chroma_for_file(self, chroma_file=None, scale=True):
+#        ch = []
+#        pb = []
+#        if not os.path.exists(chroma_file):
+#            return ch, pb
+#        songframes = pd.read_csv(chroma_file, engine="c", header=None)
+#        songframes.iloc[np.where(np.isnan(songframes))] = 0
+#        songframes = songframes.iloc[0:min(len(songframes), 18000), :]  # only first 1.5 minutes
+#        chroma = songframes.get_values().T
+#        ch = self.get_ave_chroma(chroma)
+#        pb = self.get_pbihist_from_chroma(chroma)
+#        if scale:
+#            # scale all frames by mean and std of recording
+#            ch = (ch - np.nanmean(ch)) / np.nanstd(ch)                        
+#            pb = (pb - np.nanmean(pb)) / np.nanstd(pb)
+#        return ch, pb
+
+
+#    def get_pbihist_from_chroma(self, chroma, alignchroma=True, nmfpb=True):
+#        pb = pbi.PitchBihist(win2sec=self.win2sec)        
+#        chroma[np.where(np.isnan(chroma))] = 0
+#        if alignchroma:
+#            maxind = np.argmax(np.sum(chroma, axis=1))
+#            chroma = np.roll(chroma, -maxind, axis=0)
+#        pbihist = pb.get_pitchbihist_from_chroma(chroma=chroma, chromasr=self.framessr)
+#        if nmfpb is True:
+#            pbihist = self.nmfpitchbihist(pbihist)
+#        pbihist = pd.DataFrame(pbihist.T)
+#        return pbihist
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/results.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jul 12 20:49:48 2016
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+from collections import Counter
+from sklearn.cluster import KMeans
+
+import utils
+import utils_spatial
+
+
+def country_outlier_df(counts, labels, out_file=None, normalize=False):
+    if len(counts.keys()) < len(np.unique(labels)):
+        for label in np.unique(labels):
+            if not counts.has_key(label):
+                counts.update({label:0})
+    if normalize is True:
+        counts = normalize_outlier_counts(counts, Counter(labels))
+    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
+    df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True)
+    if out_file is not None:   
+        df.to_csv(out_file, index=False)
+    return df
+
+
+def normalize_outlier_counts(outlier_counts, country_counts):
+    '''Normalize a dictionary of outlier counts per country by 
+        the total number of recordings per country
+    '''
+    for key in outlier_counts.keys():
+        # dictionaries should have the same keys
+        outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key])
+    return outlier_counts
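+# Editor's note -- an illustrative sketch, not part of the original changeset.
+# Example: outlier_counts = {'A': 3, 'B': 1} with country_counts = Counter({'A': 30, 'B': 10})
+# gives {'A': 0.1, 'B': 0.1}, i.e. the fraction of recordings per country flagged as outliers.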
+
+
+def get_outliers_df(X, Y, chi2thr=0.999, out_file=None):
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr)
+    global_counts = Counter(Y[y_pred])
+    df = country_outlier_df(global_counts, Y, normalize=True)
+    if out_file is not None:
+        df.to_csv(out_file, index=False)
+    return df, threshold, MD
+
+
+def print_most_least_outliers_topN(df, N=10):
+    sort_inds = df['Outliers'].argsort()  # ascending order
+    df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]]
+    df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]]
+    print "most outliers " 
+    print df_most
+    print "least outliers " 
+    print df_least
+    
+
+def load_metadata(Yaudio, metadata_file):
+    df = pd.read_csv(metadata_file)
+    df_audio = pd.DataFrame({'Audio':Yaudio})
+    ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio
+    return ddf
+
+
+def clusters_metadata(df, cl_pred, out_file=None):
+    def get_top_N_counts(labels, N=3):
+        ulab, ucount = np.unique(labels, return_counts=True)
+        inds = np.argsort(ucount)
+        return zip(ulab[inds[-N:]],ucount[inds[-N:]])
+    info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))])
+    styles_description = []
+    uniq_cl = np.unique(cl_pred)
+    for ccl in uniq_cl:
+        inds = np.where(cl_pred==ccl)[0]
+        styles_description.append(get_top_N_counts(info[inds], N=3))
+    df_styles = pd.DataFrame(data=styles_description, index=uniq_cl)
+    print df_styles.to_latex()
+    if out_file is not None:
+        df_styles.to_csv(out_file, index=False)
+
+
+# load LDA-transformed frames
+X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb'))
+ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
+w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
+w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
+Xrhy, Xmel, Xmfc, Xchr = X_list
+X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+
+# global outliers
+df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
+print_most_least_outliers_topN(df_global, N=10)
+
+spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True)
+spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers]))
+df_local = country_outlier_df(spatial_counts, Y, normalize=True)
+print_most_least_outliers_topN(df_local, N=10)
+
+feat = [Xrhy, Xmel, Xmfc, Xchr]
+feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+tabs_feat = []
+for i in range(len(feat)):
+    XX = feat[i]
+    df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999)
+    print_most_least_outliers_topN(df_feat, N=5)
+
+# how many styles are there
+#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine")
+bestncl = 13  # fixed cluster count, in place of the silhouette search above
+
+# get cluster predictions and metadata for each cluster
+cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X)
+centroids = cluster_model.cluster_centers_
+cl_pred = cluster_model.predict(X)
+ddf['Clusters'] = cl_pred
+clusters_metadata(ddf, cl_pred)
+
+# how similar are the cultures and which ones seem to be global outliers
+cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
+
+# Moran on Mahalanobis distances
+data = cluster_freq.get_values()
+data_countries = cluster_freq.index
+#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999)
+threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999)
+y = np.sqrt(MD)
+utils_spatial.print_Moran_outliers(y, w, data_countries)
+utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/smoothiecore.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,328 @@
+"""Smoothie
+
+A module aimed at melody salience and melody features. It contains
+
+* a frequency transform inspired by constant-Q transforms
+* an NMF-based melodic salience function
+* methods that transform this salience into melody features (not implemented)
+"""
+
+import sys
+import numpy as np
+from scipy.signal import hann
+from scipy.sparse import csr_matrix
+import librosa
+
+p = {# smoothie q mapping parameters
+     'bpo'           :     36,
+     'break1'        :    500,
+     'break2'        :   4000,
+     'f_max'         :   8000,
+     'fs'            :  16000,
+     # spectrogram parameters
+     'step_size'     :    160,
+     'use_A_wtg'     :  False,
+     'harm_remove'   :  False,
+     # NMF and NMF dictionary parameters
+     's'             :      0.85, # decay parameter
+     'n_harm'        :     50, # number of harmonics per note template
+     'min_midi'      :     25,
+     'max_midi'      :     90.8,
+     'bps'           :      5,
+     'n_iter'        :     30
+     }
+
+def update_nmf_simple(H, W, V):
+	"""Update the gain matrix H using the multiplicative NMF update equation.
+
+	Keyword arguments:
+	H -- gain matrix
+	W -- template matrix
+	V -- target data matrix
+	"""
+	WHV = np.dot(W, H) ** (-1) * V
+	H = H * np.dot(W.transpose(), WHV)
+	return H
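+# Editor's note -- added for clarity, not part of the original changeset.
+# The update above is the multiplicative gain update for KL-divergence NMF,
+#     H <- H * ( W^T (V / (W H)) ),
+# with the usual division by W^T 1 omitted; this is consistent with the note
+# templates being normalised to unit column sum in create_smoothie_nfm_dict.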
+
+def make_a_weight(f):
+	"""Return the values of A-weighting at the input frequencies f."""
+	f = np.array(f)
+	zaehler = 12200 * 12200* (f**4)  # numerator of the A-weighting transfer function
+	nenner = (f*f + 20.6*20.6) * np.sqrt((f*f + 107.7*107.7) * (f*f + 737.9*737.9)) * (f*f + 12200*12200)  # denominator
+	return zaehler / nenner
+
+def nextpow2(x):
+    """Return the smallest integer n such that 2 ** n > x."""
+    return np.ceil(np.log2(x))
+
+def get_smoothie_frequencies(p):
+	"""Calculate filter centres and widths for the smooth Q transform."""
+	# I think this calculation should be reconsidered in order to have
+	# a more principled transition between linear and exponential
+	n = 1.0 / (2.0**(1.0/p['bpo']) - 1)
+	x = p['break1'] * 1.0 / n
+
+	# first linear stretch
+	# f = (np.arange(1, int(np.round(n)) + 1) * x).round().tolist()
+	f = (np.arange(1, int(np.round(n)) + 1) * x).tolist()
+
+	# exponential stretch
+	while max(f) < p['break2']:
+		f.append(max(f) * 2**(1.0/p['bpo']))
+
+	# final linear stretch
+	lastdiff = f[-1] - f[-2]
+	while max(f) < p['f_max']:
+		f.append(max(f) + lastdiff)
+
+	deltaf = np.diff(np.array(f))
+	f = f[:-1]
+
+	return f, deltaf
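+# Editor's note -- added for clarity, not part of the original changeset.
+# With the default parameters the centre frequencies are linearly spaced up to
+# break1 = 500 Hz, then geometrically spaced at a ratio of 2**(1/36) (36 bins
+# per octave) up to break2 = 4000 Hz, then linearly spaced with the last
+# geometric step until f_max = 8000 Hz; deltaf holds the filter widths.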
+
+def create_smoothie_kernel(f, deltaf, fs):
+	"""Create a sparse matrix that maps the complex DFT to the complex 
+	smoothie representation.
+	"""
+	print >>sys.stdout, "[ SMOOTHIE Q kernel calculation ... ]"
+	n_filter = len(f)
+	n_fft = int(2**nextpow2(np.ceil(fs/min(deltaf))))  # FFT length as an integer
+	
+	thresh = 0.0054
+	smoothie_kernel = np.zeros([n_fft, n_filter], np.complex64)
+
+	for i_filter in range(n_filter-1, -1, -1): # descending
+		# print i_filter
+		Q = f[i_filter] * 1.0 / deltaf[i_filter] # local Q for this filter
+		lgth = int(np.ceil(fs * 1.0 / deltaf[i_filter]))
+		lgthinv = 1.0 / lgth
+		offset = int(n_fft/2 - np.ceil(lgth * 0.5))
+		temp = hann(lgth) * lgthinv * \
+			np.exp(2j * np.pi * Q * (np.arange(lgth) - offset) * lgthinv)
+		# print(sum(hann(lgth)), Q, lgth, offset)
+		temp_kernel = np.zeros(n_fft, dtype = np.complex64)
+		temp_kernel[np.arange(lgth) + offset] = temp
+		spec_kernel = np.fft.fft(temp_kernel)
+		spec_kernel[abs(spec_kernel) <= thresh] = 0
+		smoothie_kernel[:,i_filter] = spec_kernel
+	return csr_matrix(smoothie_kernel.conj()/n_fft)
+
+def smoothie_q_spectrogram(x, p):
+	"""Calculate the actual spectrogram with smooth Q frequencies"""
+	print >>sys.stdout, "[ SMOOTHIE Q spectrogram ... ]"
+
+	# precalculate smoothie kernel
+	f, deltaf = get_smoothie_frequencies(p)
+	smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs'])
+	n_fft, n_filter = smoothie_kernel.shape
+	
+	# some preparations
+	n_sample = len(x)
+	# n_frame = int(np.floor(n_sample / p['step_size']))
+	n_frame = int(np.ceil(n_sample / float(p['step_size']))) # added mp
+	t = (np.arange(n_frame) * p['step_size']) * 1.0 / p['fs']
+	smoothie_kernelT = smoothie_kernel.T
+
+	# allocate
+	s = np.zeros((n_filter, n_frame), dtype = np.complex64)
+
+	# pad (if wanted)
+	x = np.concatenate((np.zeros(n_fft/2), x, np.zeros(n_fft/2)))
+	
+	for i_frame in range(n_frame):
+		smpl = p['step_size'] * i_frame
+		block = x[smpl + np.arange(n_fft)]
+		s[:, i_frame] = smoothie_kernelT.dot(np.fft.fft(block))
+
+	if p['use_A_wtg']:
+		a_wtg = make_a_weight(f)
+		s = s * a_wtg[:, np.newaxis]
+
+	return s, t
+
+def mel_triangles(input_f):
+	"""Make matrix with mel filters at the given frequencies.
+	Warning: this is a very coarse mel filterbank.
+	"""
+	n_linearfilters = 3
+	n_logfilters0 = 30 # just something high, will be pruned later
+	linear_spacing = 100
+	log_spacing = 6.0/4
+	n_filter0 = n_linearfilters + n_logfilters0
+
+	freqs = np.zeros(n_filter0+2) # includes one more on either side, hence +2
+	freqs[range(n_linearfilters+1)] = \
+		np.arange(-1,n_linearfilters) * linear_spacing
+	freqs[range(n_linearfilters+1, n_filter0+2)] = \
+		freqs[n_linearfilters] * log_spacing ** np.arange(1, n_logfilters0+2)
+
+	centre_freqs = freqs[1:-1]
+	lower_freqs  = freqs[0:-2]
+	upper_freqs = freqs[2:]
+
+	n_filter = list(np.nonzero(lower_freqs < max(input_f)))[0][-1] + 1
+
+	n_input_f = len(input_f)
+	mtr = np.zeros((n_input_f, n_filter))
+
+	for i_filter in range(n_filter):
+		for i_freq in range(n_input_f):
+			if input_f[i_freq] > lower_freqs[i_filter] \
+				and input_f[i_freq] <= upper_freqs[i_filter]:
+
+				if input_f[i_freq] <= centre_freqs[i_filter]:
+					mtr[i_freq, i_filter] = \
+						(input_f[i_freq] - lower_freqs[i_filter]) * 1.0 / \
+						(centre_freqs[i_filter] - lower_freqs[i_filter])
+				else:
+					mtr[i_freq, i_filter] = \
+						1 - (input_f[i_freq] - centre_freqs[i_filter]) / \
+						(upper_freqs[i_filter] - centre_freqs[i_filter])
+	return mtr
+
+def create_smoothie_nfm_dict(smoothie_kernel, filterf, p):
+	"""Create dictionary matrix with note templates."""
+	n_note = int((p['max_midi'] - p['min_midi']) * p['bps'] + 1)
+	n_fft, n_filter = smoothie_kernel.shape
+	t = ((np.arange(n_fft) + 1) - ((n_fft + 1)*0.5))/p['fs']
+
+	mtr = mel_triangles(filterf)
+	n_template = n_note + mtr.shape[1]
+
+	w = np.zeros((n_filter, n_template), dtype = np.complex64)
+	w[:,n_note:] = mtr
+	f0s = []
+
+	smoothie_kernelT = smoothie_kernel.T
+
+	for i_note in range(n_note):
+		midi = p['min_midi'] + i_note * 1.0 / p['bps']
+		f0 = 440 * 2 ** ((midi-69)*1.0/12)
+		f0s.append(f0)
+		sig = np.zeros(len(t))
+		for i_harm in range(p['n_harm']):
+			f = f0 * (i_harm + 1)
+			if f > p['fs'] * 0.5:
+				continue
+			x = np.sin(2*np.pi*f*t) * p['s']**(i_harm)
+			sig += x
+		w[:, i_note] = smoothie_kernelT.dot(np.fft.fft(sig))
+
+	for i in range(mtr.shape[1]):
+		f0s.append(np.nan)
+
+	w = abs(w)
+	col_sums = w.sum(axis = 0)
+	w = w / col_sums[np.newaxis, :] # normalisation
+	return w, np.array(f0s)
+
+def smoothie_salience(x, p, do_tune = False):
+	"""Calculate melodic salience."""
+	print >>sys.stdout, "[ SMOOTHIE Q salience ... ]"
+
+	# precalculate nmf kernel
+	f, deltaf = get_smoothie_frequencies(p)
+	smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs'])
+	w, f0s = create_smoothie_nfm_dict(smoothie_kernel, f, p)
+
+	# calculate smoothiegram
+	s, t = smoothie_q_spectrogram(x, p)
+	s[s==0] = np.spacing(1) # very small number
+	s = abs(s)
+
+	# run NMF
+	n_frame = len(t)
+	print >>sys.stdout, "[ SMOOTHIE Q : NMF updates ... ]"
+	sal = np.ones((w.shape[1], n_frame))
+	for i_iter in range(p['n_iter']):
+		sal = update_nmf_simple(sal, w, s)
+
+	if do_tune:
+		raise NotImplementedError("Tuning isn't yet implemented in the Python version")
+
+	return sal, t, f0s
+
+def qnormalise(x, q, dim):
+    nrm = np.sum(x**q, axis=dim, keepdims=True)**(1./float(q))
+    nrmmatrix = np.repeat(nrm, x.shape[dim], axis=dim)  # broadcast the norm along the normalised axis
+    x = x / nrmmatrix
+    return x
+
+def wrap_to_octave(sal, param):
+    epsilon = 0.00001
+    nsal = qnormalise(sal, 2, 0)
+    
+    step = 1./float(param['bps'])
+    NMFpitch = np.arange(param['min_midi'],param['max_midi']+step,step)
+    nNote = len(NMFpitch)
+    nsal = nsal[0:nNote, :]
+
+    allsal = nsal
+    allsal[nsal<epsilon] = epsilon
+    
+    # chroma mapping
+    n = param['bps']*12
+    mmap = np.tile(np.eye(n),(1,5))
+    allchroma = mmap.dot(allsal[0:(n*5),:])
+    return allchroma
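+# Editor's note -- added for clarity, not part of the original changeset.
+# wrap_to_octave sums salience bins that lie whole octaves apart: the identity
+# matrix of size bps*12 (= 60 bins per octave at bps = 5) is tiled across 5
+# octaves, so the result is a 60 x n_frame chroma-like representation.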
+    
+def segment_audio(y, sr):
+    tracklength = y.shape[0]/float(sr)
+    startSample = 0
+    endSample = None
+    if tracklength > 90:
+        startPointSec = (tracklength/2.)-20
+        startSample = int(round(startPointSec*sr))
+        endSample = startSample + 45*sr
+    y = y[startSample:endSample]
+    return y
+
+def get_smoothie(filename = 'test.wav', param = None, segment=True, secondframedecomp=False, hopinsec=None):
+    if param is None:
+        param = p
+        param['fs'] = None
+        param['step_size'] = 128
+    
+    y, sr = librosa.load(filename, sr = param['fs'])
+    param['fs'] = sr
+    if hopinsec is not None:
+        param['step_size'] = int(round(hopinsec*param['fs']))
+    if segment:
+        y = segment_audio(y,sr)
+    # sg, t = smoothie_q_spectrogram(y, param)
+    # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param)
+    sal, t, f0s = smoothie_salience(y, param)
+    sal = sal[~np.isnan(f0s), :]  # keep only the note templates (drop mel-filter rows)
+    chroma = wrap_to_octave(sal, param)
+    
+    if secondframedecomp:
+        chromasr = param['fs']/float(param['step_size'])
+        win2 = int(round(8*chromasr))
+        hop2 = int(round(0.5*chromasr))
+        nbins, norigframes = chroma.shape
+        nframes = int(1+np.floor((norigframes-win2)/float(hop2)))
+        avechroma = np.empty((nbins, nframes))
+        for i in range(nframes):  # loop over all 8-sec frames
+            avechroma[:,i] = np.mean(chroma[:, (i*hop2):(i*hop2+win2)], axis=1)
+        chroma = avechroma
+    return chroma
+
+def get_smoothie_for_bihist(filename = 'test.wav', param = None, segment=True, hopinsec=None):
+    if param is None:
+        param = p
+        param['fs'] = None # no downsample
+        param['step_size'] = 128
+    
+    y, sr = librosa.load(filename, sr = param['fs'])
+    param['fs'] = sr
+    if hopinsec is not None:
+        param['step_size'] = int(round(hopinsec*param['fs']))
+    if segment:
+        y = segment_audio(y,sr)
+    # sg, t = smoothie_q_spectrogram(y, param)
+    # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param)
+    sal, t, f0s = smoothie_salience(y, param)
+    sal = sal[~np.isnan(f0s), :]  # keep only the note templates (drop mel-filter rows)
+    chroma = wrap_to_octave(sal, param)
+    chromasr = param['fs']/float(param['step_size'])
+    return (chroma, chromasr)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/util_dataset.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar 15 23:04:24 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+
+def get_train_val_test_idx(X, Y, seed=None):
+    """ Split in train, validation, test sets.
+    
+    Parameters
+    ----------
+    X : np.array
+        Data or indices.
+    Y : np.array
+        Class labels for data in X.
+    seed: int
+        Random seed.
+    Returns
+    -------
+    (X_train, Y_train) : tuple
+        Data X and labels y for the train set
+    (X_val, Y_val) : tuple
+        Data X and labels y for the validation set
+    (X_test, Y_test) : tuple
+        Data X and labels y for the test set
+    
+    """
+    X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y)
+    X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test)
+    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
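+# Editor's note -- an illustrative usage sketch, not part of the original changeset.
+#
+#     X = np.arange(len(Y))  # indices into the metadata frame
+#     train_set, val_set, test_set = get_train_val_test_idx(X, Y, seed=0)
+#     X_train, Y_train = train_set
+#
+# The split is 60/20/20 and stratified, so each class keeps roughly the same
+# proportion in the train, validation and test sets.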
+
+
+def subset_labels(Y, N_min=10, N_max=100, seed=None):
+    """ Subset dataset to contain minimum N_min and maximum N_max instances 
+        per class. Return indices for this subset. 
+    
+    Parameters
+    ----------
+    Y : np.array
+        Class labels
+    N_min : int
+        Minimum instances per class
+    N_max : int
+        Maximum instances per class
+    seed: int
+        Random seed.
+    
+    Returns
+    -------
+    subset_idx : np.array
+        Indices for a subset with classes of size bounded by N_min, N_max
+    
+    """
+    subset_idx = []
+    labels = np.unique(Y)
+    for label in labels:
+        label_idx = np.where(Y==label)[0]
+        counts = len(label_idx)
+        if counts>=N_max:
+            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
+        elif counts>=N_min and counts<N_max:
+            subset_idx.append(label_idx)
+        else:
+            # not enough samples for this class, skip
+            continue
+    if len(subset_idx)>0:
+        subset_idx = np.concatenate(subset_idx, axis=0)
+    return subset_idx
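+# Editor's note -- an illustrative sketch, not part of the original changeset.
+# With class counts {A: 5, B: 50, C: 200} and the defaults N_min=10, N_max=100,
+# class A is skipped, all 50 indices of B are kept, and 100 indices of C are
+# sampled at random without replacement.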
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/utils.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 17 11:26:01 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from collections import Counter
+from sklearn.decomposition import PCA
+from scipy import stats
+from scipy.spatial.distance import mahalanobis
+from sklearn.covariance import MinCovDet
+
+
+def get_outliers(X, chi2thr=0.975):
+    robust_cov = MinCovDet().fit(X)
+    MD = robust_cov.mahalanobis(X)
+    chi2 = stats.chi2
+    degrees_of_freedom = X.shape[1]
+    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
+    y_pred = MD>threshold
+    return threshold, y_pred, MD
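+# Editor's note -- an illustrative usage sketch, not part of the original changeset.
+# With X of shape (n_recordings, n_features):
+#
+#     threshold, y_pred, MD = get_outliers(X, chi2thr=0.999)
+#     outlier_idx = np.where(y_pred)[0]
+#
+# MD holds robust squared Mahalanobis distances (MinCovDet) and threshold is the
+# 0.999 chi-square quantile with n_features degrees of freedom.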
+
+
+def get_outliers_Mahal(X, chi2thr=0.975):
+    n_samples = X.shape[0]
+    inv_cov = np.linalg.inv(np.cov(X.T))
+    col_mean = np.mean(X, axis=0)
+    MD = np.zeros(n_samples, dtype='float')
+    for i in range(n_samples):
+        MD[i] = mahalanobis(X[i,:], col_mean, inv_cov)
+    MD = MD ** 2
+    degrees_of_freedom = X.shape[1]
+    chi2 = stats.chi2
+    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
+    y_pred = MD>threshold
+    return threshold, y_pred, MD
+
+
+def pca_data(X, min_variance=None):
+    # rotate data to avoid singularity in Mahalanobis/covariance matrix
+    model = PCA(whiten=True).fit(X)
+    model.explained_variance_ratio_.sum()
+    if min_variance is None:
+        n_pc = X.shape[1]
+    else:
+        n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0]
+    X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
+    return X_pca, n_pc
+
+
+def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
+    uniq_labels = np.unique(Y)
+    spatial_outliers = []
+    for uniq_label in uniq_labels:
+        countries_neighbors = w_dict[uniq_label]
+        if len(countries_neighbors)==0:
+            print uniq_label, " no neighbors found"
+            continue
+        inds_neighborhood = []
+        for country in countries_neighbors:
+            inds = np.where(Y==country)[0]
+            inds_neighborhood.append(inds) # append neighboring countries
+        if len(np.concatenate(inds_neighborhood))==0:
+            print "no neighbors found"
+            continue
+        inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country
+        inds_neighborhood = np.concatenate(inds_neighborhood)
+        if do_pca:
+            XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
+        else:
+            XX = X[inds_neighborhood, :]  # assume X is already in PCA
+        print len(inds_neighborhood)
+        if len(inds_neighborhood)<XX.shape[1]:
+            print uniq_label, " neighborhood too small for number of features"
+            continue
+        threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
+        counts = Counter(Y[inds_neighborhood[y_pred]])
+        spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood])
+    return spatial_outliers
+
+
+def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
+    ave_silh = []
+    for i in range(min_ncl):
+        ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering
+    for ncl in range(min_ncl, max_ncl):
+        print ncl
+        cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
+        ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg
+    ave_silh = np.array(ave_silh)
+    bestncl = np.nanargmax(ave_silh)
+    return bestncl, ave_silh
+
+
+def get_cluster_freq_linear(X, Y, centroids):
+    """ for each label in Y get the distribution of clusters by linear encoding
+    """
+    def encode_linear(X, centroids):
+        """Linear encoding via the dot product
+        """
+        return np.dot(X, centroids.T)
+    encoding = encode_linear(X, centroids)
+    encoding_df = pd.DataFrame(data=encoding, index=Y) 
+    encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
+    cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum) 
+    cluster_freq.index.name = 'labels'
+    return cluster_freq
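+# Editor's note -- added for clarity, not part of the original changeset.
+# The encoding is X.dot(centroids.T), i.e. one activation per cluster and frame;
+# activations are summed per label (e.g. per country) and then standardised
+# across labels, so each column describes how strongly a label uses that cluster
+# relative to the other labels.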
+
+
+def get_cluster_predictions(X, n_clusters=10):
+    cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X)
+    return cl_pred
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/utils_spatial.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed May 17 11:35:51 2017
+
+@author: mariapanteli
+"""
+import numpy as np
+import json
+import pysal # before shapely in util_plots
+import fiona
+import sys
+sys.path.append('../misc')
+import matplotlib.pyplot as plt
+
+
+def neighbors_from_json_file(data_countries, json_DB='../MergeBL-Smith/data/countries.json'):
+    neighbors = {}
+    with open(json_DB) as json_file:
+        countries_dict = json.load(json_file)
+    country_names = []
+    country_iso = []
+    country_borders_iso = []
+    for country_info in countries_dict:
+        country_names.append(country_info['name']['common'])
+        country_iso.append(country_info['cca3'])
+        country_borders_iso.append(country_info['borders'])
+    # temporary fixes of country names to match json data
+    country_names[country_names.index('United States')] = 'United States of America'
+    country_names[country_names.index('Tanzania')] = 'United Republic of Tanzania'
+    country_names[country_names.index('DR Congo')] = 'Democratic Republic of the Congo'
+    country_names[country_names.index('Czechia')] = 'Czech Republic'
+    for i, country in enumerate(data_countries):
+        neighbors[i] = {} 
+        if country in country_names:
+            if len(country_borders_iso[country_names.index(country)])>0:
+                # if country has neighbors according to json file
+                neighbors_iso = country_borders_iso[country_names.index(country)]
+                neighbors_names = [country_names[country_iso.index(nn)] for nn in neighbors_iso]
+                for neighbor in neighbors_names:
+                    if neighbor in data_countries:
+                        neighbor_idx = np.where(data_countries==neighbor)[0][0]
+                        neighbors[i][neighbor_idx] = 1.0                
+    w = pysal.weights.W(neighbors, id_order=range(len(data_countries)))
+    return w
+
+
+def get_countries_from_shapefile(shapefile):
+    shp = fiona.open(shapefile, 'r')
+    countries = []
+    if shp[0]["properties"].has_key("ADMIN"):
+        country_keyword = "ADMIN"
+    elif shp[0]["properties"].has_key("NAME"):
+        country_keyword = "NAME"
+    else:
+        country_keyword = "admin"
+    for line in shp:
+        countries.append(line["properties"][country_keyword])
+    shp.close()
+    return countries
+
+
+def replace_empty_neighbours_with_KNN(data_countries, w):
+    shapefile = "../MergeBL-Smith/shapefiles/ne_10m_admin_0_countries.shp"
+    no_neighbors_idx = w.islands
+    knn = 10
+    wknn = pysal.knnW_from_shapefile(shapefile, knn)
+    knn_countries = get_countries_from_shapefile(shapefile)
+    neighbors = w.neighbors
+    for nn_idx in no_neighbors_idx:
+        country = data_countries[nn_idx]
+        print country
+        if country not in knn_countries:
+            continue
+        knn_country_idx = knn_countries.index(country)
+        knn_country_neighbors = [knn_countries[nn] for nn in wknn.neighbors[knn_country_idx]]
+        for knn_nn in knn_country_neighbors:
+            if len(neighbors[nn_idx])>2:
+                continue
+            data_country_idx = np.where(data_countries==knn_nn)[0]
+            if len(data_country_idx)>0:
+                neighbors[nn_idx][data_country_idx[0]] = 1.0
+    w = pysal.weights.W(neighbors, id_order=range(len(data_countries)))
+    return w
+
+
+def get_neighbors_for_countries_in_dataset(Y):
+    # neighbors
+    data_countries = np.unique(Y)
+    w = neighbors_from_json_file(data_countries)
+    w = replace_empty_neighbours_with_KNN(data_countries, w)
+    return w, data_countries
+
+
+def from_weights_to_dict(w, data_countries):
+    w_dict = {}
+    for i in w.neighbors:
+        w_dict[data_countries[i]] = [data_countries[nn] for nn in w.neighbors[i]]
+    return w_dict
+
+
+def get_LH_HL_idx(lm, p_vals):
+    sig_idx = np.where(p_vals<0.05)[0]  
+    LH_idx = sig_idx[np.where(lm.q[sig_idx]==2)[0]]
+    HL_idx = sig_idx[np.where(lm.q[sig_idx]==4)[0]]
+    return LH_idx, HL_idx
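+# Editor's note -- added for clarity, not part of the original changeset.
+# In PySAL's Moran_Local, q encodes the scatterplot quadrant (1=HH, 2=LH, 3=LL,
+# 4=HL); LH_idx and HL_idx therefore select the significant spatial outliers,
+# i.e. countries whose value disagrees with the spatially lagged values of
+# their neighbours at p < 0.05.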
+
+
+def print_Moran_outliers(y, w, data_countries):
+    lm = pysal.Moran_Local(y, w)
+    p_vals = lm.p_z_sim
+    LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals)
+    print 'LH', zip(data_countries[LH_idx], p_vals[LH_idx])  # LH
+    print 'HL', zip(data_countries[HL_idx], p_vals[HL_idx])  # HL
+
+
+def plot_Moran_scatterplot(y, w, data_countries, out_file=None):
+    lm = pysal.Moran_Local(y, w)
+    p_vals = lm.p_z_sim
+    LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals)
+    
+    ylt  = pysal.lag_spatial(lm.w, lm.y)
+    yt = lm.y
+    yt = (yt - np.mean(yt))/np.std(yt)
+    ylt = (ylt - np.mean(ylt))/np.std(ylt)
+    colors = plt.cm.spectral(np.linspace(0,1,5))
+    quad = np.zeros(yt.shape, dtype=int)
+    quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH
+    quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH
+    quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL
+    quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL
+    marker_color = colors[quad]
+    marker_size = 40*np.ones(yt.shape, dtype=int)
+    marker_size[LH_idx] = 140
+    marker_size[HL_idx] = 140
+    
+    plt.figure()
+    plt.scatter(yt, ylt, c=marker_color, s=marker_size, alpha=0.7)
+    plt.xlabel('Value')
+    plt.ylabel('Spatially Lagged Value')
+    plt.axvline(c='black', ls='--')
+    plt.axhline(c='black', ls='--')
+    plt.ylim(min(ylt)-0.5, max(ylt)+0.5)
+    plt.xlim(min(yt)-0.5, max(yt)+1.5)
+    for i in np.concatenate((LH_idx, HL_idx)):
+        plt.annotate(data_countries[i], (yt[i], ylt[i]), xytext=(yt[i]*1.1, ylt[i]*1.1),textcoords='data',arrowprops=dict(arrowstyle="->",connectionstyle="arc3"))
+    extreme_points = np.concatenate(([np.argmin(ylt)], [np.argmax(ylt)], np.where(yt>np.mean(yt)+2.8*np.std(yt))[0], np.where(ylt>np.mean(ylt)+2.8*np.std(ylt))[0]))
+    extreme_points = np.array(list(set(extreme_points) - set(np.concatenate((LH_idx, HL_idx)))))
+    for i in extreme_points:
+        plt.annotate(data_countries[i], (yt[i]+0.1, ylt[i]))
+    if out_file is not None:
+        plt.savefig(out_file)
--- a/util_dataset.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 15 23:04:24 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-from sklearn.model_selection import train_test_split
-
-
-def get_train_val_test_idx(X, Y, seed=None):
-    """ Split in train, validation, test sets.
-    
-    Parameters
-    ----------
-    X : np.array
-        Data or indices.
-    Y : np.array
-        Class labels for data in X.
-    seed: int
-        Random seed.
-    Returns
-    -------
-    (X_train, Y_train) : tuple
-        Data X and labels y for the train set
-    (X_val, Y_val) : tuple
-        Data X and labels y for the validation set
-    (X_test, Y_test) : tuple
-        Data X and labels y for the test set
-    
-    """
-    X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y)
-    X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test)
-    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
-
-
-def subset_labels(Y, N_min=10, N_max=100, seed=None):
-    """ Subset dataset to contain minimum N_min and maximum N_max instances 
-        per class. Return indices for this subset. 
-    
-    Parameters
-    ----------
-    Y : np.array
-        Class labels
-    N_min : int
-        Minimum instances per class
-    N_max : int
-        Maximum instances per class
-    seed: int
-        Random seed.
-    
-    Returns
-    -------
-    subset_idx : np.array
-        Indices for a subset with classes of size bounded by N_min, N_max
-    
-    """
-    subset_idx = []
-    labels = np.unique(Y)
-    for label in labels:
-        label_idx = np.where(Y==label)[0]
-        counts = len(label_idx)
-        if counts>=N_max:
-            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
-        elif counts>=N_min and counts<N_max:
-            subset_idx.append(label_idx)
-        else:
-            # not enough samples for this class, skip
-            continue
-    if len(subset_idx)>0:
-        subset_idx = np.concatenate(subset_idx, axis=0)
-    return subset_idx
--- a/util_filter_dataset.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,76 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Apr 10 17:44:48 2017
-
-@author: mariapanteli
-"""
-
-import os
-import numpy as np
-import pandas as pd
-
-
-def get_speech_vamp(df):
-    jspeech = df.columns.get_loc("Speech")
-    nfiles = len(df)
-    speechinds = []
-    for i in range(nfiles):
-        #print i
-        if os.path.exists(df.iat[i, jspeech]) and os.path.getsize(df.iat[i, jspeech])>0:
-            bounds = pd.read_csv(df.iat[i, jspeech], header=None, delimiter='\t').get_values()
-            if len(bounds)>0:
-                if len(np.where(bounds[:,2]=='m')[0])==0 or len(np.where(bounds[:,2]=='s')[0])==len(bounds):
-                    speechinds.append(i)
-    return speechinds
-
-
-def get_speech_meta(df):
-    genres = np.array(df["Genre_Album"].get_values(), dtype=str)
-    speechinds_genre = []
-    invalid_genres = ["Spoken Word", "Language Instruction", "Classical", 
-                        "Poetry", "Nature|Sounds", "Music Instruction", 
-                        "Soundtracks &amp", "Contemporary &amp", "Jazz &amp",
-                        "Sounds", "Ragtime", "Nature", "Electronic",
-                        "African American Spoken", "Blues", "Gospel",
-                        "Psychology &amp"]
-    for i in range(len(genres)):
-        genre = genres[i]
-        #if genre in invalid_genres:
-        if any(x in genre for x in invalid_genres):
-            speechinds_genre.append(i)
-    return speechinds_genre
-
-
-def get_missing_csv(df):
-    nfiles = len(df)
-    missing_csv = []
-    for i in range(nfiles):
-        if not (os.path.exists(df["Melspec"].iloc[i]) and os.path.exists(df["Chroma"].iloc[i]) and os.path.exists(df["Melodia"].iloc[i])):
-            missing_csv.append(i)
-    return missing_csv
-
-
-def get_missing_country_meta(df):
-    nfiles = len(df)
-    missing_country = []
-    country_labels = np.array(df['Country'].get_values(), dtype=str)
-    invalid_countries = ['Unidentified', 'unknown', 'nan', 
-                         'Yugoslavia (former)', 'Pathian village  Wangulei ', 
-                         'Joulouloum  either Senegal or The Gambia ']
-    for i in range(nfiles):
-        country = country_labels[i]
-        if country in invalid_countries:
-            missing_country.append(i)
-    return missing_country
-
-
-def remove_missing_data(df):
-    speechinds_vamp = get_speech_vamp(df)
-    speechinds_genre = get_speech_meta(df)
-    speechinds = set(speechinds_vamp) | set(speechinds_genre)
-    missing = set(get_missing_csv(df))
-    missing_country = set(get_missing_country_meta(df))
-    selectinds = np.asarray(list(set(range(len(df))) - (missing | speechinds | missing_country)))
-
-    df = df.iloc[selectinds, :]
-    return df
\ No newline at end of file
--- a/utils.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,117 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed May 17 11:26:01 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-import pandas as pd
-from sklearn.cluster import KMeans
-from sklearn import metrics
-from collections import Counter
-from sklearn.decomposition import PCA
-from scipy import stats
-from scipy.spatial.distance import mahalanobis
-from sklearn.covariance import MinCovDet
-
-
-def get_outliers(X, chi2thr=0.975):
-    robust_cov = MinCovDet().fit(X)
-    MD = robust_cov.mahalanobis(X)
-    chi2 = stats.chi2
-    degrees_of_freedom = X.shape[1]
-    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
-    y_pred = MD>threshold
-    return threshold, y_pred, MD
-
-
-def get_outliers_Mahal(X, chi2thr=0.975):
-    n_samples = X.shape[0]
-    inv_cov = np.linalg.inv(np.cov(X.T))
-    col_mean = np.mean(X, axis=0)
-    MD = np.zeros(n_samples, dtype='float')
-    for i in range(n_samples):
-        MD[i] = mahalanobis(X[i,:], col_mean, inv_cov)
-    MD = MD ** 2
-    degrees_of_freedom = X.shape[1]
-    chi2 = stats.chi2
-    threshold = chi2.ppf(chi2thr, degrees_of_freedom)
-    y_pred = MD>threshold
-    return threshold, y_pred, MD
-
-
-def pca_data(X, min_variance=None):
-    # rotate data to avoid singularity in Mahalanobis/covariance matrix
-    model = PCA(whiten=True).fit(X)
-    model.explained_variance_ratio_.sum()
-    if min_variance is None:
-        n_pc = X.shape[1]
-    else:
-        n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0]
-    X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X)
-    return X_pca, n_pc
-
-
-def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False):
-    uniq_labels = np.unique(Y)
-    spatial_outliers = []
-    for uniq_label in uniq_labels:
-        countries_neighbors = w_dict[uniq_label]
-        if len(countries_neighbors)==0:
-            print uniq_label, " no neighbors found"
-            continue
-        inds_neighborhood = []
-        for country in countries_neighbors:
-            inds = np.where(Y==country)[0]
-            inds_neighborhood.append(inds) # append neighboring countries
-        if len(np.concatenate(inds_neighborhood))==0:
-            print "no neighbors found"
-            continue
-        inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country
-        inds_neighborhood = np.concatenate(inds_neighborhood)
-        if do_pca:
-            XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99)
-        else:
-            XX = X[inds_neighborhood, :]  # assume X is already in PCA
-        print len(inds_neighborhood)
-        if len(inds_neighborhood)<XX.shape[1]:
-            print uniq_label, " neighborhood too small for number of features"
-            continue
-        threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr)
-        counts = Counter(Y[inds_neighborhood[y_pred]])
-        spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood])
-    return spatial_outliers
-
-
-def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'):
-    ave_silh = []
-    for i in range(min_ncl):
-        ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering
-    for ncl in range(min_ncl, max_ncl):
-        print ncl
-        cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X)
-        ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg
-    ave_silh = np.array(ave_silh)
-    bestncl = np.nanargmax(ave_silh)
-    return bestncl, ave_silh
-
-
-def get_cluster_freq_linear(X, Y, centroids):
-    """ for each label in Y get the distribution of clusters by linear encoding
-    """
-    def encode_linear(X, centroids):
-        """Linear encoding via the dot product
-        """
-        return np.dot(X, centroids.T)
-    encoding = encode_linear(X, centroids)
-    encoding_df = pd.DataFrame(data=encoding, index=Y) 
-    encoding_df_sum = encoding_df.groupby(encoding_df.index).sum()
-    cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum) 
-    cluster_freq.index.name = 'labels'
-    return cluster_freq
-
-
-def get_cluster_predictions(X, n_clusters=10):
-    cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X)
-    return cl_pred
--- a/utils_spatial.py	Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,151 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed May 17 11:35:51 2017
-
-@author: mariapanteli
-"""
-import numpy as np
-import json
-import pysal # before shapely in util_plots
-import fiona
-import sys
-sys.path.append('../misc')
-import matplotlib.pyplot as plt
-
-
-def neighbors_from_json_file(data_countries, json_DB='../MergeBL-Smith/data/countries.json'):
-    neighbors = {}
-    with open(json_DB) as json_file:
-        countries_dict = json.load(json_file)
-    country_names = []
-    country_iso = []
-    country_borders_iso = []
-    for country_info in countries_dict:
-        country_names.append(country_info['name']['common'])
-        country_iso.append(country_info['cca3'])
-        country_borders_iso.append(country_info['borders'])
-    # temporary fixes of country names to match json data
-    country_names[country_names.index('United States')] = 'United States of America'
-    country_names[country_names.index('Tanzania')] = 'United Republic of Tanzania'
-    country_names[country_names.index('DR Congo')] = 'Democratic Republic of the Congo'
-    country_names[country_names.index('Czechia')] = 'Czech Republic'
-    for i, country in enumerate(data_countries):
-        neighbors[i] = {} 
-        if country in country_names:
-            if len(country_borders_iso[country_names.index(country)])>0:
-                # if country has neighbors according to json file
-                neighbors_iso = country_borders_iso[country_names.index(country)]
-                neighbors_names = [country_names[country_iso.index(nn)] for nn in neighbors_iso]
-                for neighbor in neighbors_names:
-                    if neighbor in data_countries:
-                        neighbor_idx = np.where(data_countries==neighbor)[0][0]
-                        neighbors[i][neighbor_idx] = 1.0                
-    w = pysal.weights.W(neighbors, id_order=range(len(data_countries)))
-    return w
-
-
-def get_countries_from_shapefile(shapefile):
-    shp = fiona.open(shapefile, 'r')
-    countries = []
-    if shp[0]["properties"].has_key("ADMIN"):
-        country_keyword = "ADMIN"
-    elif shp[0]["properties"].has_key("NAME"):
-        country_keyword = "NAME"
-    else:
-        country_keyword = "admin"
-    for line in shp:
-        countries.append(line["properties"][country_keyword])
-    shp.close()
-    return countries
-
-
-def replace_empty_neighbours_with_KNN(data_countries, w):
-    shapefile = "../MergeBL-Smith/shapefiles/ne_10m_admin_0_countries.shp"
-    no_neighbors_idx = w.islands
-    knn = 10
-    wknn = pysal.knnW_from_shapefile(shapefile, knn)
-    knn_countries = get_countries_from_shapefile(shapefile)
-    neighbors = w.neighbors
-    for nn_idx in no_neighbors_idx:
-        country = data_countries[nn_idx]
-        print country
-        if country not in knn_countries:
-            continue
-        knn_country_idx = knn_countries.index(country)
-        knn_country_neighbors = [knn_countries[nn] for nn in wknn.neighbors[knn_country_idx]]
-        for knn_nn in knn_country_neighbors:
-            if len(neighbors[nn_idx])>2:
-                continue
-            data_country_idx = np.where(data_countries==knn_nn)[0]
-            if len(data_country_idx)>0:
-                neighbors[nn_idx][data_country_idx[0]] = 1.0
-    w = pysal.weights.W(neighbors, id_order=range(len(data_countries)))
-    return w
-
-
-def get_neighbors_for_countries_in_dataset(Y):
-    # neighbors
-    data_countries = np.unique(Y)
-    w = neighbors_from_json_file(data_countries)
-    w = replace_empty_neighbours_with_KNN(data_countries, w)
-    return w, data_countries
-
-
-def from_weights_to_dict(w, data_countries):
-    w_dict = {}
-    for i in w.neighbors:
-        w_dict[data_countries[i]] = [data_countries[nn] for nn in w.neighbors[i]]
-    return w_dict
-
-
-def get_LH_HL_idx(lm, p_vals):
-    sig_idx = np.where(p_vals<0.05)[0]  
-    LH_idx = sig_idx[np.where(lm.q[sig_idx]==2)[0]]
-    HL_idx = sig_idx[np.where(lm.q[sig_idx]==4)[0]]
-    return LH_idx, HL_idx
-
-
-def print_Moran_outliers(y, w, data_countries):
-    lm = pysal.Moran_Local(y, w)
-    p_vals = lm.p_z_sim
-    LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals)
-    print 'LH', zip(data_countries[LH_idx], p_vals[LH_idx])  # LH
-    print 'HL', zip(data_countries[HL_idx], p_vals[HL_idx])  # HL
-
-
-def plot_Moran_scatterplot(y, w, data_countries, out_file=None):
-    lm = pysal.Moran_Local(y, w)
-    p_vals = lm.p_z_sim
-    LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals)
-    
-    ylt  = pysal.lag_spatial(lm.w, lm.y)
-    yt = lm.y
-    yt = (yt - np.mean(yt))/np.std(yt)
-    ylt = (ylt - np.mean(ylt))/np.std(ylt)
-    colors = plt.cm.spectral(np.linspace(0,1,5))
-    quad = np.zeros(yt.shape, dtype=int)
-    quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH
-    quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH
-    quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL
-    quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL
-    marker_color = colors[quad]
-    marker_size = 40*np.ones(yt.shape, dtype=int)
-    marker_size[LH_idx] = 140
-    marker_size[HL_idx] = 140
-    
-    plt.figure()
-    plt.scatter(yt, ylt, c=marker_color, s=marker_size, alpha=0.7)
-    plt.xlabel('Value')
-    plt.ylabel('Spatially Lagged Value')
-    plt.axvline(c='black', ls='--')
-    plt.axhline(c='black', ls='--')
-    plt.ylim(min(ylt)-0.5, max(ylt)+0.5)
-    plt.xlim(min(yt)-0.5, max(yt)+1.5)
-    for i in np.concatenate((LH_idx, HL_idx)):
-        plt.annotate(data_countries[i], (yt[i], ylt[i]), xytext=(yt[i]*1.1, ylt[i]*1.1),textcoords='data',arrowprops=dict(arrowstyle="->",connectionstyle="arc3"))
-    extreme_points = np.concatenate(([np.argmin(ylt)], [np.argmax(ylt)], np.where(yt>np.mean(yt)+2.8*np.std(yt))[0], np.where(ylt>np.mean(yt)+2.8*np.std(ylt))[0]))
-    extreme_points = np.array(list(set(extreme_points) - set(np.concatenate((LH_idx, HL_idx)))))
-    for i in extreme_points:
-        plt.annotate(data_countries[i], (yt[i]+0.1, ylt[i]))
-    if out_file is not None:
-        plt.savefig(out_file)