plosone_underreview: changeset 4:e50c63cf96be (branch: branch-tests)

rearranging folders

author:    Maria Panteli
date:      Mon, 11 Sep 2017 11:51:50 +0100
parents:   230a0cf17de0
children:  543744ed1ae7
files:     load_dataset.py load_features.py results.py scripts/MFCC.py scripts/OPMellin.py scripts/PitchBihist.py scripts/load_dataset.py scripts/load_features.py scripts/results.py scripts/smoothiecore.py scripts/util_dataset.py scripts/utils.py scripts/utils_spatial.py util_dataset.py util_filter_dataset.py utils.py utils_spatial.py
diffstat:  17 files changed, 1643 insertions(+), 916 deletions(-)
--- a/load_dataset.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 15 22:52:57 2017 - -@author: mariapanteli -""" - -import numpy as np -import pandas as pd -import pickle - -import load_features -import util_dataset -import util_filter_dataset - - -#METADATA_FILE = 'sample_dataset/metadata.csv' -#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle'] -WIN_SIZE = 2 -METADATA_FILE = 'data/metadata_BLSM_language_all.csv' -#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle'] -#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle'] -OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', - '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', - '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle'] - -def extract_features(df, win2sec=8.0): - """Extract features from melspec and chroma. - - Parameters - ---------- - df : pd.DataFrame - Metadata including class label and path to audio, melspec, chroma - win2sec : float - The window size for the second frame decomposition of the features - - Returns - ------- - X : np.array - The features for every frame x every audio file in the dataset - Y : np.array - The class labels for every frame in the dataset - Y_audio : np.array - The audio labels - """ - feat_loader = load_features.FeatureLoader(win2sec=win2sec) - frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df) - print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape - X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1) - Y = Y_df.get_values() - Y_audio = Y_audio_df.get_values() - return X, Y, Y_audio - - -if __name__ == '__main__': - # load dataset - df = pd.read_csv(METADATA_FILE) - df = util_filter_dataset.remove_missing_data(df) - subset_idx = util_dataset.subset_labels(df['Country'].get_values()) - df = df.iloc[subset_idx, :] - X, Y = np.arange(len(df)), df['Country'].get_values() - - # split in train, val, test set - train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y) - - # extract features and write output - X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE) - with open(OUTPUT_FILES[0], 'wb') as f: - pickle.dump([X_train, Y_train, Y_audio_train], f) - - X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE) - with open(OUTPUT_FILES[1], 'wb') as f: - pickle.dump([X_val, Y_val, Y_audio_val], f) - - X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE) - with open(OUTPUT_FILES[2], 'wb') as f: - pickle.dump([X_test, Y_test, Y_audio_test], f) - -#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle' -# pickle.dump([X_test, Y_test, Y_audio_test], f) -#with open(out_file, 'wb') as f:
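The deleted load_dataset.py above (re-added under scripts/ further down) splits the recordings into train/validation/test partitions with the project-specific util_dataset helpers before pickling the extracted features. A minimal stand-alone sketch of the same split-and-serialise pattern, assuming scikit-learn's stratified train_test_split in place of util_dataset.get_train_val_test_idx, with toy labels and hypothetical local output paths:

import pickle
import numpy as np
from sklearn.model_selection import train_test_split

labels = np.repeat(['Greece', 'UK', 'France'], 10)   # toy country labels
indices = np.arange(len(labels))

# stratified 60/20/20 split on the country label, mirroring train/val/test
train_idx, rest_idx, y_train, y_rest = train_test_split(
    indices, labels, test_size=0.4, stratify=labels, random_state=0)
val_idx, test_idx, _, _ = train_test_split(
    rest_idx, y_rest, test_size=0.5, stratify=y_rest, random_state=0)

for name, idx in [('train', train_idx), ('val', val_idx), ('test', test_idx)]:
    with open(name + '_data.pickle', 'wb') as f:      # placeholder output files
        pickle.dump(idx, f)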
--- a/load_features.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,287 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Mar 16 01:50:57 2017 - -@author: mariapanteli -""" - -import numpy as np -import pandas as pd -import os -from sklearn.decomposition import NMF -import OPMellin as opm -import MFCC as mfc -import PitchBihist as pbi - - -class FeatureLoader: - def __init__(self, win2sec=8): - self.win2sec = float(win2sec) - self.sr = 44100. - self.win1 = int(round(0.04*self.sr)) - self.hop1 = int(round(self.win1/8.)) - self.framessr = self.sr/float(self.hop1) - self.win2 = int(round(self.win2sec*self.framessr)) - self.hop2 = int(round(0.5*self.framessr)) - self.framessr2 = self.framessr/float(self.hop2) - - - def get_op_mfcc_for_file(self, melspec_file=None, scale=True): - op = [] - mfc = [] - if not os.path.exists(melspec_file): - return op, mfc - print 'extracting onset patterns and mfccs...' - songframes = pd.read_csv(melspec_file, engine="c", header=None) - songframes.iloc[np.where(np.isnan(songframes))] = 0 - songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes - melspec = songframes.get_values().T - op = self.get_op_from_melspec(melspec, K=2) - mfc = self.get_mfcc_from_melspec(melspec) - if scale: - # scale all frames by mean and std of recording - op = (op - np.nanmean(op)) / np.nanstd(op) - mfc = (mfc - np.nanmean(mfc)) / np.nanstd(mfc) - return op, mfc - - - def get_chroma_for_file(self, chroma_file=None, scale=True): - ch = [] - if not os.path.exists(chroma_file): - return ch - print 'extracting chroma...' - songframes = pd.read_csv(chroma_file, engine="c", header=None) - songframes.iloc[np.where(np.isnan(songframes))] = 0 - songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes - chroma = songframes.get_values().T - ch = self.get_ave_chroma(chroma) - if scale: - # scale all frames by mean and std of recording - ch = (ch - np.nanmean(ch)) / np.nanstd(ch) - return ch - - - def get_music_idx_from_bounds(self, bounds, sr=None): - music_idx = [] - if len(bounds) == 0: - # bounds is empty list - return music_idx - nbounds = bounds.shape[0] - if len(np.where(bounds[:,2]=='m')[0])==0: - # no music segments - return music_idx - elif len(np.where(bounds[:,2]=='s')[0])==nbounds: - # all segments are speech - return music_idx - else: - half_win_hop = int(round(0.5 * self.win2 / float(self.hop2))) - music_bounds = np.where(bounds[:, 2] == 'm')[0] - bounds_in_frames = np.round(np.array(bounds[:, 0], dtype=float) * sr) - duration_in_frames = np.round(np.array(bounds[:, 1], dtype=float) * sr) - for music_bound in music_bounds: - lower_bound = np.max([0, bounds_in_frames[music_bound] - half_win_hop]) - upper_bound = lower_bound + duration_in_frames[music_bound] - half_win_hop - music_idx.append(np.arange(lower_bound, upper_bound, dtype=int)) - if len(music_idx)>0: - music_idx = np.sort(np.concatenate(music_idx)) # it should be sorted, but just in case segments overlap -- remove duplicates if segments overlap - return music_idx - - - def get_music_idx_for_file(self, segmenter_file=None): - music_idx = [] - if os.path.exists(segmenter_file) and os.path.getsize(segmenter_file)>0: - print 'loading speech/music segments...' 
- bounds = pd.read_csv(segmenter_file, header=None, delimiter='\t').get_values() - if bounds.shape[1] == 1: # depends on the computer platform - bounds = pd.read_csv(segmenter_file, header=None, delimiter=',').get_values() - music_idx = self.get_music_idx_from_bounds(bounds, sr=self.framessr2) - return music_idx - - - def get_features(self, df, class_label='Country'): - oplist = [] - mflist = [] - chlist = [] - pblist = [] - clabels = [] - aulabels = [] - n_files = len(df) - for i in range(n_files): - if not (os.path.exists(df['Melspec'].iloc[i]) and os.path.exists(df['Chroma'].iloc[i]) and os.path.exists(df['Melodia'].iloc[i])): - continue - print 'file ' + str(i) + ' of ' + str(n_files) - music_idx = self.get_music_idx_for_file(df['Speech'].iloc[i]) - if len(music_idx)==0: - # no music segments -> skip this file - continue - try: - op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i]) - ch = self.get_chroma_for_file(df['Chroma'].iloc[i]) - #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i]) - pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i]) - #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i]) - except: - continue - min_n_frames = np.min([len(op), len(mfcc), len(ch), len(pb)]) # ideally, features should have the same number of frames - if min_n_frames==0: - # no features extracted -> skip this file - continue - music_idx = music_idx[music_idx<min_n_frames] - n_frames = len(music_idx) - oplist.append(op.iloc[music_idx, :]) - mflist.append(mfcc.iloc[music_idx, :]) - chlist.append(ch.iloc[music_idx, :]) - pblist.append(pb.iloc[music_idx, :]) - clabels.append(pd.DataFrame(np.repeat(df[class_label].iloc[i], n_frames))) - aulabels.append(pd.DataFrame(np.repeat(df['Audio'].iloc[i], n_frames))) - print len(oplist), len(mflist), len(chlist), len(pblist), len(clabels), len(aulabels) - return pd.concat(oplist), pd.concat(mflist), pd.concat(chlist), pd.concat(pblist), pd.concat(clabels), pd.concat(aulabels) - - - def get_op_from_melspec(self, melspec, K=None): - op = opm.OPMellin(win2sec=self.win2sec) - opmellin = op.get_opmellin_from_melspec(melspec=melspec, melsr=self.framessr) - if K is not None: - opmel = self.mean_K_bands(opmellin.T, K) - opmel = pd.DataFrame(opmel) - return opmel - - - def get_mfcc_from_melspec(self, melspec, deltamfcc=True, avelocalframes=True, stdlocalframes=True): - mf = mfc.MFCCs() - mfcc = mf.get_mfccs_from_melspec(melspec=melspec, melsr=self.framessr) - if deltamfcc: - ff = mfcc - ffdiff = np.diff(ff, axis=1) - ffdelta = np.concatenate((ffdiff, ffdiff[:,-1,None]), axis=1) - frames = np.concatenate([ff,ffdelta], axis=0) - mfcc = frames - if avelocalframes: - mfcc = self.average_local_frames(mfcc, getstd=stdlocalframes) - mfcc = pd.DataFrame(mfcc.T) - return mfcc - - - def get_ave_chroma(self, chroma, avelocalframes=True, stdlocalframes=True, alignchroma=True): - chroma[np.where(np.isnan(chroma))] = 0 - if alignchroma: - maxind = np.argmax(np.sum(chroma, axis=1)) - chroma = np.roll(chroma, -maxind, axis=0) - if avelocalframes: - chroma = self.average_local_frames(chroma, getstd=stdlocalframes) - chroma = pd.DataFrame(chroma.T) - return chroma - - - def average_local_frames(self, frames, getstd=False): - nbins, norigframes = frames.shape - if norigframes<self.win2: - nframes = 1 - else: - nframes = int(1+np.floor((norigframes-self.win2)/float(self.hop2))) - if getstd: - aveframes = np.empty((nbins+nbins, nframes)) - for i in range(nframes): # loop over all 8-sec frames - meanf = np.nanmean(frames[:, 
(i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1) - stdf = np.nanstd(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1) - aveframes[:,i] = np.concatenate((meanf,stdf)) - else: - aveframes = np.empty((nbins, nframes)) - for i in range(nframes): # loop over all 8-sec frames - aveframes[:,i] = np.nanmean(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1) - return aveframes - - - def mean_K_bands(self, songframes, K=40, nmels=40): - [F, P] = songframes.shape - Pproc = int((P/nmels)*K) - procframes = np.zeros([F, Pproc]) - niters = int(nmels/K) - nbins = P/nmels # must be 200 bins - for k in range(K): - for j in range(k*niters, (k+1)*niters): - procframes[:, (k*nbins):((k+1)*nbins)] += songframes[:, (j*nbins):((j+1)*nbins)] - procframes /= float(niters) - return procframes - - - def nmfpitchbihist(self, frames): - nbins, nfr = frames.shape - npc = 2 - nb = int(np.sqrt(nbins)) # assume structure of pitch bihist is nbins*nbins - newframes = np.empty(((nb+nb)*npc, nfr)) - for fr in range(nfr): - pb = np.reshape(frames[:, fr], (nb, nb)) - try: - nmfmodel = NMF(n_components=npc).fit(pb) - W = nmfmodel.transform(pb) - H = nmfmodel.components_.T - newframes[:, fr, None] = np.concatenate((W, H)).flatten()[:, None] - except: - newframes[:, fr, None] = np.zeros(((nb+nb)*npc, 1)) - return newframes - - - def get_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True): - pb = [] - if not os.path.exists(melodia_file): - return pb - print 'extracting pitch bihist from melodia...' - pb = pbi.PitchBihist(win2sec=self.win2sec) - pbihist = pb.bihist_from_melodia(filename=melodia_file, stop_sec=90.0) - if nmfpb is True: - pbihist = self.nmfpitchbihist(pbihist) - pbihist = pd.DataFrame(pbihist.T) - if scale: - # scale all frames by mean and std of recording - pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist) - return pbihist - - - def load_precomputed_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True): - base = os.path.basename(melodia_file) - root = '/import/c4dm-05/mariap/Melodia-melody-'+str(int(self.win2sec))+'sec/' - print 'load precomputed pitch bihist', root - if self.win2sec == 8: - pbihist = pd.read_csv(os.path.join(root, base)).iloc[1:,1:] - else: - pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T - if nmfpb is True: - pbihist = self.nmfpitchbihist(pbihist) - pbihist = pd.DataFrame(pbihist.T) - print pbihist.shape - if scale: - # scale all frames by mean and std of recording - pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist) - return pbihist - - -# def get_pb_chroma_for_file(self, chroma_file=None, scale=True): -# ch = [] -# pb = [] -# if not os.path.exists(chroma_file): -# return ch, pb -# songframes = pd.read_csv(chroma_file, engine="c", header=None) -# songframes.iloc[np.where(np.isnan(songframes))] = 0 -# songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes -# chroma = songframes.get_values().T -# ch = self.get_ave_chroma(chroma) -# pb = self.get_pbihist_from_chroma(chroma) -# if scale: -# # scale all frames by mean and std of recording -# ch = (ch - np.nanmean(ch)) / np.nanstd(ch) -# pb = (pb - np.nanmean(pb)) / np.nanstd(pb) -# return ch, pb - - -# def get_pbihist_from_chroma(self, chroma, alignchroma=True, nmfpb=True): -# pb = pbi.PitchBihist(win2sec=self.win2sec) -# chroma[np.where(np.isnan(chroma))] = 0 -# if alignchroma: -# maxind = np.argmax(np.sum(chroma, axis=1)) -# chroma = np.roll(chroma, -maxind, axis=0) -# pbihist = 
pb.get_pitchbihist_from_chroma(chroma=chroma, chromasr=self.framessr) -# if nmfpb is True: -# pbihist = self.nmfpitchbihist(pbihist) -# pbihist = pd.DataFrame(pbihist.T) -# return pbihist \ No newline at end of file
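FeatureLoader.average_local_frames in the file above pools low-level frames into overlapping 8-second windows with a 0.5-second hop, keeping the per-window mean and, optionally, standard deviation of every bin. A compact stand-alone version of that pooling, with illustrative window and hop lengths expressed in frames:

import numpy as np

def pool_frames(frames, win=16, hop=4):
    # frames: (n_bins, n_frames) -> (2*n_bins, n_windows) mean+std summary
    nbins, n = frames.shape
    nwin = 1 if n < win else 1 + (n - win) // hop
    out = np.empty((2 * nbins, nwin))
    for i in range(nwin):
        seg = frames[:, i * hop:min(i * hop + win, n)]
        out[:, i] = np.concatenate([np.nanmean(seg, axis=1), np.nanstd(seg, axis=1)])
    return out

pooled = pool_frames(np.random.rand(40, 100))
print(pooled.shape)   # (80, 22)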
--- a/results.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Tue Jul 12 20:49:48 2016 - -@author: mariapanteli -""" - -import numpy as np -import pandas as pd -import pickle -from collections import Counter -from sklearn.cluster import KMeans - -import utils -import utils_spatial - - -def country_outlier_df(counts, labels, out_file=None, normalize=False): - if len(counts.keys()) < len(np.unique(labels)): - for label in np.unique(labels): - if not counts.has_key(label): - counts.update({label:0}) - if normalize is True: - counts = normalize_outlier_counts(counts, Counter(labels)) - df = pd.DataFrame.from_dict(counts, orient='index').reset_index() - df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True) - if out_file is not None: - df.to_csv(out_file, index=False) - return df - - -def normalize_outlier_counts(outlier_counts, country_counts): - '''Normalize a dictionary of outlier counts per country by - the total number of recordings per country - ''' - for key in outlier_counts.keys(): - # dictionaries should have the same keys - outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key]) - return outlier_counts - - -def get_outliers_df(X, Y, chi2thr=0.999, out_file=None): - threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr) - global_counts = Counter(Y[y_pred]) - df = country_outlier_df(global_counts, Y, normalize=True) - if out_file is not None: - df.to_csv(out_file, index=False) - return df, threshold, MD - - -def print_most_least_outliers_topN(df, N=10): - sort_inds = df['Outliers'].argsort() # ascending order - df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]] - df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]] - print "most outliers " - print df_most - print "least outliers " - print df_least - - -def load_metadata(Yaudio, metadata_file): - df = pd.read_csv(metadata_file) - df_audio = pd.DataFrame({'Audio':Yaudio}) - ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio - return ddf - - -def clusters_metadata(df, cl_pred, out_file=None): - def get_top_N_counts(labels, N=3): - ulab, ucount = np.unique(labels, return_counts=True) - inds = np.argsort(ucount) - return zip(ulab[inds[-N:]],ucount[inds[-N:]]) - info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))]) - styles_description = [] - uniq_cl = np.unique(cl_pred) - for ccl in uniq_cl: - inds = np.where(cl_pred==ccl)[0] - styles_description.append(get_top_N_counts(info[inds], N=3)) - df_styles = pd.DataFrame(data=styles_description, index=uniq_cl) - print df_styles.to_latex() - if out_file is not None: - df_styles.to_csv(out_file, index=False) - - -# load LDA-transformed frames -X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb')) -ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv') -w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) -w_dict = utils_spatial.from_weights_to_dict(w, data_countries) -Xrhy, Xmel, Xmfc, Xchr = X_list -X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1) - -# global outliers -df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) -print_most_least_outliers_topN(df_global, N=10) - -spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) -spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) -df_local = country_outlier_df(spatial_counts, Y, normalize=True) 
-print_most_least_outliers_topN(df_local, N=10) - -feat = [Xrhy, Xmel, Xmfc, Xchr] -feat_labels = ['rhy', 'mel', 'mfc', 'chr'] -tabs_feat = [] -for i in range(len(feat)): - XX = feat[i] - df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) - print_most_least_outliers_topN(df_feat, N=5) - -# how many styles are there -#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") -bestncl = 13 - -# get cluster predictions and metadata for each cluster -cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) -centroids = cluster_model.cluster_centers_ -cl_pred = cluster_model.predict(X) -ddf['Clusters'] = cl_pred -clusters_metadata(ddf, cl_pred) - -# how similar are the cultures and which ones seem to be global outliers -cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) - -# Moran on Mahalanobis distances -data = cluster_freq.get_values() -data_countries = cluster_freq.index -#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) -threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) -y = np.sqrt(MD) -utils_spatial.print_Moran_outliers(y, w, data_countries) -utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
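results.py flags a recording as an outlier when its squared Mahalanobis distance from the dataset mean exceeds a chi-square quantile (chi2thr=0.999). utils.get_outliers_Mahal lives elsewhere in the repository; the sketch below reproduces the criterion it is assumed to implement:

import numpy as np
from scipy.stats import chi2

def mahalanobis_outliers(X, chi2thr=0.999):
    mu = X.mean(axis=0)
    inv_cov = np.linalg.pinv(np.cov(X, rowvar=False))
    diff = X - mu
    MD = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)   # squared Mahalanobis distances
    threshold = chi2.ppf(chi2thr, df=X.shape[1])
    return threshold, MD > threshold, MD

X = np.random.randn(500, 5)
threshold, is_outlier, MD = mahalanobis_outliers(X)
print(threshold, is_outlier.sum())   # roughly 0.1% of points exceed the cut-off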
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/MFCC.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Oct 19 12:58:07 2015 + +@author: mariapanteli +""" +import librosa +import scipy.signal +import numpy + +class MFCCs: + def __init__(self): + self.y = None + self.sr = None + self.melspec = None + self.melsr = None + self.win1 = None + self.hop1 = None + self.mfccs = None + + def load_audiofile(self, filename='test.wav', sr=None, segment=True): + self.y, self.sr = librosa.load(filename, sr=sr) + if segment: + tracklength = self.y.shape[0]/float(self.sr) + startSample = 0 + endSample = None + if tracklength > 90: + startPointSec = (tracklength/2.)-20 + startSample = round(startPointSec*self.sr) + endSample = startSample+45*self.sr + self.y = self.y[startSample:endSample] + + def mel_spectrogram(self, y=None, sr=None): + if self.y is None: + self.y = y + if self.sr is None: + self.sr = sr + win1 = int(round(0.04*self.sr)) + hop1 = int(round(win1/8.)) + nfft1 = int(2**numpy.ceil(numpy.log2(win1))) + nmels = 40 + D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming))**2 + #melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels) + melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels, fmax=8000) + melsr = self.sr/float(hop1) + self.melspec = melspec + self.melsr = melsr + + def calc_mfccs(self, y=None, sr=None): + if self.y is None: + self.y = y + if self.sr is None: + self.sr = sr + # require log-amplitude + self.mfccs = librosa.feature.mfcc(S=librosa.logamplitude(self.melspec), n_mfcc=21) + # remove DCT component + self.mfccs = self.mfccs[1:,:] + + def get_mfccs(self, filename='test.wav', secondframedecomp=False): + self.load_audiofile(filename=filename) + self.mel_spectrogram() + self.calc_mfccs() + + if secondframedecomp: + win2 = int(round(8*self.melsr)) + hop2 = int(round(0.5*self.melsr)) + nbins, norigframes = self.melspec.shape + nframes = int(1+numpy.floor((norigframes-win2)/float(hop2))) + avemfccs = numpy.empty((nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + avemfccs[:,i] = numpy.mean(self.mfccs[:, (i*hop2):(i*hop2+win2)], axis=1, keepdims=True) + self.mfccs = avemfccs + return self.mfccs + + def get_mfccs_from_melspec(self, melspec=[], melsr=[]): + self.melspec = melspec + self.melsr = melsr + self.calc_mfccs() + return self.mfccs + +if __name__ == '__main__': + mfs = MFCCs() + mfs.get_mfccs() \ No newline at end of file
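scripts/MFCC.py computes 21 MFCCs from a 40-band mel spectrogram (40 ms windows, 1/8-window hop, 8 kHz ceiling) and discards the first coefficient; it targets an older librosa in which logamplitude() still existed. A rough modern-librosa equivalent, assuming power_to_db as the replacement and a synthetic test signal:

import numpy as np
import librosa

sr = 22050
y = 0.5 * np.sin(2 * np.pi * 440 * np.arange(2 * sr) / sr)   # 2-second sine as a stand-in signal

win = int(round(0.04 * sr))                                    # 40 ms analysis window
hop = int(round(win / 8.0))
n_fft = int(2 ** np.ceil(np.log2(win)))
S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop,
                                   n_mels=40, fmax=8000)
mfcc = librosa.feature.mfcc(S=librosa.power_to_db(S), n_mfcc=21)[1:, :]  # drop the 0th coefficient
print(mfcc.shape)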
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/OPMellin.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Aug 17 13:37:00 2015 + +@author: mariapanteli +""" + +import librosa +import scipy.signal +# import matplotlib.pyplot as plt +import numpy + +class OPMellin: + def __init__(self, win2sec=8): + self.y = None + self.sr = None + self.melspec = None + self.melsr = None + self.op = None + self.opmellin = None + self.win2sec = win2sec + + def load_audiofile(self, filename='test.wav', sr=None, segment=True): + self.y, self.sr = librosa.load(filename, sr=sr) + #self.y, self.sr = librosa.load(filename) # default sr=22050 + if segment: + tracklength = self.y.shape[0]/float(self.sr) + startSample = 0 + endSample = None + if tracklength > 90: + startPointSec = (tracklength/2.)-20 + startSample = round(startPointSec*self.sr) + endSample = startSample+45*self.sr + self.y = self.y[startSample:endSample] + + + # def mel_spectrogram(self, y=None, sr=None, filename='1_21.wav'): + def mel_spectrogram(self, y=None, sr=None): + if self.y is None: + self.y = y + if self.sr is None: + self.sr = sr + win1 = int(round(0.04*self.sr)) + hop1 = int(round(win1/8.)) + nfft1 = int(2**numpy.ceil(numpy.log2(win1))) + nmels = 40 + # y, sr = librosa.load(filename, sr=fs) + # melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=nfft, win_length=win1, window=scipy.signal.hamming, hop_length=hop1, n_mels=nmels) + # melspec = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_fft=nfft, hop_length=hop1, n_mels=nmels) + D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming))**2 + #melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels) + melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels, fmax=8000) + melsr = self.sr/float(hop1) + # librosa.display.specshow(melspec) + # return melspec, melsr + self.melspec = melspec + self.melsr = melsr + + # post process spectrogram + def post_process_spec(self, melspec=None, log=True, medianfilt=False, sqrt=False, smooth=False, diff=True, subtractmean=True, halfwave=True, maxnormal=True): + if self.melspec is None: + self.melspec = melspec + if log: + self.melspec = librosa.logamplitude(self.melspec) + if medianfilt: + ks = int(0.1 * self.melsr) # 100ms kernel size + if ks % 2 == 0: # ks must be odd + ks += 1 + nmels = self.melspec.shape[0] + for i in range(nmels): + self.melspec[i,:] = scipy.signal.medfilt(self.melspec[i,:],kernel_size = ks) + if sqrt: + self.melspec = self.melspec**.5 + if smooth: + step = 50.0 + nmels = self.melspec.shape[0] + mm = self.melspec + for i in range(nmels): + mm[i,:] = numpy.convolve(mm[i,:], numpy.ones(step)/step, mode='same') + self.melspec = mm + if diff: + self.melspec = numpy.concatenate((self.melspec,self.melspec[:,-1,None]),axis=1) # append one frame before diff to keep number of frames the same + self.melspec = numpy.diff(self.melspec, n=1, axis=1) + if subtractmean: + mean = self.melspec.mean(axis=1) + mean.shape = (mean.shape[0], 1) + self.melspec = self.melspec - mean + if halfwave: + self.melspec[numpy.where(self.melspec < 0)] = 0 + if maxnormal: + if self.melspec.max() != 0: # avoid division by 0 + self.melspec = self.melspec/self.melspec.max() + # return melspec + + + def onset_patterns(self, melspec=None, melsr=None, fft=True, logfilter=False, center=True, bpmrange=True): + if self.melspec is None: + self.melspec = melspec + if self.melsr is None: + self.melsr = melsr + win2 = 
int(round(self.win2sec*self.melsr)) + hop2 = int(round(0.5*self.melsr)) + nmels, nmelframes = self.melspec.shape + if fft: # periodicity via fft + #nfft2 = int(2**numpy.ceil(numpy.log2(win2))) + nfft2 = 2048 # nfft2 does not depend on win2?? + melspectemp = self.melspec + if ((nfft2 > win2) and (center is False)): + # pad the signal by nfft2-win2 so that frame decomposition (n_frames) + # is still based on win2 and not nfft2 + melspectemp = numpy.concatenate([numpy.zeros((nmels,int((nfft2 - win2) // 2))),self.melspec, numpy.zeros((nmels,int((nfft2 - win2) // 2)))],axis=1) + if melspectemp.shape[1]<nfft2: + # pad with zeros to have at least one 8-sec window + nzeros = nfft2 - melspectemp.shape[1] + melspectemp = numpy.concatenate([numpy.zeros((nmels,int(numpy.ceil(nzeros / 2.)))),melspectemp, numpy.zeros((nmels,int(numpy.ceil(nzeros / 2.))))],axis=1) + #melspectemp = numpy.concatenate([melspectemp, numpy.zeros((nmels,nzeros))],axis=1) + # nframes = int(round(numpy.ceil(round(nmelframes/hop2)))) + ff0 = numpy.abs(librosa.stft(y=melspectemp[0, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center)) + nframes = ff0.shape[1] + # nmags, nframes = ff0.shape + if logfilter: # log filterbank + nbins = 25 + binsperoct = 5 + fft2 = numpy.zeros((nmels, nbins, nframes)) + logfb = librosa.filters.logfrequency(self.melsr, nfft2, n_bins=nbins, bins_per_octave=binsperoct, fmin=0.5) + # logfb = librosa.filters.logfrequency(self.melsr, nfft2, n_bins=25, bins_per_octave=5, fmin=0.5) + for i in range(nmels): + fftmags = numpy.abs(librosa.stft(y=melspectemp[i, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center)) + fftmags = numpy.dot(logfb, fftmags) # log + fft2[i, :, :] = fftmags + else: + # nmags = min(round(nfft2/2), 200) + if bpmrange: + # freqresinbpm = float(self.melsr)/float(nfft2)*60. + freqresinbpm = float(self.melsr)/float(nfft2/2.)*60. 
+ minmag = int(numpy.floor(30./freqresinbpm)) # min tempo 30bpm + maxmag = int(numpy.ceil(960./freqresinbpm)) # max tempo 960 bpm + magsinds = range(minmag, maxmag) + else: + magsinds = range(0, 200) + nmags = len(magsinds) + fft2 = numpy.zeros((nmels, nmags, nframes)) + for i in range(nmels): + fftmags = numpy.abs(librosa.stft(y=melspectemp[i, :], win_length=win2, hop_length=hop2, n_fft=nfft2, window=scipy.signal.hamming, center=center)) + fftmags = fftmags[magsinds, :] + fft2[i, :, :] = fftmags + op = fft2 + else: # periodicity via autocorrelation + nmags = min(win2, 1000) + if nmelframes < win2: + nframes = 1 + else: + nframes = int(nmelframes/float(win2)) + autocor = numpy.zeros((nmels, nmags, nframes)) + for i in range(nmels): + ac = numpy.zeros((nmags, nframes)) + for j in range(nframes): + ac[:, j] = librosa.autocorrelate(self.melspec[i, (j*win2):min((j+1)*win2, nmelframes)], max_size=nmags) + autocor[i, :, :] = ac + op = autocor + self.op = op + + def post_process_op(self, medianfiltOP=False): + if medianfiltOP: + hop2 = int(round(0.5*self.melsr)) + ssr = self.melsr/float(hop2) + ks = int(0.5 * ssr) # 100ms kernel size + if ks % 2 == 0: # ks must be odd + ks += 1 + nmels, nmags, nframes = self.op.shape + for i in range(nmels): + for j in range(nframes): + #self.op[i,:,j] = scipy.signal.medfilt(self.op[i,:,j],kernel_size = ks) + self.op[i,:,j] = numpy.convolve(self.op[i,:,j], numpy.ones(ks)/ks, mode='same') + + def mellin_transform(self, op=None): + if self.op is None: + self.op = op + nmels, nmags, nframes = self.op.shape + #nmagsout = min(200, nmags) + nmagsout = 200 + u_max = numpy.log(nmags) + delta_c = numpy.pi / u_max + c_max = nmagsout + c = numpy.arange(delta_c, c_max, delta_c) + k = range(1, nmags) + exponent = 0.5 - c*1j + + normMat = 1./(exponent * numpy.sqrt(2*numpy.pi)) + normMat.shape = (normMat.shape[0], 1) + normMat = numpy.repeat(normMat.T, nmels, axis=0) + kernelMat = numpy.asarray([numpy.power(ki, exponent) for ki in k]) + opmellin = numpy.zeros((nmels, nmagsout, nframes)) + for i in range(nframes): + self.op[:, -1, i] = 0 + deltaMat = - numpy.diff(self.op[:, :, i]) + mellin = numpy.abs(numpy.dot(deltaMat, kernelMat) * normMat) + opmellin[:, :, i] = mellin[:, :nmagsout] + self.opmellin = opmellin + + + def post_process_opmellin(self, opmellin=None, aveFreq=False, normFrame=True): + if self.opmellin is None: + self.opmellin = opmellin + if aveFreq: + self.opmellin = numpy.mean(self.opmellin, axis=0) + else: # just reshape + nmels, nmags, nframes = self.opmellin.shape + self.opmellin = self.opmellin.reshape((nmels*nmags, nframes)) + if normFrame: + min_opmellin = numpy.amin(self.opmellin, axis=0, keepdims=True) + max_opmellin = numpy.amax(self.opmellin, axis=0, keepdims=True) + denom = max_opmellin - min_opmellin + denom[denom==0] = 1 # avoid division by 0 if frame is all 0s-silent + self.opmellin = (self.opmellin - min_opmellin) / denom + + + def get_opmellin(self, filename='test.wav', logfilter=False, center=True, medianfiltspec=False, medianfiltOP=False): + self.load_audiofile(filename=filename) + self.mel_spectrogram() + #self.post_process_spec(medianfilt=medianfiltspec) + self.post_process_spec(log=False, sqrt=True, medianfilt=medianfiltspec) # sqrt seems to work better + self.onset_patterns(logfilter=logfilter, center=center) + self.post_process_op(medianfiltOP=medianfiltOP) + self.mellin_transform() + self.post_process_opmellin() + return self.opmellin + + def get_opmellin_from_melspec(self, melspec=[], melsr=[]): + self.melspec = melspec + self.melsr = melsr 
+ self.post_process_spec(log=False, sqrt=True, medianfilt=True) # sqrt seems to work better + self.onset_patterns(logfilter=False, center=False) + self.post_process_op(medianfiltOP=True) + self.mellin_transform() + self.post_process_opmellin() + return self.opmellin + +if __name__ == '__main__': + op = OPMellin() + op.get_opmellin()
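OPMellin measures rhythmic periodicities by taking a second STFT over each mel band's rectified, differentiated envelope, keeping magnitudes in a 30-960 BPM range, and then applying a Mellin (scale) transform for tempo invariance. The fragment below sketches only the periodicity step, with an assumed envelope frame rate of roughly 200 Hz (44.1 kHz audio, ~220-sample hop):

import numpy as np
import librosa

mel_env = np.random.rand(40, 4000)          # toy mel-band envelopes, ~20 s at 200 fps
frame_rate = 200.0
n_fft2, hop2 = 2048, int(round(0.5 * frame_rate))

freq_res_bpm = frame_rate / (n_fft2 / 2.0) * 60.0
lo, hi = int(np.floor(30.0 / freq_res_bpm)), int(np.ceil(960.0 / freq_res_bpm))

op = np.stack([np.abs(librosa.stft(band, n_fft=n_fft2, hop_length=hop2,
                                   window='hamming'))[lo:hi, :]
               for band in mel_env])        # (n_mels, n_periodicities, n_frames)
print(op.shape)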
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/PitchBihist.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 2 22:26:10 2016 + +@author: mariapanteli +""" +import smoothiecore as s +import numpy +import scipy.signal +# import librosa + + +class PitchBihist: + def __init__(self, win2sec=8): + self.y = None + self.sr = None + self.chroma = None + self.chromasr = None + self.bihist = None + self.win2sec = win2sec + + def bihist_from_chroma(self, filename='test.wav', secondframedecomp=True): + self.chroma, self.chromasr = s.get_smoothie_for_bihist(filename=filename, hopinsec=0.005) + if secondframedecomp: + win2 = int(round(8*self.chromasr)) + hop2 = int(round(0.5*self.chromasr)) + nbins, norigframes = self.chroma.shape + if norigframes<win2: + nframes = 1 + else: + nframes = int(1+numpy.floor((norigframes-win2)/float(hop2))) + bihistframes = numpy.empty((nbins*nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + frame = self.chroma[:, (i*hop2):min((i*hop2+win2),norigframes)] + bihist = self.bihistogram(frame) + bihist = numpy.reshape(bihist, -1) + bihistframes[:, i] = bihist + self.bihist = bihistframes + else: + self.bihist = numpy.reshape(self.bihistogram(), -1) + + def bihist_from_melodia(self, filename='sample_melodia.csv', secondframedecomp=True, stop_sec=None): + def hz_to_cents(freq_Hz, ref_Hz=32.703, n_cents=1200): + """ convert frequency values from Hz to cents + reference frequency at C1 + """ + freq_cents = numpy.round(n_cents * numpy.log2(freq_Hz/ref_Hz)) + return freq_cents + def wrap_to_octave(cents, octave_length=1200): + """ wrap to a single octave 0-1200 + """ + octave_cents = cents % octave_length + return octave_cents + + n_bins = 60 + data = numpy.loadtxt(filename, delimiter=',') + times, freqs = (data[:, 0], data[:, 1]) + self.chromasr = 1. / (times[1] - times[0]) + #self.chromasr = 128. 
+ if stop_sec is not None: + stop_idx = numpy.where(times < stop_sec)[0] + times, freqs = times[stop_idx], freqs[stop_idx] + freqs[freqs<=0] = numpy.nan + #melody = freqs[freqs>0] + melody = freqs + n_frames = len(melody) + melody_cents = hz_to_cents(melody, n_cents=n_bins) + melody_octave = wrap_to_octave(melody_cents, octave_length=n_bins) + melody_matrix = numpy.zeros((n_bins, n_frames)) + for time, pitch in enumerate(melody_octave): + if not numpy.isnan(pitch): + melody_matrix[int(pitch), time] = 1 + if secondframedecomp: + win2 = int(round(self.win2sec*self.chromasr)) + hop2 = int(round(0.5*self.chromasr)) + nbins, norigframes = melody_matrix.shape + if norigframes<win2: + nframes = 1 + win2 = norigframes + else: + nframes = int(1+numpy.floor((norigframes-win2)/float(hop2))) + bihistframes = numpy.empty((nbins*nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + frame = melody_matrix[:, (i*hop2):(i*hop2+win2)] + bihist = self.bihistogram(frame) + bihist = numpy.reshape(bihist, -1) + bihistframes[:, i] = bihist + self.bihist = bihistframes + else: + self.bihist = self.bihistogram(melody_matrix) + return self.bihist + + def bihistogram(self, spec, winsec=0.5, align=True): + win = int(round(winsec*self.chromasr)) + ker = numpy.concatenate([numpy.zeros((win, 1)), numpy.ones((win+1, 1))], axis=0) + spec = spec.T # transpose to have franes as rows in convolution + + # energy threshold + thr = 0.3*numpy.max(spec) + spec[spec < max(thr, 0)] = 0 + + # transitions via convolution + tra = scipy.signal.convolve2d(spec, ker, mode='same') + tra[spec > 0] = 0 + + # multiply with original + B = numpy.dot(tra.T, spec) + + # normalize + mxB = numpy.max(B) + mnB = numpy.min(B) + if mxB != mnB: + B = (B - mnB)/float(mxB-mnB) + + # circshift to highest? + if align: + ref = numpy.argmax(numpy.sum(spec, axis=0)) + B = numpy.roll(B, -ref, axis=0) + B = numpy.roll(B, -ref, axis=1) + return B + + def bihist_from_precomp_chroma(self, align=False): + win2 = int(round(self.win2sec*self.chromasr)) + hop2 = int(round(0.5*self.chromasr)) + nbins, norigframes = self.chroma.shape + if norigframes<win2: + nframes = 1 + else: + nframes = int(1+numpy.floor((norigframes-win2)/float(hop2))) + bihistframes = numpy.empty((nbins*nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + frame = self.chroma[:, (i*hop2):min((i*hop2+win2),norigframes)] + bihist = self.bihistogram(frame, align=align) + bihist = numpy.reshape(bihist, -1) + bihistframes[:, i] = bihist + self.bihist = bihistframes + + def get_pitchbihist(self, filename='test.wav'): + self.bihist_from_chroma(filename=filename) + return self.bihist + + def get_pitchbihist_from_chroma(self, chroma=[], chromasr=[]): + self.chroma = chroma + self.chromasr = chromasr + self.bihist_from_precomp_chroma(align=False) + return self.bihist + + +if __name__ == '__main__': + pb = PitchBihist() + pb.get_pitchbihist()
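bihist_from_melodia quantises the Melodia pitch track to 60 bins per octave (20-cent resolution) relative to C1 (32.703 Hz) and wraps the result to a single octave before counting pitch transitions. A worked example of that mapping:

import numpy as np

def hz_to_octave_bin(freq_hz, ref_hz=32.703, bins_per_octave=60):
    bins = np.round(bins_per_octave * np.log2(freq_hz / ref_hz))
    return bins % bins_per_octave

print(hz_to_octave_bin(440.0))    # A4 -> bin 45 (900 cents above C)
print(hz_to_octave_bin(261.63))   # C4 -> bin 0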
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/load_dataset.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 15 22:52:57 2017 + +@author: mariapanteli +""" + +import numpy as np +import pandas as pd +import pickle + +import load_features +import util_dataset +import util_filter_dataset + + +#METADATA_FILE = 'sample_dataset/metadata.csv' +#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle'] +WIN_SIZE = 2 +METADATA_FILE = 'data/metadata_BLSM_language_all.csv' +#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle'] +#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle'] +OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', + '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', + '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle'] + +def extract_features(df, win2sec=8.0): + """Extract features from melspec and chroma. + + Parameters + ---------- + df : pd.DataFrame + Metadata including class label and path to audio, melspec, chroma + win2sec : float + The window size for the second frame decomposition of the features + + Returns + ------- + X : np.array + The features for every frame x every audio file in the dataset + Y : np.array + The class labels for every frame in the dataset + Y_audio : np.array + The audio labels + """ + feat_loader = load_features.FeatureLoader(win2sec=win2sec) + frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df) + print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape + X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1) + Y = Y_df.get_values() + Y_audio = Y_audio_df.get_values() + return X, Y, Y_audio + + +if __name__ == '__main__': + # load dataset + df = pd.read_csv(METADATA_FILE) + df = util_filter_dataset.remove_missing_data(df) + subset_idx = util_dataset.subset_labels(df['Country'].get_values()) + df = df.iloc[subset_idx, :] + X, Y = np.arange(len(df)), df['Country'].get_values() + + # split in train, val, test set + train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y) + + # extract features and write output + X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE) + with open(OUTPUT_FILES[0], 'wb') as f: + pickle.dump([X_train, Y_train, Y_audio_train], f) + + X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE) + with open(OUTPUT_FILES[1], 'wb') as f: + pickle.dump([X_val, Y_val, Y_audio_val], f) + + X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE) + with open(OUTPUT_FILES[2], 'wb') as f: + pickle.dump([X_test, Y_test, Y_audio_test], f) + +#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle' +# pickle.dump([X_test, Y_test, Y_audio_test], f) +#with open(out_file, 'wb') as f:
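scripts/load_dataset.py re-adds the driver script unchanged. As a toy illustration of how extract_features assembles its outputs: the four per-frame feature blocks are concatenated column-wise and each recording's class label is repeated once per frame (the dimensions here are made up):

import numpy as np

rhy, mel, mfc, chrm = (np.random.rand(12, 10) for _ in range(4))   # 12 frames, 10 dims each
X = np.concatenate((rhy, mel, mfc, chrm), axis=1)                  # (12, 40)
Y = np.repeat('Greece', X.shape[0])                                # one label per frame
print(X.shape, Y.shape)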
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/load_features.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,287 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Mar 16 01:50:57 2017 + +@author: mariapanteli +""" + +import numpy as np +import pandas as pd +import os +from sklearn.decomposition import NMF +import OPMellin as opm +import MFCC as mfc +import PitchBihist as pbi + + +class FeatureLoader: + def __init__(self, win2sec=8): + self.win2sec = float(win2sec) + self.sr = 44100. + self.win1 = int(round(0.04*self.sr)) + self.hop1 = int(round(self.win1/8.)) + self.framessr = self.sr/float(self.hop1) + self.win2 = int(round(self.win2sec*self.framessr)) + self.hop2 = int(round(0.5*self.framessr)) + self.framessr2 = self.framessr/float(self.hop2) + + + def get_op_mfcc_for_file(self, melspec_file=None, scale=True): + op = [] + mfc = [] + if not os.path.exists(melspec_file): + return op, mfc + print 'extracting onset patterns and mfccs...' + songframes = pd.read_csv(melspec_file, engine="c", header=None) + songframes.iloc[np.where(np.isnan(songframes))] = 0 + songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes + melspec = songframes.get_values().T + op = self.get_op_from_melspec(melspec, K=2) + mfc = self.get_mfcc_from_melspec(melspec) + if scale: + # scale all frames by mean and std of recording + op = (op - np.nanmean(op)) / np.nanstd(op) + mfc = (mfc - np.nanmean(mfc)) / np.nanstd(mfc) + return op, mfc + + + def get_chroma_for_file(self, chroma_file=None, scale=True): + ch = [] + if not os.path.exists(chroma_file): + return ch + print 'extracting chroma...' + songframes = pd.read_csv(chroma_file, engine="c", header=None) + songframes.iloc[np.where(np.isnan(songframes))] = 0 + songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes + chroma = songframes.get_values().T + ch = self.get_ave_chroma(chroma) + if scale: + # scale all frames by mean and std of recording + ch = (ch - np.nanmean(ch)) / np.nanstd(ch) + return ch + + + def get_music_idx_from_bounds(self, bounds, sr=None): + music_idx = [] + if len(bounds) == 0: + # bounds is empty list + return music_idx + nbounds = bounds.shape[0] + if len(np.where(bounds[:,2]=='m')[0])==0: + # no music segments + return music_idx + elif len(np.where(bounds[:,2]=='s')[0])==nbounds: + # all segments are speech + return music_idx + else: + half_win_hop = int(round(0.5 * self.win2 / float(self.hop2))) + music_bounds = np.where(bounds[:, 2] == 'm')[0] + bounds_in_frames = np.round(np.array(bounds[:, 0], dtype=float) * sr) + duration_in_frames = np.round(np.array(bounds[:, 1], dtype=float) * sr) + for music_bound in music_bounds: + lower_bound = np.max([0, bounds_in_frames[music_bound] - half_win_hop]) + upper_bound = lower_bound + duration_in_frames[music_bound] - half_win_hop + music_idx.append(np.arange(lower_bound, upper_bound, dtype=int)) + if len(music_idx)>0: + music_idx = np.sort(np.concatenate(music_idx)) # it should be sorted, but just in case segments overlap -- remove duplicates if segments overlap + return music_idx + + + def get_music_idx_for_file(self, segmenter_file=None): + music_idx = [] + if os.path.exists(segmenter_file) and os.path.getsize(segmenter_file)>0: + print 'loading speech/music segments...' 
+ bounds = pd.read_csv(segmenter_file, header=None, delimiter='\t').get_values() + if bounds.shape[1] == 1: # depends on the computer platform + bounds = pd.read_csv(segmenter_file, header=None, delimiter=',').get_values() + music_idx = self.get_music_idx_from_bounds(bounds, sr=self.framessr2) + return music_idx + + + def get_features(self, df, class_label='Country'): + oplist = [] + mflist = [] + chlist = [] + pblist = [] + clabels = [] + aulabels = [] + n_files = len(df) + for i in range(n_files): + if not (os.path.exists(df['Melspec'].iloc[i]) and os.path.exists(df['Chroma'].iloc[i]) and os.path.exists(df['Melodia'].iloc[i])): + continue + print 'file ' + str(i) + ' of ' + str(n_files) + music_idx = self.get_music_idx_for_file(df['Speech'].iloc[i]) + if len(music_idx)==0: + # no music segments -> skip this file + continue + try: + op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i]) + ch = self.get_chroma_for_file(df['Chroma'].iloc[i]) + #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i]) + pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i]) + #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i]) + except: + continue + min_n_frames = np.min([len(op), len(mfcc), len(ch), len(pb)]) # ideally, features should have the same number of frames + if min_n_frames==0: + # no features extracted -> skip this file + continue + music_idx = music_idx[music_idx<min_n_frames] + n_frames = len(music_idx) + oplist.append(op.iloc[music_idx, :]) + mflist.append(mfcc.iloc[music_idx, :]) + chlist.append(ch.iloc[music_idx, :]) + pblist.append(pb.iloc[music_idx, :]) + clabels.append(pd.DataFrame(np.repeat(df[class_label].iloc[i], n_frames))) + aulabels.append(pd.DataFrame(np.repeat(df['Audio'].iloc[i], n_frames))) + print len(oplist), len(mflist), len(chlist), len(pblist), len(clabels), len(aulabels) + return pd.concat(oplist), pd.concat(mflist), pd.concat(chlist), pd.concat(pblist), pd.concat(clabels), pd.concat(aulabels) + + + def get_op_from_melspec(self, melspec, K=None): + op = opm.OPMellin(win2sec=self.win2sec) + opmellin = op.get_opmellin_from_melspec(melspec=melspec, melsr=self.framessr) + if K is not None: + opmel = self.mean_K_bands(opmellin.T, K) + opmel = pd.DataFrame(opmel) + return opmel + + + def get_mfcc_from_melspec(self, melspec, deltamfcc=True, avelocalframes=True, stdlocalframes=True): + mf = mfc.MFCCs() + mfcc = mf.get_mfccs_from_melspec(melspec=melspec, melsr=self.framessr) + if deltamfcc: + ff = mfcc + ffdiff = np.diff(ff, axis=1) + ffdelta = np.concatenate((ffdiff, ffdiff[:,-1,None]), axis=1) + frames = np.concatenate([ff,ffdelta], axis=0) + mfcc = frames + if avelocalframes: + mfcc = self.average_local_frames(mfcc, getstd=stdlocalframes) + mfcc = pd.DataFrame(mfcc.T) + return mfcc + + + def get_ave_chroma(self, chroma, avelocalframes=True, stdlocalframes=True, alignchroma=True): + chroma[np.where(np.isnan(chroma))] = 0 + if alignchroma: + maxind = np.argmax(np.sum(chroma, axis=1)) + chroma = np.roll(chroma, -maxind, axis=0) + if avelocalframes: + chroma = self.average_local_frames(chroma, getstd=stdlocalframes) + chroma = pd.DataFrame(chroma.T) + return chroma + + + def average_local_frames(self, frames, getstd=False): + nbins, norigframes = frames.shape + if norigframes<self.win2: + nframes = 1 + else: + nframes = int(1+np.floor((norigframes-self.win2)/float(self.hop2))) + if getstd: + aveframes = np.empty((nbins+nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + meanf = np.nanmean(frames[:, 
(i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1) + stdf = np.nanstd(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1) + aveframes[:,i] = np.concatenate((meanf,stdf)) + else: + aveframes = np.empty((nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + aveframes[:,i] = np.nanmean(frames[:, (i*self.hop2):min((i*self.hop2+self.win2),norigframes)], axis=1) + return aveframes + + + def mean_K_bands(self, songframes, K=40, nmels=40): + [F, P] = songframes.shape + Pproc = int((P/nmels)*K) + procframes = np.zeros([F, Pproc]) + niters = int(nmels/K) + nbins = P/nmels # must be 200 bins + for k in range(K): + for j in range(k*niters, (k+1)*niters): + procframes[:, (k*nbins):((k+1)*nbins)] += songframes[:, (j*nbins):((j+1)*nbins)] + procframes /= float(niters) + return procframes + + + def nmfpitchbihist(self, frames): + nbins, nfr = frames.shape + npc = 2 + nb = int(np.sqrt(nbins)) # assume structure of pitch bihist is nbins*nbins + newframes = np.empty(((nb+nb)*npc, nfr)) + for fr in range(nfr): + pb = np.reshape(frames[:, fr], (nb, nb)) + try: + nmfmodel = NMF(n_components=npc).fit(pb) + W = nmfmodel.transform(pb) + H = nmfmodel.components_.T + newframes[:, fr, None] = np.concatenate((W, H)).flatten()[:, None] + except: + newframes[:, fr, None] = np.zeros(((nb+nb)*npc, 1)) + return newframes + + + def get_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True): + pb = [] + if not os.path.exists(melodia_file): + return pb + print 'extracting pitch bihist from melodia...' + pb = pbi.PitchBihist(win2sec=self.win2sec) + pbihist = pb.bihist_from_melodia(filename=melodia_file, stop_sec=90.0) + if nmfpb is True: + pbihist = self.nmfpitchbihist(pbihist) + pbihist = pd.DataFrame(pbihist.T) + if scale: + # scale all frames by mean and std of recording + pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist) + return pbihist + + + def load_precomputed_pb_from_melodia(self, melodia_file=None, nmfpb=True, scale=True): + base = os.path.basename(melodia_file) + root = '/import/c4dm-05/mariap/Melodia-melody-'+str(int(self.win2sec))+'sec/' + print 'load precomputed pitch bihist', root + if self.win2sec == 8: + pbihist = pd.read_csv(os.path.join(root, base)).iloc[1:,1:] + else: + pbihist = np.loadtxt(os.path.join(root, base), delimiter=',').T + if nmfpb is True: + pbihist = self.nmfpitchbihist(pbihist) + pbihist = pd.DataFrame(pbihist.T) + print pbihist.shape + if scale: + # scale all frames by mean and std of recording + pbihist = (pbihist - np.nanmean(pbihist)) / np.nanstd(pbihist) + return pbihist + + +# def get_pb_chroma_for_file(self, chroma_file=None, scale=True): +# ch = [] +# pb = [] +# if not os.path.exists(chroma_file): +# return ch, pb +# songframes = pd.read_csv(chroma_file, engine="c", header=None) +# songframes.iloc[np.where(np.isnan(songframes))] = 0 +# songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes +# chroma = songframes.get_values().T +# ch = self.get_ave_chroma(chroma) +# pb = self.get_pbihist_from_chroma(chroma) +# if scale: +# # scale all frames by mean and std of recording +# ch = (ch - np.nanmean(ch)) / np.nanstd(ch) +# pb = (pb - np.nanmean(pb)) / np.nanstd(pb) +# return ch, pb + + +# def get_pbihist_from_chroma(self, chroma, alignchroma=True, nmfpb=True): +# pb = pbi.PitchBihist(win2sec=self.win2sec) +# chroma[np.where(np.isnan(chroma))] = 0 +# if alignchroma: +# maxind = np.argmax(np.sum(chroma, axis=1)) +# chroma = np.roll(chroma, -maxind, axis=0) +# pbihist = 
pb.get_pitchbihist_from_chroma(chroma=chroma, chromasr=self.framessr) +# if nmfpb is True: +# pbihist = self.nmfpitchbihist(pbihist) +# pbihist = pd.DataFrame(pbihist.T) +# return pbihist \ No newline at end of file
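nmfpitchbihist in FeatureLoader compresses each 60x60 pitch bihistogram into a rank-2 non-negative factorisation, keeping the stacked basis (W) and activation (H) vectors as the frame's descriptor. A small stand-alone sketch of that reduction with scikit-learn's NMF (the init and iteration settings are assumptions, not the repository's defaults):

import numpy as np
from sklearn.decomposition import NMF

bihist = np.random.rand(60, 60)                 # one pitch bihistogram frame
model = NMF(n_components=2, init='nndsvda', max_iter=500).fit(bihist)
W = model.transform(bihist)                     # (60, 2)
H = model.components_.T                         # (60, 2)
feature = np.concatenate((W, H)).flatten()      # 240-dimensional frame descriptor
print(feature.shape)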
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/results.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Jul 12 20:49:48 2016 + +@author: mariapanteli +""" + +import numpy as np +import pandas as pd +import pickle +from collections import Counter +from sklearn.cluster import KMeans + +import utils +import utils_spatial + + +def country_outlier_df(counts, labels, out_file=None, normalize=False): + if len(counts.keys()) < len(np.unique(labels)): + for label in np.unique(labels): + if not counts.has_key(label): + counts.update({label:0}) + if normalize is True: + counts = normalize_outlier_counts(counts, Counter(labels)) + df = pd.DataFrame.from_dict(counts, orient='index').reset_index() + df.rename(columns={'index':'Country', 0:'Outliers'}, inplace=True) + if out_file is not None: + df.to_csv(out_file, index=False) + return df + + +def normalize_outlier_counts(outlier_counts, country_counts): + '''Normalize a dictionary of outlier counts per country by + the total number of recordings per country + ''' + for key in outlier_counts.keys(): + # dictionaries should have the same keys + outlier_counts[key] = float(outlier_counts[key]) / float(country_counts[key]) + return outlier_counts + + +def get_outliers_df(X, Y, chi2thr=0.999, out_file=None): + threshold, y_pred, MD = utils.get_outliers_Mahal(X, chi2thr=chi2thr) + global_counts = Counter(Y[y_pred]) + df = country_outlier_df(global_counts, Y, normalize=True) + if out_file is not None: + df.to_csv(out_file, index=False) + return df, threshold, MD + + +def print_most_least_outliers_topN(df, N=10): + sort_inds = df['Outliers'].argsort() # ascending order + df_most = df[['Country', 'Outliers']].iloc[sort_inds[::-1][:N]] + df_least = df[['Country', 'Outliers']].iloc[sort_inds[:N]] + print "most outliers " + print df_most + print "least outliers " + print df_least + + +def load_metadata(Yaudio, metadata_file): + df = pd.read_csv(metadata_file) + df_audio = pd.DataFrame({'Audio':Yaudio}) + ddf = pd.merge(df_audio, df, on='Audio', suffixes=['', '_r']) # in the order of Yaudio + return ddf + + +def clusters_metadata(df, cl_pred, out_file=None): + def get_top_N_counts(labels, N=3): + ulab, ucount = np.unique(labels, return_counts=True) + inds = np.argsort(ucount) + return zip(ulab[inds[-N:]],ucount[inds[-N:]]) + info = np.array([str(df['Country'].iloc[i]) for i in range(len(df))]) + styles_description = [] + uniq_cl = np.unique(cl_pred) + for ccl in uniq_cl: + inds = np.where(cl_pred==ccl)[0] + styles_description.append(get_top_N_counts(info[inds], N=3)) + df_styles = pd.DataFrame(data=styles_description, index=uniq_cl) + print df_styles.to_latex() + if out_file is not None: + df_styles.to_csv(out_file, index=False) + + +# load LDA-transformed frames +X_list, Y, Yaudio = pickle.load(open('data/lda_data_8.pickle','rb')) +ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv') +w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y) +w_dict = utils_spatial.from_weights_to_dict(w, data_countries) +Xrhy, Xmel, Xmfc, Xchr = X_list +X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1) + +# global outliers +df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999) +print_most_least_outliers_topN(df_global, N=10) + +spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.999, do_pca=True) +spatial_counts = Counter(dict([(ll[0],ll[1]) for ll in spatial_outliers])) +df_local = country_outlier_df(spatial_counts, Y, normalize=True) 
+print_most_least_outliers_topN(df_local, N=10) + +feat = [Xrhy, Xmel, Xmfc, Xchr] +feat_labels = ['rhy', 'mel', 'mfc', 'chr'] +tabs_feat = [] +for i in range(len(feat)): + XX = feat[i] + df_feat, threshold, MD = get_outliers_df(XX, Y, chi2thr=0.999) + print_most_least_outliers_topN(df_feat, N=5) + +# how many styles are there +#bestncl, ave_silh = utils.best_n_clusters_silhouette(X, min_ncl=5, max_ncl=50, metric="cosine") +bestncl = 13 + +# get cluster predictions and metadata for each cluster +cluster_model = KMeans(n_clusters=bestncl, random_state=50).fit(X) +centroids = cluster_model.cluster_centers_ +cl_pred = cluster_model.predict(X) +ddf['Clusters'] = cl_pred +clusters_metadata(ddf, cl_pred) + +# how similar are the cultures and which ones seem to be global outliers +cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids) + +# Moran on Mahalanobis distances +data = cluster_freq.get_values() +data_countries = cluster_freq.index +#threshold, y_pred, MD = utils.get_outliers_Mahal(data, chi2thr=0.999) +threshold, y_pred, MD = utils.get_outliers(data, chi2thr=0.999) +y = np.sqrt(MD) +utils_spatial.print_Moran_outliers(y, w, data_countries) +utils_spatial.plot_Moran_scatterplot(y, w, data_countries)
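After outlier detection, results.py clusters the LDA-transformed frames with K-means (13 clusters) and summarises each cluster by its three most frequent countries (clusters_metadata). A self-contained sketch of that summary step on synthetic data:

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = rng.standard_normal((300, 4))
countries = rng.choice(['Greece', 'UK', 'France', 'Mali'], size=300)

cl_pred = KMeans(n_clusters=5, random_state=50, n_init=10).fit_predict(X)

rows = []
for cl in np.unique(cl_pred):
    labs, counts = np.unique(countries[cl_pred == cl], return_counts=True)
    top = np.argsort(counts)[-3:]
    rows.append(list(zip(labs[top], counts[top])))   # three most frequent countries per cluster
print(pd.DataFrame(rows, index=np.unique(cl_pred)))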
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/smoothiecore.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,328 @@ +"""Smoothie + +A module aimed at melody salience and melody features. It contains + +* a frequency transform inspired by constant-Q transforms +* an NMF-based melodic salience function +* methods that transform this salience into melody features (not implemented) +""" + +import sys +import numpy as np +from scipy.signal import hann +from scipy.sparse import csr_matrix +import librosa + +p = {# smoothie q mapping parameters + 'bpo' : 36, + 'break1' : 500, + 'break2' : 4000, + 'f_max' : 8000, + 'fs' : 16000, + # spectrogram parameters + 'step_size' : 160, + 'use_A_wtg' : False, + 'harm_remove' : False, + # NMF and NMF dictionary parameters + 's' : 0.85, # decay parameter + 'n_harm' : 50, # I'm gonna do you no harm + 'min_midi' : 25, + 'max_midi' : 90.8, + 'bps' : 5, + 'n_iter' : 30 + } + +def update_nmf_simple(H, W, V): + """Update the gain matrix H using the multiplicative NMF update equation. + + Keyword arguments: + H -- gain matrix + W -- template matrix + V -- target data matrix + """ + WHV = np.dot(W, H) ** (-1) * V + H = H * np.dot(W.transpose(), WHV) + return H + +def make_a_weight(f): + """Return the values of A-weighting at the input frequencies f.""" + f = np.array(f) + zaehler = 12200 * 12200* (f**4) + nenner = (f*f + 20.6*20.6) * np.sqrt((f*f + 107.7*107.7) * (f*f + 737.9*737.9)) * (f*f + 12200*12200) + return zaehler / nenner + +def nextpow2(x): + """Return the smallest integer n such that 2 ** n > x.""" + return np.ceil(np.log2(x)) + +def get_smoothie_frequencies(p): + """Calculate filter centres and widths for the smooth Q transform.""" + # I think this calculation should be reconsidered in order to have + # a more principled transition between linear and exponential + n = 1.0 / (2.0**(1.0/p['bpo']) - 1) + x = p['break1'] * 1.0 / n + + # first linear stretch + # f = (np.arange(1, int(np.round(n)) + 1) * x).round().tolist() + f = (np.arange(1, int(np.round(n)) + 1) * x).tolist() + + # exponential stretch + while max(f) < p['break2']: + f.append(max(f) * 2**(1.0/p['bpo'])) + + # final linear stretch + lastdiff = f[-1] - f[-2] + while max(f) < p['f_max']: + f.append(max(f) + lastdiff) + + deltaf = np.diff(np.array(f)) + f = f[:-1] + + return f, deltaf + +def create_smoothie_kernel(f, deltaf, fs): + """Create a sparse matrix that maps the complex DFT to the complex + smoothie representation. + """ + print >>sys.stdout, "[ SMOOTHIE Q kernel calculation ... ]" + n_filter = len(f) + n_fft = 2**nextpow2(np.ceil(fs/min(deltaf))) + + thresh = 0.0054 + smoothie_kernel = np.zeros([n_fft, n_filter], np.complex64) + + for i_filter in range(n_filter-1, -1, -1): # descending + # print i_filter + Q = f[i_filter] * 1.0 / deltaf[i_filter] # local Q for this filter + lgth = int(np.ceil(fs * 1.0 / deltaf[i_filter])) + lgthinv = 1.0 / lgth + offset = int(n_fft/2 - np.ceil(lgth * 0.5)) + temp = hann(lgth) * lgthinv * \ + np.exp(2j * np.pi * Q * (np.arange(lgth) - offset) * lgthinv) + # print(sum(hann(lgth)), Q, lgth, offset) + temp_kernel = np.zeros(n_fft, dtype = np.complex64) + temp_kernel[np.arange(lgth) + offset] = temp + spec_kernel = np.fft.fft(temp_kernel) + spec_kernel[abs(spec_kernel) <= thresh] = 0 + smoothie_kernel[:,i_filter] = spec_kernel + return csr_matrix(smoothie_kernel.conj()/n_fft) + +def smoothie_q_spectrogram(x, p): + """Calculate the actual spectrogram with smooth Q frequencies""" + print >>sys.stdout, "[ SMOOTHIE Q spectrogram ... 
]" + + # precalculate smoothie kernel + f, deltaf = get_smoothie_frequencies(p) + smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs']) + n_fft, n_filter = smoothie_kernel.shape + + # some preparations + n_sample = len(x) + # n_frame = int(np.floor(n_sample / p['step_size'])) + n_frame = int(np.ceil(n_sample / float(p['step_size']))) # added mp + t = (np.arange(n_frame) * p['step_size']) * 1.0 / p['fs'] + smoothie_kernelT = smoothie_kernel.T + + # allocate + s = np.zeros((n_filter, n_frame), dtype = np.complex64) + + # pad (if wanted) + x = np.concatenate((np.zeros(n_fft/2), x, np.zeros(n_fft/2))) + + for i_frame in range(n_frame): + smpl = p['step_size'] * i_frame + block = x[smpl + np.arange(n_fft)] + s[:, i_frame] = smoothie_kernelT.dot(np.fft.fft(block)) + + if p['use_A_wtg']: + a_wtg = make_a_weight(f) + s = s * a_wtg[:, np.newaxis] + + return s, t + +def mel_triangles(input_f): + """Make matrix with mel filters at the given frequencies. + Warning: this is a very coarse mel filterbank. + """ + n_linearfilters = 3 + n_logfilters0 = 30 # just something high, will be pruned later + linear_spacing = 100 + log_spacing = 6.0/4 + n_filter0 = n_linearfilters + n_logfilters0 + + freqs = np.zeros(n_filter0+2) # includes one more on either side, hence +2 + freqs[range(n_linearfilters+1)] = \ + np.arange(-1,n_linearfilters) * linear_spacing + freqs[range(n_linearfilters+1, n_filter0+2)] = \ + freqs[n_linearfilters] * log_spacing ** np.arange(1, n_logfilters0+2) + + centre_freqs = freqs[1:-1] + lower_freqs = freqs[0:-2] + upper_freqs = freqs[2:] + + n_filter = list(np.nonzero(lower_freqs < max(input_f)))[0][-1] + 1 + + n_input_f = len(input_f) + mtr = np.zeros((n_input_f, n_filter)) + + for i_filter in range(n_filter): + for i_freq in range(n_input_f): + if input_f[i_freq] > lower_freqs[i_filter] \ + and input_f[i_freq] <= upper_freqs[i_filter]: + + if input_f[i_freq] <= centre_freqs[i_filter]: + mtr[i_freq, i_filter] = \ + (input_f[i_freq] - lower_freqs[i_filter]) * 1.0 / \ + (centre_freqs[i_filter] - lower_freqs[i_filter]) + else: + mtr[i_freq, i_filter] = \ + 1 - (input_f[i_freq] - centre_freqs[i_filter]) / \ + (upper_freqs[i_filter] - centre_freqs[i_filter]) + return mtr + +def create_smoothie_nfm_dict(smoothie_kernel, filterf, p): + """Create dictionary matrix with note templates.""" + n_note = int((p['max_midi'] - p['min_midi']) * p['bps'] + 1) + n_fft, n_filter = smoothie_kernel.shape + t = ((np.arange(n_fft) + 1) - ((n_fft + 1)*0.5))/p['fs'] + + mtr = mel_triangles(filterf) + n_template = n_note + mtr.shape[1] + + w = np.zeros((n_filter, n_template), dtype = np.complex64) + w[:,n_note:] = mtr + f0s = [] + + smoothie_kernelT = smoothie_kernel.T + + for i_note in range(n_note): + midi = p['min_midi'] + i_note * 1.0 / p['bps'] + f0 = 440 * 2 ** ((midi-69)*1.0/12) + f0s.append(f0) + sig = np.zeros(len(t)) + for i_harm in range(p['n_harm']): + f = f0 * (i_harm + 1) + if f > p['fs'] * 0.5: + continue + x = np.sin(2*np.pi*f*t) * p['s']**(i_harm) + sig += x + w[:, i_note] = smoothie_kernelT.dot(np.fft.fft(sig)) + + for i in range(mtr.shape[1]): + f0s.append(np.nan) + + w = abs(w) + col_sums = w.sum(axis = 0) + w = w / col_sums[np.newaxis, :] # normalisation + return w, np.array(f0s) + +def smoothie_salience(x, p, do_tune = False): + """Calculate melodic salience.""" + print >>sys.stdout, "[ SMOOTHIE Q salience ... 
]" + + # precalculate nmf kernel + f, deltaf = get_smoothie_frequencies(p) + smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs']) + w, f0s = create_smoothie_nfm_dict(smoothie_kernel, f, p) + + # calculate smoothiegram + s, t = smoothie_q_spectrogram(x, p) + s[s==0] = np.spacing(1) # very small number + s = abs(s) + + # run NMF + n_frame = len(t) + print >>sys.stdout, "[ SMOOTHIE Q : NMF updates ... ]" + sal = np.ones((w.shape[1], n_frame)) + for i_iter in range(p['n_iter']): + sal = update_nmf_simple(sal, w, s) + + if do_tune: + error("Tuning isn't yet implemented in the Python version") + + return sal, t, f0s + +def qnormalise(x, q, dim): + nrm = np.sum(x**q, axis=dim, keepdims=True)**(1./float(q)) + nrmmatrix = np.repeat(nrm, x.shape[0], axis=0) + x = x / nrmmatrix + return x + +def wrap_to_octave(sal, param): + epsilon = 0.00001 + nsal = qnormalise(sal, 2, 0) + + step = 1./float(param['bps']) + NMFpitch = np.arange(param['min_midi'],param['max_midi']+step,step) + nNote = len(NMFpitch) + nsal = nsal[0:nNote, :] + + allsal = nsal + allsal[nsal<epsilon] = epsilon + + # chroma mapping + n = param['bps']*12 + mmap = np.tile(np.eye(n),(1,5)) + allchroma = mmap.dot(allsal[0:(n*5),:]) + return allchroma + +def segment_audio(y, sr): + tracklength = y.shape[0]/float(sr) + startSample = 0 + endSample = None + if tracklength > 90: + startPointSec = (tracklength/2.)-20 + startSample = round(startPointSec*sr) + endSample = startSample+45*sr + y = y[startSample:endSample] + return y + +def get_smoothie(filename = 'test.wav', param = None, segment=True, secondframedecomp=False, hopinsec=None): + if param is None: + param = p + param['fs'] = None + param['step_size'] = 128 + + y, sr = librosa.load(filename, sr = param['fs']) + param['fs'] = sr + if hopinsec is not None: + param['step_size'] = int(round(hopinsec*param['fs'])) + if segment: + y = segment_audio(y,sr) + # sg, t = smoothie_q_spectrogram(y, param) + # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param) + sal, t, f0s = smoothie_salience(y, param) + sal = sal[-np.isnan(f0s),:] + chroma = wrap_to_octave(sal, param) + + if secondframedecomp: + chromasr = param['fs']/float(param['step_size']) + win2 = int(round(8*chromasr)) + hop2 = int(round(0.5*chromasr)) + nbins, norigframes = chroma.shape + nframes = int(1+np.floor((norigframes-win2)/float(hop2))) + avechroma = np.empty((nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + avechroma[:,i] = np.mean(chroma[:, (i*hop2):(i*hop2+win2)], axis=1, keepdims=True) + chroma = avechroma + return chroma + +def get_smoothie_for_bihist(filename = 'test.wav', param = None, segment=True, hopinsec=None): + if param is None: + param = p + param['fs'] = None # no downsample + param['step_size'] = 128 + + y, sr = librosa.load(filename, sr = param['fs']) + param['fs'] = sr + if hopinsec is not None: + param['step_size'] = int(round(hopinsec*param['fs'])) + if segment: + y = segment_audio(y,sr) + # sg, t = smoothie_q_spectrogram(y, param) + # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param) + sal, t, f0s = smoothie_salience(y, param) + sal = sal[-np.isnan(f0s),:] + chroma = wrap_to_octave(sal, param) + chromasr = param['fs']/float(param['step_size']) + return (chroma, chromasr) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/util_dataset.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 15 23:04:24 2017 + +@author: mariapanteli +""" + +import numpy as np +from sklearn.model_selection import train_test_split + + +def get_train_val_test_idx(X, Y, seed=None): + """ Split in train, validation, test sets. + + Parameters + ---------- + X : np.array + Data or indices. + Y : np.array + Class labels for data in X. + seed: int + Random seed. + Returns + ------- + (X_train, Y_train) : tuple + Data X and labels y for the train set + (X_val, Y_val) : tuple + Data X and labels y for the validation set + (X_test, Y_test) : tuple + Data X and labels y for the test set + + """ + X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y) + X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test) + return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) + + +def subset_labels(Y, N_min=10, N_max=100, seed=None): + """ Subset dataset to contain minimum N_min and maximum N_max instances + per class. Return indices for this subset. + + Parameters + ---------- + Y : np.array + Class labels + N_min : int + Minimum instances per class + N_max : int + Maximum instances per class + seed: int + Random seed. + + Returns + ------- + subset_idx : np.array + Indices for a subset with classes of size bounded by N_min, N_max + + """ + subset_idx = [] + labels = np.unique(Y) + for label in labels: + label_idx = np.where(Y==label)[0] + counts = len(label_idx) + if counts>=N_max: + subset_idx.append(np.random.choice(label_idx, N_max, replace=False)) + elif counts>=N_min and counts<N_max: + subset_idx.append(label_idx) + else: + # not enough samples for this class, skip + continue + if len(subset_idx)>0: + subset_idx = np.concatenate(subset_idx, axis=0) + return subset_idx
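A hedged usage sketch of the two helpers above with toy labels (country names are illustrative): classes with fewer than N_min recordings are dropped, classes above N_max are subsampled, and the remaining indices are split 60/20/20 with stratification:

import numpy as np
import util_dataset

Y = np.array(['France'] * 30 + ['Ghana'] * 120 + ['Peru'] * 5)  # toy class labels
X = np.arange(len(Y))                                           # indices stand in for data

# cap classes at 100 recordings and drop those with fewer than 10 ('Peru')
idx = util_dataset.subset_labels(Y, N_min=10, N_max=100)

# stratified 60/20/20 split of the retained subset
train, val, test = util_dataset.get_train_val_test_idx(X[idx], Y[idx], seed=0)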
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/utils.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed May 17 11:26:01 2017 + +@author: mariapanteli +""" + +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans +from sklearn import metrics +from collections import Counter +from sklearn.decomposition import PCA +from scipy import stats +from scipy.spatial.distance import mahalanobis +from sklearn.covariance import MinCovDet + + +def get_outliers(X, chi2thr=0.975): + robust_cov = MinCovDet().fit(X) + MD = robust_cov.mahalanobis(X) + chi2 = stats.chi2 + degrees_of_freedom = X.shape[1] + threshold = chi2.ppf(chi2thr, degrees_of_freedom) + y_pred = MD>threshold + return threshold, y_pred, MD + + +def get_outliers_Mahal(X, chi2thr=0.975): + n_samples = X.shape[0] + inv_cov = np.linalg.inv(np.cov(X.T)) + col_mean = np.mean(X, axis=0) + MD = np.zeros(n_samples, dtype='float') + for i in range(n_samples): + MD[i] = mahalanobis(X[i,:], col_mean, inv_cov) + MD = MD ** 2 + degrees_of_freedom = X.shape[1] + chi2 = stats.chi2 + threshold = chi2.ppf(chi2thr, degrees_of_freedom) + y_pred = MD>threshold + return threshold, y_pred, MD + + +def pca_data(X, min_variance=None): + # rotate data to avoid singularity in Mahalanobis/covariance matrix + model = PCA(whiten=True).fit(X) + model.explained_variance_ratio_.sum() + if min_variance is None: + n_pc = X.shape[1] + else: + n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0] + X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X) + return X_pca, n_pc + + +def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False): + uniq_labels = np.unique(Y) + spatial_outliers = [] + for uniq_label in uniq_labels: + countries_neighbors = w_dict[uniq_label] + if len(countries_neighbors)==0: + print uniq_label, " no neighbors found" + continue + inds_neighborhood = [] + for country in countries_neighbors: + inds = np.where(Y==country)[0] + inds_neighborhood.append(inds) # append neighboring countries + if len(np.concatenate(inds_neighborhood))==0: + print "no neighbors found" + continue + inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country + inds_neighborhood = np.concatenate(inds_neighborhood) + if do_pca: + XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99) + else: + XX = X[inds_neighborhood, :] # assume X is already in PCA + print len(inds_neighborhood) + if len(inds_neighborhood)<XX.shape[1]: + print uniq_label, " neighborhood too small for number of features" + continue + threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr) + counts = Counter(Y[inds_neighborhood[y_pred]]) + spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood]) + return spatial_outliers + + +def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'): + ave_silh = [] + for i in range(min_ncl): + ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering + for ncl in range(min_ncl, max_ncl): + print ncl + cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X) + ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg + ave_silh = np.array(ave_silh) + bestncl = np.nanargmax(ave_silh) + return bestncl, ave_silh + + +def get_cluster_freq_linear(X, Y, centroids): + """ for each label in Y get the distribution of clusters by linear encoding + """ + def encode_linear(X, centroids): + """Linear encoding via the dot product + 
""" + return np.dot(X, centroids.T) + encoding = encode_linear(X, centroids) + encoding_df = pd.DataFrame(data=encoding, index=Y) + encoding_df_sum = encoding_df.groupby(encoding_df.index).sum() + cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum) + cluster_freq.index.name = 'labels' + return cluster_freq + + +def get_cluster_predictions(X, n_clusters=10): + cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X) + return cl_pred
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/utils_spatial.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed May 17 11:35:51 2017 + +@author: mariapanteli +""" +import numpy as np +import json +import pysal # before shapely in util_plots +import fiona +import sys +sys.path.append('../misc') +import matplotlib.pyplot as plt + + +def neighbors_from_json_file(data_countries, json_DB='../MergeBL-Smith/data/countries.json'): + neighbors = {} + with open(json_DB) as json_file: + countries_dict = json.load(json_file) + country_names = [] + country_iso = [] + country_borders_iso = [] + for country_info in countries_dict: + country_names.append(country_info['name']['common']) + country_iso.append(country_info['cca3']) + country_borders_iso.append(country_info['borders']) + # temporary fixes of country names to match json data + country_names[country_names.index('United States')] = 'United States of America' + country_names[country_names.index('Tanzania')] = 'United Republic of Tanzania' + country_names[country_names.index('DR Congo')] = 'Democratic Republic of the Congo' + country_names[country_names.index('Czechia')] = 'Czech Republic' + for i, country in enumerate(data_countries): + neighbors[i] = {} + if country in country_names: + if len(country_borders_iso[country_names.index(country)])>0: + # if country has neighbors according to json file + neighbors_iso = country_borders_iso[country_names.index(country)] + neighbors_names = [country_names[country_iso.index(nn)] for nn in neighbors_iso] + for neighbor in neighbors_names: + if neighbor in data_countries: + neighbor_idx = np.where(data_countries==neighbor)[0][0] + neighbors[i][neighbor_idx] = 1.0 + w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) + return w + + +def get_countries_from_shapefile(shapefile): + shp = fiona.open(shapefile, 'r') + countries = [] + if shp[0]["properties"].has_key("ADMIN"): + country_keyword = "ADMIN" + elif shp[0]["properties"].has_key("NAME"): + country_keyword = "NAME" + else: + country_keyword = "admin" + for line in shp: + countries.append(line["properties"][country_keyword]) + shp.close() + return countries + + +def replace_empty_neighbours_with_KNN(data_countries, w): + shapefile = "../MergeBL-Smith/shapefiles/ne_10m_admin_0_countries.shp" + no_neighbors_idx = w.islands + knn = 10 + wknn = pysal.knnW_from_shapefile(shapefile, knn) + knn_countries = get_countries_from_shapefile(shapefile) + neighbors = w.neighbors + for nn_idx in no_neighbors_idx: + country = data_countries[nn_idx] + print country + if country not in knn_countries: + continue + knn_country_idx = knn_countries.index(country) + knn_country_neighbors = [knn_countries[nn] for nn in wknn.neighbors[knn_country_idx]] + for knn_nn in knn_country_neighbors: + if len(neighbors[nn_idx])>2: + continue + data_country_idx = np.where(data_countries==knn_nn)[0] + if len(data_country_idx)>0: + neighbors[nn_idx][data_country_idx[0]] = 1.0 + w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) + return w + + +def get_neighbors_for_countries_in_dataset(Y): + # neighbors + data_countries = np.unique(Y) + w = neighbors_from_json_file(data_countries) + w = replace_empty_neighbours_with_KNN(data_countries, w) + return w, data_countries + + +def from_weights_to_dict(w, data_countries): + w_dict = {} + for i in w.neighbors: + w_dict[data_countries[i]] = [data_countries[nn] for nn in w.neighbors[i]] + return w_dict + + +def get_LH_HL_idx(lm, p_vals): + sig_idx = 
np.where(p_vals<0.05)[0] + LH_idx = sig_idx[np.where(lm.q[sig_idx]==2)[0]] + HL_idx = sig_idx[np.where(lm.q[sig_idx]==4)[0]] + return LH_idx, HL_idx + + +def print_Moran_outliers(y, w, data_countries): + lm = pysal.Moran_Local(y, w) + p_vals = lm.p_z_sim + LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) + print 'LH', zip(data_countries[LH_idx], p_vals[LH_idx]) # LH + print 'HL', zip(data_countries[HL_idx], p_vals[HL_idx]) # HL + + +def plot_Moran_scatterplot(y, w, data_countries, out_file=None): + lm = pysal.Moran_Local(y, w) + p_vals = lm.p_z_sim + LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) + + ylt = pysal.lag_spatial(lm.w, lm.y) + yt = lm.y + yt = (yt - np.mean(yt))/np.std(yt) + ylt = (ylt - np.mean(ylt))/np.std(ylt) + colors = plt.cm.spectral(np.linspace(0,1,5)) + quad = np.zeros(yt.shape, dtype=int) + quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH + quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH + quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL + quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL + marker_color = colors[quad] + marker_size = 40*np.ones(yt.shape, dtype=int) + marker_size[LH_idx] = 140 + marker_size[HL_idx] = 140 + + plt.figure() + plt.scatter(yt, ylt, c=marker_color, s=marker_size, alpha=0.7) + plt.xlabel('Value') + plt.ylabel('Spatially Lagged Value') + plt.axvline(c='black', ls='--') + plt.axhline(c='black', ls='--') + plt.ylim(min(ylt)-0.5, max(ylt)+0.5) + plt.xlim(min(yt)-0.5, max(yt)+1.5) + for i in np.concatenate((LH_idx, HL_idx)): + plt.annotate(data_countries[i], (yt[i], ylt[i]), xytext=(yt[i]*1.1, ylt[i]*1.1),textcoords='data',arrowprops=dict(arrowstyle="->",connectionstyle="arc3")) + extreme_points = np.concatenate(([np.argmin(ylt)], [np.argmax(ylt)], np.where(yt>np.mean(yt)+2.8*np.std(yt))[0], np.where(ylt>np.mean(yt)+2.8*np.std(ylt))[0])) + extreme_points = np.array(list(set(extreme_points) - set(np.concatenate((LH_idx, HL_idx))))) + for i in extreme_points: + plt.annotate(data_countries[i], (yt[i]+0.1, ylt[i])) + if out_file is not None: + plt.savefig(out_file)
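A hedged end-to-end sketch of the local Moran step defined above. In the pipeline the weights come from get_neighbors_for_countries_in_dataset (country borders read from countries.json, with a KNN fallback for islands); here a toy 3x3 lattice stands in for them so the example does not depend on those files:

import numpy as np
import pysal
import utils_spatial

# toy 3x3 lattice of "countries" with rook-contiguity weights
w = pysal.lat2W(3, 3)
data_countries = np.array(['c%d' % i for i in range(9)])
y = np.array([1.0, 1.1, 0.9, 1.2, 5.0, 1.0, 0.8, 1.1, 1.0])  # e.g. sqrt Mahalanobis distances; c4 is a spike

# LH/HL countries significant at p<0.05 (if any) are reported as spatial outliers
utils_spatial.print_Moran_outliers(y, w, data_countries)
utils_spatial.plot_Moran_scatterplot(y, w, data_countries, out_file='moran_toy.png')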
--- a/util_dataset.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 15 23:04:24 2017 - -@author: mariapanteli -""" - -import numpy as np -from sklearn.model_selection import train_test_split - - -def get_train_val_test_idx(X, Y, seed=None): - """ Split in train, validation, test sets. - - Parameters - ---------- - X : np.array - Data or indices. - Y : np.array - Class labels for data in X. - seed: int - Random seed. - Returns - ------- - (X_train, Y_train) : tuple - Data X and labels y for the train set - (X_val, Y_val) : tuple - Data X and labels y for the validation set - (X_test, Y_test) : tuple - Data X and labels y for the test set - - """ - X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y) - X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test) - return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) - - -def subset_labels(Y, N_min=10, N_max=100, seed=None): - """ Subset dataset to contain minimum N_min and maximum N_max instances - per class. Return indices for this subset. - - Parameters - ---------- - Y : np.array - Class labels - N_min : int - Minimum instances per class - N_max : int - Maximum instances per class - seed: int - Random seed. - - Returns - ------- - subset_idx : np.array - Indices for a subset with classes of size bounded by N_min, N_max - - """ - subset_idx = [] - labels = np.unique(Y) - for label in labels: - label_idx = np.where(Y==label)[0] - counts = len(label_idx) - if counts>=N_max: - subset_idx.append(np.random.choice(label_idx, N_max, replace=False)) - elif counts>=N_min and counts<N_max: - subset_idx.append(label_idx) - else: - # not enough samples for this class, skip - continue - if len(subset_idx)>0: - subset_idx = np.concatenate(subset_idx, axis=0) - return subset_idx
--- a/util_filter_dataset.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Mon Apr 10 17:44:48 2017 - -@author: mariapanteli -""" - -import os -import numpy as np -import pandas as pd - - -def get_speech_vamp(df): - jspeech = df.columns.get_loc("Speech") - nfiles = len(df) - speechinds = [] - for i in range(nfiles): - #print i - if os.path.exists(df.iat[i, jspeech]) and os.path.getsize(df.iat[i, jspeech])>0: - bounds = pd.read_csv(df.iat[i, jspeech], header=None, delimiter='\t').get_values() - if len(bounds)>0: - if len(np.where(bounds[:,2]=='m')[0])==0 or len(np.where(bounds[:,2]=='s')[0])==len(bounds): - speechinds.append(i) - return speechinds - - -def get_speech_meta(df): - genres = np.array(df["Genre_Album"].get_values(), dtype=str) - speechinds_genre = [] - invalid_genres = ["Spoken Word", "Language Instruction", "Classical", - "Poetry", "Nature|Sounds", "Music Instruction", - "Soundtracks &", "Contemporary &", "Jazz &", - "Sounds", "Ragtime", "Nature", "Electronic", - "African American Spoken", "Blues", "Gospel", - "Psychology &"] - for i in range(len(genres)): - genre = genres[i] - #if genre in invalid_genres: - if any(x in genre for x in invalid_genres): - speechinds_genre.append(i) - return speechinds_genre - - -def get_missing_csv(df): - nfiles = len(df) - missing_csv = [] - for i in range(nfiles): - if not (os.path.exists(df["Melspec"].iloc[i]) and os.path.exists(df["Chroma"].iloc[i]) and os.path.exists(df["Melodia"].iloc[i])): - missing_csv.append(i) - return missing_csv - - -def get_missing_country_meta(df): - nfiles = len(df) - missing_country = [] - country_labels = np.array(df['Country'].get_values(), dtype=str) - invalid_countries = ['Unidentified', 'unknown', 'nan', - 'Yugoslavia (former)', 'Pathian village Wangulei ', - 'Joulouloum either Senegal or The Gambia '] - for i in range(nfiles): - country = country_labels[i] - if country in invalid_countries: - missing_country.append(i) - return missing_country - - -def remove_missing_data(df): - speechinds_vamp = get_speech_vamp(df) - speechinds_genre = get_speech_meta(df) - speechinds = set(speechinds_vamp) | set(speechinds_genre) - missing = set(get_missing_csv(df)) - missing_country = set(get_missing_country_meta(df)) - selectinds = np.asarray(list(set(range(len(df))) - (missing | speechinds | missing_country))) - - df = df.iloc[selectinds, :] - return df \ No newline at end of file
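For completeness, a hedged usage sketch of the filtering helper removed here ('metadata.csv' is a placeholder; the frame is expected to carry the Speech, Genre_Album, Melspec, Chroma, Melodia and Country columns used by the functions above):

import pandas as pd
import util_filter_dataset

df = pd.read_csv('metadata.csv')
# drops speech recordings (flagged by Vamp segmentation or album genre),
# rows with missing feature files, and unidentified country labels
df = util_filter_dataset.remove_missing_data(df)
print len(df)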
--- a/utils.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 17 11:26:01 2017 - -@author: mariapanteli -""" - -import numpy as np -import pandas as pd -from sklearn.cluster import KMeans -from sklearn import metrics -from collections import Counter -from sklearn.decomposition import PCA -from scipy import stats -from scipy.spatial.distance import mahalanobis -from sklearn.covariance import MinCovDet - - -def get_outliers(X, chi2thr=0.975): - robust_cov = MinCovDet().fit(X) - MD = robust_cov.mahalanobis(X) - chi2 = stats.chi2 - degrees_of_freedom = X.shape[1] - threshold = chi2.ppf(chi2thr, degrees_of_freedom) - y_pred = MD>threshold - return threshold, y_pred, MD - - -def get_outliers_Mahal(X, chi2thr=0.975): - n_samples = X.shape[0] - inv_cov = np.linalg.inv(np.cov(X.T)) - col_mean = np.mean(X, axis=0) - MD = np.zeros(n_samples, dtype='float') - for i in range(n_samples): - MD[i] = mahalanobis(X[i,:], col_mean, inv_cov) - MD = MD ** 2 - degrees_of_freedom = X.shape[1] - chi2 = stats.chi2 - threshold = chi2.ppf(chi2thr, degrees_of_freedom) - y_pred = MD>threshold - return threshold, y_pred, MD - - -def pca_data(X, min_variance=None): - # rotate data to avoid singularity in Mahalanobis/covariance matrix - model = PCA(whiten=True).fit(X) - model.explained_variance_ratio_.sum() - if min_variance is None: - n_pc = X.shape[1] - else: - n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0] - X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X) - return X_pca, n_pc - - -def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False): - uniq_labels = np.unique(Y) - spatial_outliers = [] - for uniq_label in uniq_labels: - countries_neighbors = w_dict[uniq_label] - if len(countries_neighbors)==0: - print uniq_label, " no neighbors found" - continue - inds_neighborhood = [] - for country in countries_neighbors: - inds = np.where(Y==country)[0] - inds_neighborhood.append(inds) # append neighboring countries - if len(np.concatenate(inds_neighborhood))==0: - print "no neighbors found" - continue - inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country - inds_neighborhood = np.concatenate(inds_neighborhood) - if do_pca: - XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99) - else: - XX = X[inds_neighborhood, :] # assume X is already in PCA - print len(inds_neighborhood) - if len(inds_neighborhood)<XX.shape[1]: - print uniq_label, " neighborhood too small for number of features" - continue - threshold, y_pred, MD = get_outliers_Mahal(XX, chi2thr=chi2thr) - counts = Counter(Y[inds_neighborhood[y_pred]]) - spatial_outliers.append([uniq_label, counts[uniq_label], threshold, y_pred, MD, counts, inds_neighborhood]) - return spatial_outliers - - -def best_n_clusters_silhouette(X, min_ncl=2, max_ncl=50, metric='euclidean'): - ave_silh = [] - for i in range(min_ncl): - ave_silh.append(np.nan) # for ncl=0, ncl=1 no clustering - for ncl in range(min_ncl, max_ncl): - print ncl - cl_pred = KMeans(n_clusters=ncl, random_state=50).fit_predict(X) - ave_silh.append(metrics.silhouette_score(X, cl_pred, metric=metric)) # silhouette avg - ave_silh = np.array(ave_silh) - bestncl = np.nanargmax(ave_silh) - return bestncl, ave_silh - - -def get_cluster_freq_linear(X, Y, centroids): - """ for each label in Y get the distribution of clusters by linear encoding - """ - def encode_linear(X, centroids): - """Linear encoding via the dot product - """ - 
return np.dot(X, centroids.T) - encoding = encode_linear(X, centroids) - encoding_df = pd.DataFrame(data=encoding, index=Y) - encoding_df_sum = encoding_df.groupby(encoding_df.index).sum() - cluster_freq = (encoding_df_sum - np.mean(encoding_df_sum)) / np.std(encoding_df_sum) - cluster_freq.index.name = 'labels' - return cluster_freq - - -def get_cluster_predictions(X, n_clusters=10): - cl_pred = KMeans(n_clusters=n_clusters, random_state=50).fit_predict(X) - return cl_pred
--- a/utils_spatial.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 17 11:35:51 2017 - -@author: mariapanteli -""" -import numpy as np -import json -import pysal # before shapely in util_plots -import fiona -import sys -sys.path.append('../misc') -import matplotlib.pyplot as plt - - -def neighbors_from_json_file(data_countries, json_DB='../MergeBL-Smith/data/countries.json'): - neighbors = {} - with open(json_DB) as json_file: - countries_dict = json.load(json_file) - country_names = [] - country_iso = [] - country_borders_iso = [] - for country_info in countries_dict: - country_names.append(country_info['name']['common']) - country_iso.append(country_info['cca3']) - country_borders_iso.append(country_info['borders']) - # temporary fixes of country names to match json data - country_names[country_names.index('United States')] = 'United States of America' - country_names[country_names.index('Tanzania')] = 'United Republic of Tanzania' - country_names[country_names.index('DR Congo')] = 'Democratic Republic of the Congo' - country_names[country_names.index('Czechia')] = 'Czech Republic' - for i, country in enumerate(data_countries): - neighbors[i] = {} - if country in country_names: - if len(country_borders_iso[country_names.index(country)])>0: - # if country has neighbors according to json file - neighbors_iso = country_borders_iso[country_names.index(country)] - neighbors_names = [country_names[country_iso.index(nn)] for nn in neighbors_iso] - for neighbor in neighbors_names: - if neighbor in data_countries: - neighbor_idx = np.where(data_countries==neighbor)[0][0] - neighbors[i][neighbor_idx] = 1.0 - w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) - return w - - -def get_countries_from_shapefile(shapefile): - shp = fiona.open(shapefile, 'r') - countries = [] - if shp[0]["properties"].has_key("ADMIN"): - country_keyword = "ADMIN" - elif shp[0]["properties"].has_key("NAME"): - country_keyword = "NAME" - else: - country_keyword = "admin" - for line in shp: - countries.append(line["properties"][country_keyword]) - shp.close() - return countries - - -def replace_empty_neighbours_with_KNN(data_countries, w): - shapefile = "../MergeBL-Smith/shapefiles/ne_10m_admin_0_countries.shp" - no_neighbors_idx = w.islands - knn = 10 - wknn = pysal.knnW_from_shapefile(shapefile, knn) - knn_countries = get_countries_from_shapefile(shapefile) - neighbors = w.neighbors - for nn_idx in no_neighbors_idx: - country = data_countries[nn_idx] - print country - if country not in knn_countries: - continue - knn_country_idx = knn_countries.index(country) - knn_country_neighbors = [knn_countries[nn] for nn in wknn.neighbors[knn_country_idx]] - for knn_nn in knn_country_neighbors: - if len(neighbors[nn_idx])>2: - continue - data_country_idx = np.where(data_countries==knn_nn)[0] - if len(data_country_idx)>0: - neighbors[nn_idx][data_country_idx[0]] = 1.0 - w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) - return w - - -def get_neighbors_for_countries_in_dataset(Y): - # neighbors - data_countries = np.unique(Y) - w = neighbors_from_json_file(data_countries) - w = replace_empty_neighbours_with_KNN(data_countries, w) - return w, data_countries - - -def from_weights_to_dict(w, data_countries): - w_dict = {} - for i in w.neighbors: - w_dict[data_countries[i]] = [data_countries[nn] for nn in w.neighbors[i]] - return w_dict - - -def get_LH_HL_idx(lm, p_vals): - sig_idx = 
np.where(p_vals<0.05)[0] - LH_idx = sig_idx[np.where(lm.q[sig_idx]==2)[0]] - HL_idx = sig_idx[np.where(lm.q[sig_idx]==4)[0]] - return LH_idx, HL_idx - - -def print_Moran_outliers(y, w, data_countries): - lm = pysal.Moran_Local(y, w) - p_vals = lm.p_z_sim - LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) - print 'LH', zip(data_countries[LH_idx], p_vals[LH_idx]) # LH - print 'HL', zip(data_countries[HL_idx], p_vals[HL_idx]) # HL - - -def plot_Moran_scatterplot(y, w, data_countries, out_file=None): - lm = pysal.Moran_Local(y, w) - p_vals = lm.p_z_sim - LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) - - ylt = pysal.lag_spatial(lm.w, lm.y) - yt = lm.y - yt = (yt - np.mean(yt))/np.std(yt) - ylt = (ylt - np.mean(ylt))/np.std(ylt) - colors = plt.cm.spectral(np.linspace(0,1,5)) - quad = np.zeros(yt.shape, dtype=int) - quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH - quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH - quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL - quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL - marker_color = colors[quad] - marker_size = 40*np.ones(yt.shape, dtype=int) - marker_size[LH_idx] = 140 - marker_size[HL_idx] = 140 - - plt.figure() - plt.scatter(yt, ylt, c=marker_color, s=marker_size, alpha=0.7) - plt.xlabel('Value') - plt.ylabel('Spatially Lagged Value') - plt.axvline(c='black', ls='--') - plt.axhline(c='black', ls='--') - plt.ylim(min(ylt)-0.5, max(ylt)+0.5) - plt.xlim(min(yt)-0.5, max(yt)+1.5) - for i in np.concatenate((LH_idx, HL_idx)): - plt.annotate(data_countries[i], (yt[i], ylt[i]), xytext=(yt[i]*1.1, ylt[i]*1.1),textcoords='data',arrowprops=dict(arrowstyle="->",connectionstyle="arc3")) - extreme_points = np.concatenate(([np.argmin(ylt)], [np.argmax(ylt)], np.where(yt>np.mean(yt)+2.8*np.std(yt))[0], np.where(ylt>np.mean(yt)+2.8*np.std(ylt))[0])) - extreme_points = np.array(list(set(extreme_points) - set(np.concatenate((LH_idx, HL_idx))))) - for i in extreme_points: - plt.annotate(data_countries[i], (yt[i]+0.1, ylt[i])) - if out_file is not None: - plt.savefig(out_file)