# HG changeset patch
# User Maria Panteli
# Date 1505127110 -3600
# Node ID e50c63cf96bea4e0dfb0dac84813d5003a331ce8
# Parent 230a0cf17de0068461bd957822a66287ed8a5ac5
rearranging folders

diff -r 230a0cf17de0 -r e50c63cf96be load_dataset.py
--- a/load_dataset.py Mon Sep 11 11:32:45 2017 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 15 22:52:57 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-import pandas as pd
-import pickle
-
-import load_features
-import util_dataset
-import util_filter_dataset
-
-
-#METADATA_FILE = 'sample_dataset/metadata.csv'
-#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
-WIN_SIZE = 2
-METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']
-OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle',
-                '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle',
-                '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
-
-def extract_features(df, win2sec=8.0):
-    """Extract features from melspec and chroma.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Metadata including class label and path to audio, melspec, chroma
-    win2sec : float
-        The window size for the second frame decomposition of the features
-
-    Returns
-    -------
-    X : np.array
-        The features for every frame x every audio file in the dataset
-    Y : np.array
-        The class labels for every frame in the dataset
-    Y_audio : np.array
-        The audio labels
-    """
-    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
-    frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df)
-    print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape
-    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
-    Y = Y_df.get_values()
-    Y_audio = Y_audio_df.get_values()
-    return X, Y, Y_audio
-
-
-if __name__ == '__main__':
-    # load dataset
-    df = pd.read_csv(METADATA_FILE)
-    df = util_filter_dataset.remove_missing_data(df)
-    subset_idx = util_dataset.subset_labels(df['Country'].get_values())
-    df = df.iloc[subset_idx, :]
-    X, Y = np.arange(len(df)), df['Country'].get_values()
-
-    # split in train, val, test set
-    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y)
-
-    # extract features and write output
-    X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[0], 'wb') as f:
-        pickle.dump([X_train, Y_train, Y_audio_train], f)
-
-    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[1], 'wb') as f:
-        pickle.dump([X_val, Y_val, Y_audio_val], f)
-
-    X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[2], 'wb') as f:
-        pickle.dump([X_test, Y_test, Y_audio_test], f)
-
-#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
-# pickle.dump([X_test, Y_test, Y_audio_test], f)
-#with open(out_file, 'wb') as f:
diff -r 230a0cf17de0 -r e50c63cf96be load_features.py
--- a/load_features.py Mon Sep 11 11:32:45 2017 +0100
+++
/dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,287 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Mar 16 01:50:57 2017 - -@author: mariapanteli -""" - -import numpy as np -import pandas as pd -import os -from sklearn.decomposition import NMF -import OPMellin as opm -import MFCC as mfc -import PitchBihist as pbi - - -class FeatureLoader: - def __init__(self, win2sec=8): - self.win2sec = float(win2sec) - self.sr = 44100. - self.win1 = int(round(0.04*self.sr)) - self.hop1 = int(round(self.win1/8.)) - self.framessr = self.sr/float(self.hop1) - self.win2 = int(round(self.win2sec*self.framessr)) - self.hop2 = int(round(0.5*self.framessr)) - self.framessr2 = self.framessr/float(self.hop2) - - - def get_op_mfcc_for_file(self, melspec_file=None, scale=True): - op = [] - mfc = [] - if not os.path.exists(melspec_file): - return op, mfc - print 'extracting onset patterns and mfccs...' - songframes = pd.read_csv(melspec_file, engine="c", header=None) - songframes.iloc[np.where(np.isnan(songframes))] = 0 - songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes - melspec = songframes.get_values().T - op = self.get_op_from_melspec(melspec, K=2) - mfc = self.get_mfcc_from_melspec(melspec) - if scale: - # scale all frames by mean and std of recording - op = (op - np.nanmean(op)) / np.nanstd(op) - mfc = (mfc - np.nanmean(mfc)) / np.nanstd(mfc) - return op, mfc - - - def get_chroma_for_file(self, chroma_file=None, scale=True): - ch = [] - if not os.path.exists(chroma_file): - return ch - print 'extracting chroma...' - songframes = pd.read_csv(chroma_file, engine="c", header=None) - songframes.iloc[np.where(np.isnan(songframes))] = 0 - songframes = songframes.iloc[0:min(len(songframes), 18000), :] # only first 1.5 minutes - chroma = songframes.get_values().T - ch = self.get_ave_chroma(chroma) - if scale: - # scale all frames by mean and std of recording - ch = (ch - np.nanmean(ch)) / np.nanstd(ch) - return ch - - - def get_music_idx_from_bounds(self, bounds, sr=None): - music_idx = [] - if len(bounds) == 0: - # bounds is empty list - return music_idx - nbounds = bounds.shape[0] - if len(np.where(bounds[:,2]=='m')[0])==0: - # no music segments - return music_idx - elif len(np.where(bounds[:,2]=='s')[0])==nbounds: - # all segments are speech - return music_idx - else: - half_win_hop = int(round(0.5 * self.win2 / float(self.hop2))) - music_bounds = np.where(bounds[:, 2] == 'm')[0] - bounds_in_frames = np.round(np.array(bounds[:, 0], dtype=float) * sr) - duration_in_frames = np.round(np.array(bounds[:, 1], dtype=float) * sr) - for music_bound in music_bounds: - lower_bound = np.max([0, bounds_in_frames[music_bound] - half_win_hop]) - upper_bound = lower_bound + duration_in_frames[music_bound] - half_win_hop - music_idx.append(np.arange(lower_bound, upper_bound, dtype=int)) - if len(music_idx)>0: - music_idx = np.sort(np.concatenate(music_idx)) # it should be sorted, but just in case segments overlap -- remove duplicates if segments overlap - return music_idx - - - def get_music_idx_for_file(self, segmenter_file=None): - music_idx = [] - if os.path.exists(segmenter_file) and os.path.getsize(segmenter_file)>0: - print 'loading speech/music segments...' 
- bounds = pd.read_csv(segmenter_file, header=None, delimiter='\t').get_values() - if bounds.shape[1] == 1: # depends on the computer platform - bounds = pd.read_csv(segmenter_file, header=None, delimiter=',').get_values() - music_idx = self.get_music_idx_from_bounds(bounds, sr=self.framessr2) - return music_idx - - - def get_features(self, df, class_label='Country'): - oplist = [] - mflist = [] - chlist = [] - pblist = [] - clabels = [] - aulabels = [] - n_files = len(df) - for i in range(n_files): - if not (os.path.exists(df['Melspec'].iloc[i]) and os.path.exists(df['Chroma'].iloc[i]) and os.path.exists(df['Melodia'].iloc[i])): - continue - print 'file ' + str(i) + ' of ' + str(n_files) - music_idx = self.get_music_idx_for_file(df['Speech'].iloc[i]) - if len(music_idx)==0: - # no music segments -> skip this file - continue - try: - op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i]) - ch = self.get_chroma_for_file(df['Chroma'].iloc[i]) - #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i]) - pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i]) - #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i]) - except: - continue - min_n_frames = np.min([len(op), len(mfcc), len(ch), len(pb)]) # ideally, features should have the same number of frames - if min_n_frames==0: - # no features extracted -> skip this file - continue - music_idx = music_idx[music_idx 90: + startPointSec = (tracklength/2.)-20 + startSample = round(startPointSec*self.sr) + endSample = startSample+45*self.sr + self.y = self.y[startSample:endSample] + + def mel_spectrogram(self, y=None, sr=None): + if self.y is None: + self.y = y + if self.sr is None: + self.sr = sr + win1 = int(round(0.04*self.sr)) + hop1 = int(round(win1/8.)) + nfft1 = int(2**numpy.ceil(numpy.log2(win1))) + nmels = 40 + D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming))**2 + #melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels) + melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels, fmax=8000) + melsr = self.sr/float(hop1) + self.melspec = melspec + self.melsr = melsr + + def calc_mfccs(self, y=None, sr=None): + if self.y is None: + self.y = y + if self.sr is None: + self.sr = sr + # require log-amplitude + self.mfccs = librosa.feature.mfcc(S=librosa.logamplitude(self.melspec), n_mfcc=21) + # remove DCT component + self.mfccs = self.mfccs[1:,:] + + def get_mfccs(self, filename='test.wav', secondframedecomp=False): + self.load_audiofile(filename=filename) + self.mel_spectrogram() + self.calc_mfccs() + + if secondframedecomp: + win2 = int(round(8*self.melsr)) + hop2 = int(round(0.5*self.melsr)) + nbins, norigframes = self.melspec.shape + nframes = int(1+numpy.floor((norigframes-win2)/float(hop2))) + avemfccs = numpy.empty((nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + avemfccs[:,i] = numpy.mean(self.mfccs[:, (i*hop2):(i*hop2+win2)], axis=1, keepdims=True) + self.mfccs = avemfccs + return self.mfccs + + def get_mfccs_from_melspec(self, melspec=[], melsr=[]): + self.melspec = melspec + self.melsr = melsr + self.calc_mfccs() + return self.mfccs + +if __name__ == '__main__': + mfs = MFCCs() + mfs.get_mfccs() \ No newline at end of file diff -r 230a0cf17de0 -r e50c63cf96be scripts/OPMellin.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/OPMellin.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Aug 17 13:37:00 2015 + +@author: mariapanteli +""" + 
+import librosa +import scipy.signal +# import matplotlib.pyplot as plt +import numpy + +class OPMellin: + def __init__(self, win2sec=8): + self.y = None + self.sr = None + self.melspec = None + self.melsr = None + self.op = None + self.opmellin = None + self.win2sec = win2sec + + def load_audiofile(self, filename='test.wav', sr=None, segment=True): + self.y, self.sr = librosa.load(filename, sr=sr) + #self.y, self.sr = librosa.load(filename) # default sr=22050 + if segment: + tracklength = self.y.shape[0]/float(self.sr) + startSample = 0 + endSample = None + if tracklength > 90: + startPointSec = (tracklength/2.)-20 + startSample = round(startPointSec*self.sr) + endSample = startSample+45*self.sr + self.y = self.y[startSample:endSample] + + + # def mel_spectrogram(self, y=None, sr=None, filename='1_21.wav'): + def mel_spectrogram(self, y=None, sr=None): + if self.y is None: + self.y = y + if self.sr is None: + self.sr = sr + win1 = int(round(0.04*self.sr)) + hop1 = int(round(win1/8.)) + nfft1 = int(2**numpy.ceil(numpy.log2(win1))) + nmels = 40 + # y, sr = librosa.load(filename, sr=fs) + # melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=nfft, win_length=win1, window=scipy.signal.hamming, hop_length=hop1, n_mels=nmels) + # melspec = librosa.feature.melspectrogram(y=self.y, sr=self.sr, n_fft=nfft, hop_length=hop1, n_mels=nmels) + D = numpy.abs(librosa.stft(self.y, n_fft=nfft1, hop_length=hop1, win_length=win1, window=scipy.signal.hamming))**2 + #melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels) + melspec = librosa.feature.melspectrogram(S=D, sr=self.sr, n_mels=nmels, fmax=8000) + melsr = self.sr/float(hop1) + # librosa.display.specshow(melspec) + # return melspec, melsr + self.melspec = melspec + self.melsr = melsr + + # post process spectrogram + def post_process_spec(self, melspec=None, log=True, medianfilt=False, sqrt=False, smooth=False, diff=True, subtractmean=True, halfwave=True, maxnormal=True): + if self.melspec is None: + self.melspec = melspec + if log: + self.melspec = librosa.logamplitude(self.melspec) + if medianfilt: + ks = int(0.1 * self.melsr) # 100ms kernel size + if ks % 2 == 0: # ks must be odd + ks += 1 + nmels = self.melspec.shape[0] + for i in range(nmels): + self.melspec[i,:] = scipy.signal.medfilt(self.melspec[i,:],kernel_size = ks) + if sqrt: + self.melspec = self.melspec**.5 + if smooth: + step = 50.0 + nmels = self.melspec.shape[0] + mm = self.melspec + for i in range(nmels): + mm[i,:] = numpy.convolve(mm[i,:], numpy.ones(step)/step, mode='same') + self.melspec = mm + if diff: + self.melspec = numpy.concatenate((self.melspec,self.melspec[:,-1,None]),axis=1) # append one frame before diff to keep number of frames the same + self.melspec = numpy.diff(self.melspec, n=1, axis=1) + if subtractmean: + mean = self.melspec.mean(axis=1) + mean.shape = (mean.shape[0], 1) + self.melspec = self.melspec - mean + if halfwave: + self.melspec[numpy.where(self.melspec < 0)] = 0 + if maxnormal: + if self.melspec.max() != 0: # avoid division by 0 + self.melspec = self.melspec/self.melspec.max() + # return melspec + + + def onset_patterns(self, melspec=None, melsr=None, fft=True, logfilter=False, center=True, bpmrange=True): + if self.melspec is None: + self.melspec = melspec + if self.melsr is None: + self.melsr = melsr + win2 = int(round(self.win2sec*self.melsr)) + hop2 = int(round(0.5*self.melsr)) + nmels, nmelframes = self.melspec.shape + if fft: # periodicity via fft + #nfft2 = int(2**numpy.ceil(numpy.log2(win2))) + nfft2 = 2048 # nfft2 does not 
depend on win2?? + melspectemp = self.melspec + if ((nfft2 > win2) and (center is False)): + # pad the signal by nfft2-win2 so that frame decomposition (n_frames) + # is still based on win2 and not nfft2 + melspectemp = numpy.concatenate([numpy.zeros((nmels,int((nfft2 - win2) // 2))),self.melspec, numpy.zeros((nmels,int((nfft2 - win2) // 2)))],axis=1) + if melspectemp.shape[1]0] + melody = freqs + n_frames = len(melody) + melody_cents = hz_to_cents(melody, n_cents=n_bins) + melody_octave = wrap_to_octave(melody_cents, octave_length=n_bins) + melody_matrix = numpy.zeros((n_bins, n_frames)) + for time, pitch in enumerate(melody_octave): + if not numpy.isnan(pitch): + melody_matrix[int(pitch), time] = 1 + if secondframedecomp: + win2 = int(round(self.win2sec*self.chromasr)) + hop2 = int(round(0.5*self.chromasr)) + nbins, norigframes = melody_matrix.shape + if norigframes 0] = 0 + + # multiply with original + B = numpy.dot(tra.T, spec) + + # normalize + mxB = numpy.max(B) + mnB = numpy.min(B) + if mxB != mnB: + B = (B - mnB)/float(mxB-mnB) + + # circshift to highest? + if align: + ref = numpy.argmax(numpy.sum(spec, axis=0)) + B = numpy.roll(B, -ref, axis=0) + B = numpy.roll(B, -ref, axis=1) + return B + + def bihist_from_precomp_chroma(self, align=False): + win2 = int(round(self.win2sec*self.chromasr)) + hop2 = int(round(0.5*self.chromasr)) + nbins, norigframes = self.chroma.shape + if norigframes0: + music_idx = np.sort(np.concatenate(music_idx)) # it should be sorted, but just in case segments overlap -- remove duplicates if segments overlap + return music_idx + + + def get_music_idx_for_file(self, segmenter_file=None): + music_idx = [] + if os.path.exists(segmenter_file) and os.path.getsize(segmenter_file)>0: + print 'loading speech/music segments...' 
+ bounds = pd.read_csv(segmenter_file, header=None, delimiter='\t').get_values() + if bounds.shape[1] == 1: # depends on the computer platform + bounds = pd.read_csv(segmenter_file, header=None, delimiter=',').get_values() + music_idx = self.get_music_idx_from_bounds(bounds, sr=self.framessr2) + return music_idx + + + def get_features(self, df, class_label='Country'): + oplist = [] + mflist = [] + chlist = [] + pblist = [] + clabels = [] + aulabels = [] + n_files = len(df) + for i in range(n_files): + if not (os.path.exists(df['Melspec'].iloc[i]) and os.path.exists(df['Chroma'].iloc[i]) and os.path.exists(df['Melodia'].iloc[i])): + continue + print 'file ' + str(i) + ' of ' + str(n_files) + music_idx = self.get_music_idx_for_file(df['Speech'].iloc[i]) + if len(music_idx)==0: + # no music segments -> skip this file + continue + try: + op, mfcc = self.get_op_mfcc_for_file(df['Melspec'].iloc[i]) + ch = self.get_chroma_for_file(df['Chroma'].iloc[i]) + #pb = self.get_pb_from_melodia(df['Melodia'].iloc[i]) + pb = self.load_precomputed_pb_from_melodia(df['Melodia'].iloc[i]) + #pb = self.get_contour_feat_from_melodia(df['Melodia'].iloc[i]) + except: + continue + min_n_frames = np.min([len(op), len(mfcc), len(ch), len(pb)]) # ideally, features should have the same number of frames + if min_n_frames==0: + # no features extracted -> skip this file + continue + music_idx = music_idx[music_idx x.""" + return np.ceil(np.log2(x)) + +def get_smoothie_frequencies(p): + """Calculate filter centres and widths for the smooth Q transform.""" + # I think this calculation should be reconsidered in order to have + # a more principled transition between linear and exponential + n = 1.0 / (2.0**(1.0/p['bpo']) - 1) + x = p['break1'] * 1.0 / n + + # first linear stretch + # f = (np.arange(1, int(np.round(n)) + 1) * x).round().tolist() + f = (np.arange(1, int(np.round(n)) + 1) * x).tolist() + + # exponential stretch + while max(f) < p['break2']: + f.append(max(f) * 2**(1.0/p['bpo'])) + + # final linear stretch + lastdiff = f[-1] - f[-2] + while max(f) < p['f_max']: + f.append(max(f) + lastdiff) + + deltaf = np.diff(np.array(f)) + f = f[:-1] + + return f, deltaf + +def create_smoothie_kernel(f, deltaf, fs): + """Create a sparse matrix that maps the complex DFT to the complex + smoothie representation. + """ + print >>sys.stdout, "[ SMOOTHIE Q kernel calculation ... ]" + n_filter = len(f) + n_fft = 2**nextpow2(np.ceil(fs/min(deltaf))) + + thresh = 0.0054 + smoothie_kernel = np.zeros([n_fft, n_filter], np.complex64) + + for i_filter in range(n_filter-1, -1, -1): # descending + # print i_filter + Q = f[i_filter] * 1.0 / deltaf[i_filter] # local Q for this filter + lgth = int(np.ceil(fs * 1.0 / deltaf[i_filter])) + lgthinv = 1.0 / lgth + offset = int(n_fft/2 - np.ceil(lgth * 0.5)) + temp = hann(lgth) * lgthinv * \ + np.exp(2j * np.pi * Q * (np.arange(lgth) - offset) * lgthinv) + # print(sum(hann(lgth)), Q, lgth, offset) + temp_kernel = np.zeros(n_fft, dtype = np.complex64) + temp_kernel[np.arange(lgth) + offset] = temp + spec_kernel = np.fft.fft(temp_kernel) + spec_kernel[abs(spec_kernel) <= thresh] = 0 + smoothie_kernel[:,i_filter] = spec_kernel + return csr_matrix(smoothie_kernel.conj()/n_fft) + +def smoothie_q_spectrogram(x, p): + """Calculate the actual spectrogram with smooth Q frequencies""" + print >>sys.stdout, "[ SMOOTHIE Q spectrogram ... 
]" + + # precalculate smoothie kernel + f, deltaf = get_smoothie_frequencies(p) + smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs']) + n_fft, n_filter = smoothie_kernel.shape + + # some preparations + n_sample = len(x) + # n_frame = int(np.floor(n_sample / p['step_size'])) + n_frame = int(np.ceil(n_sample / float(p['step_size']))) # added mp + t = (np.arange(n_frame) * p['step_size']) * 1.0 / p['fs'] + smoothie_kernelT = smoothie_kernel.T + + # allocate + s = np.zeros((n_filter, n_frame), dtype = np.complex64) + + # pad (if wanted) + x = np.concatenate((np.zeros(n_fft/2), x, np.zeros(n_fft/2))) + + for i_frame in range(n_frame): + smpl = p['step_size'] * i_frame + block = x[smpl + np.arange(n_fft)] + s[:, i_frame] = smoothie_kernelT.dot(np.fft.fft(block)) + + if p['use_A_wtg']: + a_wtg = make_a_weight(f) + s = s * a_wtg[:, np.newaxis] + + return s, t + +def mel_triangles(input_f): + """Make matrix with mel filters at the given frequencies. + Warning: this is a very coarse mel filterbank. + """ + n_linearfilters = 3 + n_logfilters0 = 30 # just something high, will be pruned later + linear_spacing = 100 + log_spacing = 6.0/4 + n_filter0 = n_linearfilters + n_logfilters0 + + freqs = np.zeros(n_filter0+2) # includes one more on either side, hence +2 + freqs[range(n_linearfilters+1)] = \ + np.arange(-1,n_linearfilters) * linear_spacing + freqs[range(n_linearfilters+1, n_filter0+2)] = \ + freqs[n_linearfilters] * log_spacing ** np.arange(1, n_logfilters0+2) + + centre_freqs = freqs[1:-1] + lower_freqs = freqs[0:-2] + upper_freqs = freqs[2:] + + n_filter = list(np.nonzero(lower_freqs < max(input_f)))[0][-1] + 1 + + n_input_f = len(input_f) + mtr = np.zeros((n_input_f, n_filter)) + + for i_filter in range(n_filter): + for i_freq in range(n_input_f): + if input_f[i_freq] > lower_freqs[i_filter] \ + and input_f[i_freq] <= upper_freqs[i_filter]: + + if input_f[i_freq] <= centre_freqs[i_filter]: + mtr[i_freq, i_filter] = \ + (input_f[i_freq] - lower_freqs[i_filter]) * 1.0 / \ + (centre_freqs[i_filter] - lower_freqs[i_filter]) + else: + mtr[i_freq, i_filter] = \ + 1 - (input_f[i_freq] - centre_freqs[i_filter]) / \ + (upper_freqs[i_filter] - centre_freqs[i_filter]) + return mtr + +def create_smoothie_nfm_dict(smoothie_kernel, filterf, p): + """Create dictionary matrix with note templates.""" + n_note = int((p['max_midi'] - p['min_midi']) * p['bps'] + 1) + n_fft, n_filter = smoothie_kernel.shape + t = ((np.arange(n_fft) + 1) - ((n_fft + 1)*0.5))/p['fs'] + + mtr = mel_triangles(filterf) + n_template = n_note + mtr.shape[1] + + w = np.zeros((n_filter, n_template), dtype = np.complex64) + w[:,n_note:] = mtr + f0s = [] + + smoothie_kernelT = smoothie_kernel.T + + for i_note in range(n_note): + midi = p['min_midi'] + i_note * 1.0 / p['bps'] + f0 = 440 * 2 ** ((midi-69)*1.0/12) + f0s.append(f0) + sig = np.zeros(len(t)) + for i_harm in range(p['n_harm']): + f = f0 * (i_harm + 1) + if f > p['fs'] * 0.5: + continue + x = np.sin(2*np.pi*f*t) * p['s']**(i_harm) + sig += x + w[:, i_note] = smoothie_kernelT.dot(np.fft.fft(sig)) + + for i in range(mtr.shape[1]): + f0s.append(np.nan) + + w = abs(w) + col_sums = w.sum(axis = 0) + w = w / col_sums[np.newaxis, :] # normalisation + return w, np.array(f0s) + +def smoothie_salience(x, p, do_tune = False): + """Calculate melodic salience.""" + print >>sys.stdout, "[ SMOOTHIE Q salience ... 
]" + + # precalculate nmf kernel + f, deltaf = get_smoothie_frequencies(p) + smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs']) + w, f0s = create_smoothie_nfm_dict(smoothie_kernel, f, p) + + # calculate smoothiegram + s, t = smoothie_q_spectrogram(x, p) + s[s==0] = np.spacing(1) # very small number + s = abs(s) + + # run NMF + n_frame = len(t) + print >>sys.stdout, "[ SMOOTHIE Q : NMF updates ... ]" + sal = np.ones((w.shape[1], n_frame)) + for i_iter in range(p['n_iter']): + sal = update_nmf_simple(sal, w, s) + + if do_tune: + error("Tuning isn't yet implemented in the Python version") + + return sal, t, f0s + +def qnormalise(x, q, dim): + nrm = np.sum(x**q, axis=dim, keepdims=True)**(1./float(q)) + nrmmatrix = np.repeat(nrm, x.shape[0], axis=0) + x = x / nrmmatrix + return x + +def wrap_to_octave(sal, param): + epsilon = 0.00001 + nsal = qnormalise(sal, 2, 0) + + step = 1./float(param['bps']) + NMFpitch = np.arange(param['min_midi'],param['max_midi']+step,step) + nNote = len(NMFpitch) + nsal = nsal[0:nNote, :] + + allsal = nsal + allsal[nsal 90: + startPointSec = (tracklength/2.)-20 + startSample = round(startPointSec*sr) + endSample = startSample+45*sr + y = y[startSample:endSample] + return y + +def get_smoothie(filename = 'test.wav', param = None, segment=True, secondframedecomp=False, hopinsec=None): + if param is None: + param = p + param['fs'] = None + param['step_size'] = 128 + + y, sr = librosa.load(filename, sr = param['fs']) + param['fs'] = sr + if hopinsec is not None: + param['step_size'] = int(round(hopinsec*param['fs'])) + if segment: + y = segment_audio(y,sr) + # sg, t = smoothie_q_spectrogram(y, param) + # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param) + sal, t, f0s = smoothie_salience(y, param) + sal = sal[-np.isnan(f0s),:] + chroma = wrap_to_octave(sal, param) + + if secondframedecomp: + chromasr = param['fs']/float(param['step_size']) + win2 = int(round(8*chromasr)) + hop2 = int(round(0.5*chromasr)) + nbins, norigframes = chroma.shape + nframes = int(1+np.floor((norigframes-win2)/float(hop2))) + avechroma = np.empty((nbins, nframes)) + for i in range(nframes): # loop over all 8-sec frames + avechroma[:,i] = np.mean(chroma[:, (i*hop2):(i*hop2+win2)], axis=1, keepdims=True) + chroma = avechroma + return chroma + +def get_smoothie_for_bihist(filename = 'test.wav', param = None, segment=True, hopinsec=None): + if param is None: + param = p + param['fs'] = None # no downsample + param['step_size'] = 128 + + y, sr = librosa.load(filename, sr = param['fs']) + param['fs'] = sr + if hopinsec is not None: + param['step_size'] = int(round(hopinsec*param['fs'])) + if segment: + y = segment_audio(y,sr) + # sg, t = smoothie_q_spectrogram(y, param) + # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param) + sal, t, f0s = smoothie_salience(y, param) + sal = sal[-np.isnan(f0s),:] + chroma = wrap_to_octave(sal, param) + chromasr = param['fs']/float(param['step_size']) + return (chroma, chromasr) \ No newline at end of file diff -r 230a0cf17de0 -r e50c63cf96be scripts/util_dataset.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/util_dataset.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 15 23:04:24 2017 + +@author: mariapanteli +""" + +import numpy as np +from sklearn.model_selection import train_test_split + + +def get_train_val_test_idx(X, Y, seed=None): + """ Split in train, validation, test sets. + + Parameters + ---------- + X : np.array + Data or indices. 
+ Y : np.array + Class labels for data in X. + seed: int + Random seed. + Returns + ------- + (X_train, Y_train) : tuple + Data X and labels y for the train set + (X_val, Y_val) : tuple + Data X and labels y for the validation set + (X_test, Y_test) : tuple + Data X and labels y for the test set + + """ + X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y) + X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test) + return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) + + +def subset_labels(Y, N_min=10, N_max=100, seed=None): + """ Subset dataset to contain minimum N_min and maximum N_max instances + per class. Return indices for this subset. + + Parameters + ---------- + Y : np.array + Class labels + N_min : int + Minimum instances per class + N_max : int + Maximum instances per class + seed: int + Random seed. + + Returns + ------- + subset_idx : np.array + Indices for a subset with classes of size bounded by N_min, N_max + + """ + subset_idx = [] + labels = np.unique(Y) + for label in labels: + label_idx = np.where(Y==label)[0] + counts = len(label_idx) + if counts>=N_max: + subset_idx.append(np.random.choice(label_idx, N_max, replace=False)) + elif counts>=N_min and counts0: + subset_idx = np.concatenate(subset_idx, axis=0) + return subset_idx diff -r 230a0cf17de0 -r e50c63cf96be scripts/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/utils.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed May 17 11:26:01 2017 + +@author: mariapanteli +""" + +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans +from sklearn import metrics +from collections import Counter +from sklearn.decomposition import PCA +from scipy import stats +from scipy.spatial.distance import mahalanobis +from sklearn.covariance import MinCovDet + + +def get_outliers(X, chi2thr=0.975): + robust_cov = MinCovDet().fit(X) + MD = robust_cov.mahalanobis(X) + chi2 = stats.chi2 + degrees_of_freedom = X.shape[1] + threshold = chi2.ppf(chi2thr, degrees_of_freedom) + y_pred = MD>threshold + return threshold, y_pred, MD + + +def get_outliers_Mahal(X, chi2thr=0.975): + n_samples = X.shape[0] + inv_cov = np.linalg.inv(np.cov(X.T)) + col_mean = np.mean(X, axis=0) + MD = np.zeros(n_samples, dtype='float') + for i in range(n_samples): + MD[i] = mahalanobis(X[i,:], col_mean, inv_cov) + MD = MD ** 2 + degrees_of_freedom = X.shape[1] + chi2 = stats.chi2 + threshold = chi2.ppf(chi2thr, degrees_of_freedom) + y_pred = MD>threshold + return threshold, y_pred, MD + + +def pca_data(X, min_variance=None): + # rotate data to avoid singularity in Mahalanobis/covariance matrix + model = PCA(whiten=True).fit(X) + model.explained_variance_ratio_.sum() + if min_variance is None: + n_pc = X.shape[1] + else: + n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0] + X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X) + return X_pca, n_pc + + +def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False): + uniq_labels = np.unique(Y) + spatial_outliers = [] + for uniq_label in uniq_labels: + countries_neighbors = w_dict[uniq_label] + if len(countries_neighbors)==0: + print uniq_label, " no neighbors found" + continue + inds_neighborhood = [] + for country in countries_neighbors: + inds = np.where(Y==country)[0] + inds_neighborhood.append(inds) # append neighboring countries + if 
len(np.concatenate(inds_neighborhood))==0: + print "no neighbors found" + continue + inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country + inds_neighborhood = np.concatenate(inds_neighborhood) + if do_pca: + XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99) + else: + XX = X[inds_neighborhood, :] # assume X is already in PCA + print len(inds_neighborhood) + if len(inds_neighborhood)0: + # if country has neighbors according to json file + neighbors_iso = country_borders_iso[country_names.index(country)] + neighbors_names = [country_names[country_iso.index(nn)] for nn in neighbors_iso] + for neighbor in neighbors_names: + if neighbor in data_countries: + neighbor_idx = np.where(data_countries==neighbor)[0][0] + neighbors[i][neighbor_idx] = 1.0 + w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) + return w + + +def get_countries_from_shapefile(shapefile): + shp = fiona.open(shapefile, 'r') + countries = [] + if shp[0]["properties"].has_key("ADMIN"): + country_keyword = "ADMIN" + elif shp[0]["properties"].has_key("NAME"): + country_keyword = "NAME" + else: + country_keyword = "admin" + for line in shp: + countries.append(line["properties"][country_keyword]) + shp.close() + return countries + + +def replace_empty_neighbours_with_KNN(data_countries, w): + shapefile = "../MergeBL-Smith/shapefiles/ne_10m_admin_0_countries.shp" + no_neighbors_idx = w.islands + knn = 10 + wknn = pysal.knnW_from_shapefile(shapefile, knn) + knn_countries = get_countries_from_shapefile(shapefile) + neighbors = w.neighbors + for nn_idx in no_neighbors_idx: + country = data_countries[nn_idx] + print country + if country not in knn_countries: + continue + knn_country_idx = knn_countries.index(country) + knn_country_neighbors = [knn_countries[nn] for nn in wknn.neighbors[knn_country_idx]] + for knn_nn in knn_country_neighbors: + if len(neighbors[nn_idx])>2: + continue + data_country_idx = np.where(data_countries==knn_nn)[0] + if len(data_country_idx)>0: + neighbors[nn_idx][data_country_idx[0]] = 1.0 + w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) + return w + + +def get_neighbors_for_countries_in_dataset(Y): + # neighbors + data_countries = np.unique(Y) + w = neighbors_from_json_file(data_countries) + w = replace_empty_neighbours_with_KNN(data_countries, w) + return w, data_countries + + +def from_weights_to_dict(w, data_countries): + w_dict = {} + for i in w.neighbors: + w_dict[data_countries[i]] = [data_countries[nn] for nn in w.neighbors[i]] + return w_dict + + +def get_LH_HL_idx(lm, p_vals): + sig_idx = np.where(p_vals<0.05)[0] + LH_idx = sig_idx[np.where(lm.q[sig_idx]==2)[0]] + HL_idx = sig_idx[np.where(lm.q[sig_idx]==4)[0]] + return LH_idx, HL_idx + + +def print_Moran_outliers(y, w, data_countries): + lm = pysal.Moran_Local(y, w) + p_vals = lm.p_z_sim + LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) + print 'LH', zip(data_countries[LH_idx], p_vals[LH_idx]) # LH + print 'HL', zip(data_countries[HL_idx], p_vals[HL_idx]) # HL + + +def plot_Moran_scatterplot(y, w, data_countries, out_file=None): + lm = pysal.Moran_Local(y, w) + p_vals = lm.p_z_sim + LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) + + ylt = pysal.lag_spatial(lm.w, lm.y) + yt = lm.y + yt = (yt - np.mean(yt))/np.std(yt) + ylt = (ylt - np.mean(ylt))/np.std(ylt) + colors = plt.cm.spectral(np.linspace(0,1,5)) + quad = np.zeros(yt.shape, dtype=int) + quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH + quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH + quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL + 
quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL + marker_color = colors[quad] + marker_size = 40*np.ones(yt.shape, dtype=int) + marker_size[LH_idx] = 140 + marker_size[HL_idx] = 140 + + plt.figure() + plt.scatter(yt, ylt, c=marker_color, s=marker_size, alpha=0.7) + plt.xlabel('Value') + plt.ylabel('Spatially Lagged Value') + plt.axvline(c='black', ls='--') + plt.axhline(c='black', ls='--') + plt.ylim(min(ylt)-0.5, max(ylt)+0.5) + plt.xlim(min(yt)-0.5, max(yt)+1.5) + for i in np.concatenate((LH_idx, HL_idx)): + plt.annotate(data_countries[i], (yt[i], ylt[i]), xytext=(yt[i]*1.1, ylt[i]*1.1),textcoords='data',arrowprops=dict(arrowstyle="->",connectionstyle="arc3")) + extreme_points = np.concatenate(([np.argmin(ylt)], [np.argmax(ylt)], np.where(yt>np.mean(yt)+2.8*np.std(yt))[0], np.where(ylt>np.mean(yt)+2.8*np.std(ylt))[0])) + extreme_points = np.array(list(set(extreme_points) - set(np.concatenate((LH_idx, HL_idx))))) + for i in extreme_points: + plt.annotate(data_countries[i], (yt[i]+0.1, ylt[i])) + if out_file is not None: + plt.savefig(out_file) diff -r 230a0cf17de0 -r e50c63cf96be util_dataset.py --- a/util_dataset.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed Mar 15 23:04:24 2017 - -@author: mariapanteli -""" - -import numpy as np -from sklearn.model_selection import train_test_split - - -def get_train_val_test_idx(X, Y, seed=None): - """ Split in train, validation, test sets. - - Parameters - ---------- - X : np.array - Data or indices. - Y : np.array - Class labels for data in X. - seed: int - Random seed. - Returns - ------- - (X_train, Y_train) : tuple - Data X and labels y for the train set - (X_val, Y_val) : tuple - Data X and labels y for the validation set - (X_test, Y_test) : tuple - Data X and labels y for the test set - - """ - X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y) - X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test) - return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) - - -def subset_labels(Y, N_min=10, N_max=100, seed=None): - """ Subset dataset to contain minimum N_min and maximum N_max instances - per class. Return indices for this subset. - - Parameters - ---------- - Y : np.array - Class labels - N_min : int - Minimum instances per class - N_max : int - Maximum instances per class - seed: int - Random seed. 
- - Returns - ------- - subset_idx : np.array - Indices for a subset with classes of size bounded by N_min, N_max - - """ - subset_idx = [] - labels = np.unique(Y) - for label in labels: - label_idx = np.where(Y==label)[0] - counts = len(label_idx) - if counts>=N_max: - subset_idx.append(np.random.choice(label_idx, N_max, replace=False)) - elif counts>=N_min and counts0: - subset_idx = np.concatenate(subset_idx, axis=0) - return subset_idx diff -r 230a0cf17de0 -r e50c63cf96be util_filter_dataset.py --- a/util_filter_dataset.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Mon Apr 10 17:44:48 2017 - -@author: mariapanteli -""" - -import os -import numpy as np -import pandas as pd - - -def get_speech_vamp(df): - jspeech = df.columns.get_loc("Speech") - nfiles = len(df) - speechinds = [] - for i in range(nfiles): - #print i - if os.path.exists(df.iat[i, jspeech]) and os.path.getsize(df.iat[i, jspeech])>0: - bounds = pd.read_csv(df.iat[i, jspeech], header=None, delimiter='\t').get_values() - if len(bounds)>0: - if len(np.where(bounds[:,2]=='m')[0])==0 or len(np.where(bounds[:,2]=='s')[0])==len(bounds): - speechinds.append(i) - return speechinds - - -def get_speech_meta(df): - genres = np.array(df["Genre_Album"].get_values(), dtype=str) - speechinds_genre = [] - invalid_genres = ["Spoken Word", "Language Instruction", "Classical", - "Poetry", "Nature|Sounds", "Music Instruction", - "Soundtracks &", "Contemporary &", "Jazz &", - "Sounds", "Ragtime", "Nature", "Electronic", - "African American Spoken", "Blues", "Gospel", - "Psychology &"] - for i in range(len(genres)): - genre = genres[i] - #if genre in invalid_genres: - if any(x in genre for x in invalid_genres): - speechinds_genre.append(i) - return speechinds_genre - - -def get_missing_csv(df): - nfiles = len(df) - missing_csv = [] - for i in range(nfiles): - if not (os.path.exists(df["Melspec"].iloc[i]) and os.path.exists(df["Chroma"].iloc[i]) and os.path.exists(df["Melodia"].iloc[i])): - missing_csv.append(i) - return missing_csv - - -def get_missing_country_meta(df): - nfiles = len(df) - missing_country = [] - country_labels = np.array(df['Country'].get_values(), dtype=str) - invalid_countries = ['Unidentified', 'unknown', 'nan', - 'Yugoslavia (former)', 'Pathian village Wangulei ', - 'Joulouloum either Senegal or The Gambia '] - for i in range(nfiles): - country = country_labels[i] - if country in invalid_countries: - missing_country.append(i) - return missing_country - - -def remove_missing_data(df): - speechinds_vamp = get_speech_vamp(df) - speechinds_genre = get_speech_meta(df) - speechinds = set(speechinds_vamp) | set(speechinds_genre) - missing = set(get_missing_csv(df)) - missing_country = set(get_missing_country_meta(df)) - selectinds = np.asarray(list(set(range(len(df))) - (missing | speechinds | missing_country))) - - df = df.iloc[selectinds, :] - return df \ No newline at end of file diff -r 230a0cf17de0 -r e50c63cf96be utils.py --- a/utils.py Mon Sep 11 11:32:45 2017 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 17 11:26:01 2017 - -@author: mariapanteli -""" - -import numpy as np -import pandas as pd -from sklearn.cluster import KMeans -from sklearn import metrics -from collections import Counter -from sklearn.decomposition import PCA -from scipy import stats -from scipy.spatial.distance import mahalanobis -from sklearn.covariance import MinCovDet - - -def 
get_outliers(X, chi2thr=0.975): - robust_cov = MinCovDet().fit(X) - MD = robust_cov.mahalanobis(X) - chi2 = stats.chi2 - degrees_of_freedom = X.shape[1] - threshold = chi2.ppf(chi2thr, degrees_of_freedom) - y_pred = MD>threshold - return threshold, y_pred, MD - - -def get_outliers_Mahal(X, chi2thr=0.975): - n_samples = X.shape[0] - inv_cov = np.linalg.inv(np.cov(X.T)) - col_mean = np.mean(X, axis=0) - MD = np.zeros(n_samples, dtype='float') - for i in range(n_samples): - MD[i] = mahalanobis(X[i,:], col_mean, inv_cov) - MD = MD ** 2 - degrees_of_freedom = X.shape[1] - chi2 = stats.chi2 - threshold = chi2.ppf(chi2thr, degrees_of_freedom) - y_pred = MD>threshold - return threshold, y_pred, MD - - -def pca_data(X, min_variance=None): - # rotate data to avoid singularity in Mahalanobis/covariance matrix - model = PCA(whiten=True).fit(X) - model.explained_variance_ratio_.sum() - if min_variance is None: - n_pc = X.shape[1] - else: - n_pc = np.where(model.explained_variance_ratio_.cumsum()>min_variance)[0][0] - X_pca = PCA(n_components=n_pc, whiten=True).fit_transform(X) - return X_pca, n_pc - - -def get_local_outliers_from_neighbors_dict(X, Y, w_dict, chi2thr=0.975, do_pca=False): - uniq_labels = np.unique(Y) - spatial_outliers = [] - for uniq_label in uniq_labels: - countries_neighbors = w_dict[uniq_label] - if len(countries_neighbors)==0: - print uniq_label, " no neighbors found" - continue - inds_neighborhood = [] - for country in countries_neighbors: - inds = np.where(Y==country)[0] - inds_neighborhood.append(inds) # append neighboring countries - if len(np.concatenate(inds_neighborhood))==0: - print "no neighbors found" - continue - inds_neighborhood.append(np.where(Y==uniq_label)[0]) # append query country - inds_neighborhood = np.concatenate(inds_neighborhood) - if do_pca: - XX, _ = pca_data(X[inds_neighborhood, :], min_variance=0.99) - else: - XX = X[inds_neighborhood, :] # assume X is already in PCA - print len(inds_neighborhood) - if len(inds_neighborhood)0: - # if country has neighbors according to json file - neighbors_iso = country_borders_iso[country_names.index(country)] - neighbors_names = [country_names[country_iso.index(nn)] for nn in neighbors_iso] - for neighbor in neighbors_names: - if neighbor in data_countries: - neighbor_idx = np.where(data_countries==neighbor)[0][0] - neighbors[i][neighbor_idx] = 1.0 - w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) - return w - - -def get_countries_from_shapefile(shapefile): - shp = fiona.open(shapefile, 'r') - countries = [] - if shp[0]["properties"].has_key("ADMIN"): - country_keyword = "ADMIN" - elif shp[0]["properties"].has_key("NAME"): - country_keyword = "NAME" - else: - country_keyword = "admin" - for line in shp: - countries.append(line["properties"][country_keyword]) - shp.close() - return countries - - -def replace_empty_neighbours_with_KNN(data_countries, w): - shapefile = "../MergeBL-Smith/shapefiles/ne_10m_admin_0_countries.shp" - no_neighbors_idx = w.islands - knn = 10 - wknn = pysal.knnW_from_shapefile(shapefile, knn) - knn_countries = get_countries_from_shapefile(shapefile) - neighbors = w.neighbors - for nn_idx in no_neighbors_idx: - country = data_countries[nn_idx] - print country - if country not in knn_countries: - continue - knn_country_idx = knn_countries.index(country) - knn_country_neighbors = [knn_countries[nn] for nn in wknn.neighbors[knn_country_idx]] - for knn_nn in knn_country_neighbors: - if len(neighbors[nn_idx])>2: - continue - data_country_idx = np.where(data_countries==knn_nn)[0] - 
if len(data_country_idx)>0: - neighbors[nn_idx][data_country_idx[0]] = 1.0 - w = pysal.weights.W(neighbors, id_order=range(len(data_countries))) - return w - - -def get_neighbors_for_countries_in_dataset(Y): - # neighbors - data_countries = np.unique(Y) - w = neighbors_from_json_file(data_countries) - w = replace_empty_neighbours_with_KNN(data_countries, w) - return w, data_countries - - -def from_weights_to_dict(w, data_countries): - w_dict = {} - for i in w.neighbors: - w_dict[data_countries[i]] = [data_countries[nn] for nn in w.neighbors[i]] - return w_dict - - -def get_LH_HL_idx(lm, p_vals): - sig_idx = np.where(p_vals<0.05)[0] - LH_idx = sig_idx[np.where(lm.q[sig_idx]==2)[0]] - HL_idx = sig_idx[np.where(lm.q[sig_idx]==4)[0]] - return LH_idx, HL_idx - - -def print_Moran_outliers(y, w, data_countries): - lm = pysal.Moran_Local(y, w) - p_vals = lm.p_z_sim - LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) - print 'LH', zip(data_countries[LH_idx], p_vals[LH_idx]) # LH - print 'HL', zip(data_countries[HL_idx], p_vals[HL_idx]) # HL - - -def plot_Moran_scatterplot(y, w, data_countries, out_file=None): - lm = pysal.Moran_Local(y, w) - p_vals = lm.p_z_sim - LH_idx, HL_idx = get_LH_HL_idx(lm, p_vals) - - ylt = pysal.lag_spatial(lm.w, lm.y) - yt = lm.y - yt = (yt - np.mean(yt))/np.std(yt) - ylt = (ylt - np.mean(ylt))/np.std(ylt) - colors = plt.cm.spectral(np.linspace(0,1,5)) - quad = np.zeros(yt.shape, dtype=int) - quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH - quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH - quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL - quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL - marker_color = colors[quad] - marker_size = 40*np.ones(yt.shape, dtype=int) - marker_size[LH_idx] = 140 - marker_size[HL_idx] = 140 - - plt.figure() - plt.scatter(yt, ylt, c=marker_color, s=marker_size, alpha=0.7) - plt.xlabel('Value') - plt.ylabel('Spatially Lagged Value') - plt.axvline(c='black', ls='--') - plt.axhline(c='black', ls='--') - plt.ylim(min(ylt)-0.5, max(ylt)+0.5) - plt.xlim(min(yt)-0.5, max(yt)+1.5) - for i in np.concatenate((LH_idx, HL_idx)): - plt.annotate(data_countries[i], (yt[i], ylt[i]), xytext=(yt[i]*1.1, ylt[i]*1.1),textcoords='data',arrowprops=dict(arrowstyle="->",connectionstyle="arc3")) - extreme_points = np.concatenate(([np.argmin(ylt)], [np.argmax(ylt)], np.where(yt>np.mean(yt)+2.8*np.std(yt))[0], np.where(ylt>np.mean(yt)+2.8*np.std(ylt))[0])) - extreme_points = np.array(list(set(extreme_points) - set(np.concatenate((LH_idx, HL_idx))))) - for i in extreme_points: - plt.annotate(data_countries[i], (yt[i]+0.1, ylt[i])) - if out_file is not None: - plt.savefig(out_file)
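The changeset above only relocates code into scripts/; for orientation, the relocated dataset helpers can be driven much as load_dataset.py did before the move. Below is a minimal usage sketch, not part of the changeset: it assumes scripts/ is on the Python path, and the metadata path 'metadata.csv' and seed value are hypothetical placeholders.

# Illustrative sketch only, not part of the changeset above.
# Assumes scripts/ is on sys.path; 'metadata.csv' and seed=0 are hypothetical.
import numpy as np
import pandas as pd

import util_dataset  # scripts/util_dataset.py from this changeset

df = pd.read_csv('metadata.csv')  # hypothetical metadata file with a 'Country' column
labels = df['Country'].values
# keep classes with at least 10 and at most 100 recordings (the module defaults)
subset_idx = util_dataset.subset_labels(labels, N_min=10, N_max=100)
df = df.iloc[subset_idx, :]

# stratified 60/20/20 split of record indices into train/validation/test sets
X, Y = np.arange(len(df)), df['Country'].values
train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y, seed=0)
n_train, n_val, n_test = len(train_set[0]), len(val_set[0]), len(test_set[0])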