changeset 14:088b5547e094 branch-tests
Merge
author:   Maria Panteli <m.x.panteli@gmail.com>
date:     Tue, 12 Sep 2017 18:03:56 +0100
parents:  98718fdd8326 (diff) ff18f364bbac (current diff)
children: 9847b954c217
diffstat: 5 files changed, 147 insertions(+), 86 deletions(-)
--- a/scripts/load_dataset.py Tue Sep 12 18:02:43 2017 +0100
+++ b/scripts/load_dataset.py Tue Sep 12 18:03:56 2017 +0100
@@ -8,22 +8,86 @@
 import numpy as np
 import pandas as pd
 import pickle
+from sklearn.model_selection import train_test_split
 
 import load_features
-import util_dataset
 import util_filter_dataset
 
 
 #METADATA_FILE = 'sample_dataset/metadata.csv'
 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
 
-WIN_SIZE = 2
+WIN_SIZE = 8
 METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']
 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
+
+
+def get_train_val_test_idx(X, Y, seed=None):
+    """ Split in train, validation, test sets.
+
+    Parameters
+    ----------
+    X : np.array
+        Data or indices.
+    Y : np.array
+        Class labels for data in X.
+    seed: int
+        Random seed.
+    Returns
+    -------
+    (X_train, Y_train) : tuple
+        Data X and labels y for the train set
+    (X_val, Y_val) : tuple
+        Data X and labels y for the validation set
+    (X_test, Y_test) : tuple
+        Data X and labels y for the test set
+
+    """
+    X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y)
+    X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test)
+    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
+
+
+def subset_labels(Y, N_min=10, N_max=100, seed=None):
+    """ Subset dataset to contain minimum N_min and maximum N_max instances
+    per class. Return indices for this subset.
+
+    Parameters
+    ----------
+    Y : np.array
+        Class labels
+    N_min : int
+        Minimum instances per class
+    N_max : int
+        Maximum instances per class
+    seed: int
+        Random seed.
+
+    Returns
+    -------
+    subset_idx : np.array
+        Indices for a subset with classes of size bounded by N_min, N_max
+
+    """
+    np.random.seed(seed=seed)
+    subset_idx = []
+    labels = np.unique(Y)
+    for label in labels:
+        label_idx = np.where(Y==label)[0]
+        counts = len(label_idx)
+        if counts>=N_max:
+            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
+        elif counts>=N_min and counts<N_max:
+            subset_idx.append(label_idx)
+        else:
+            # not enough samples for this class, skip
+            continue
+    if len(subset_idx)>0:
+        subset_idx = np.concatenate(subset_idx, axis=0)
+    return subset_idx
+
+
 def extract_features(df, win2sec=8.0):
     """Extract features from melspec and chroma.
@@ -56,12 +120,12 @@
     # load dataset
     df = pd.read_csv(METADATA_FILE)
     df = util_filter_dataset.remove_missing_data(df)
-    subset_idx = util_dataset.subset_labels(df['Country'].get_values())
+    subset_idx = subset_labels(df['Country'].get_values())
     df = df.iloc[subset_idx, :]
     X, Y = np.arange(len(df)), df['Country'].get_values()
 
     # split in train, val, test set
-    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y)
+    train_set, val_set, test_set = get_train_val_test_idx(X, Y)
 
     # extract features and write output
     X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
@@ -76,6 +140,3 @@
 
     with open(OUTPUT_FILES[2], 'wb') as f:
         pickle.dump([X_test, Y_test, Y_audio_test], f)
-#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
-# pickle.dump([X_test, Y_test, Y_audio_test], f)
-#with open(out_file, 'wb') as f:
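Reviewer note: the two helpers above are moved from scripts/util_dataset.py (deleted below), with one behavioural change: subset_labels now calls np.random.seed(seed=seed), making the per-class subsampling reproducible. A minimal usage sketch, not part of the changeset; the toy labels, the seed, and the assumption that scripts/ is on the import path are invented for illustration:

# Hypothetical check of the relocated helpers (run from scripts/).
import numpy as np
from load_dataset import subset_labels, get_train_val_test_idx

Y = np.repeat(['DE', 'FR', 'GR'], 100)   # 3 fake country labels x 100 tracks
X = np.arange(len(Y))                    # indices stand in for the data

# Cap classes at N_max=100, drop classes below N_min=10 (all pass here)
idx = subset_labels(Y, N_min=10, N_max=100, seed=42)

# Stratified 60/20/20 split: 60% train, then the remaining 40% is halved
train, val, test = get_train_val_test_idx(X[idx], Y[idx], seed=42)
print(len(train[0]), len(val[0]), len(test[0]))  # -> 180 60 60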
--- a/scripts/results.py Tue Sep 12 18:02:43 2017 +0100
+++ b/scripts/results.py Tue Sep 12 18:03:56 2017 +0100
@@ -88,8 +88,7 @@
     ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
     w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
     w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
-    Xrhy, Xmel, Xmfc, Xchr = X_list
-    X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+    X = np.concatenate(X_list, axis=1)
 
     # global outliers
     df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
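Reviewer note: this change replaces the hard-coded four-way unpack with a concatenation over however many feature blocks X_list happens to hold. A tiny sketch with made-up shapes:

import numpy as np

# Three toy feature blocks sharing the same number of rows (recordings)
X_list = [np.ones((5, 2)), np.zeros((5, 3)), np.ones((5, 4))]
X = np.concatenate(X_list, axis=1)
print(X.shape)  # (5, 9): features side by side, for any number of blocks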
--- a/scripts/util_dataset.py Tue Sep 12 18:02:43 2017 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 15 23:04:24 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-from sklearn.model_selection import train_test_split
-
-
-def get_train_val_test_idx(X, Y, seed=None):
-    """ Split in train, validation, test sets.
-
-    Parameters
-    ----------
-    X : np.array
-        Data or indices.
-    Y : np.array
-        Class labels for data in X.
-    seed: int
-        Random seed.
-    Returns
-    -------
-    (X_train, Y_train) : tuple
-        Data X and labels y for the train set
-    (X_val, Y_val) : tuple
-        Data X and labels y for the validation set
-    (X_test, Y_test) : tuple
-        Data X and labels y for the test set
-
-    """
-    X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y)
-    X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test)
-    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
-
-
-def subset_labels(Y, N_min=10, N_max=100, seed=None):
-    """ Subset dataset to contain minimum N_min and maximum N_max instances
-    per class. Return indices for this subset.
-
-    Parameters
-    ----------
-    Y : np.array
-        Class labels
-    N_min : int
-        Minimum instances per class
-    N_max : int
-        Maximum instances per class
-    seed: int
-        Random seed.
-
-    Returns
-    -------
-    subset_idx : np.array
-        Indices for a subset with classes of size bounded by N_min, N_max
-
-    """
-    subset_idx = []
-    labels = np.unique(Y)
-    for label in labels:
-        label_idx = np.where(Y==label)[0]
-        counts = len(label_idx)
-        if counts>=N_max:
-            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
-        elif counts>=N_min and counts<N_max:
-            subset_idx.append(label_idx)
-        else:
-            # not enough samples for this class, skip
-            continue
-    if len(subset_idx)>0:
-        subset_idx = np.concatenate(subset_idx, axis=0)
-    return subset_idx
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/util_filter_dataset.py Tue Sep 12 18:03:56 2017 +0100
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 10 17:44:48 2017
+
+@author: mariapanteli
+"""
+
+import os
+import numpy as np
+import pandas as pd
+
+
+def get_speech_vamp(df):
+    jspeech = df.columns.get_loc("Speech")
+    nfiles = len(df)
+    speechinds = []
+    for i in range(nfiles):
+        #print i
+        if os.path.exists(df.iat[i, jspeech]) and os.path.getsize(df.iat[i, jspeech])>0:
+            bounds = pd.read_csv(df.iat[i, jspeech], header=None, delimiter='\t').get_values()
+            if len(bounds)>0:
+                if len(np.where(bounds[:,2]=='m')[0])==0 or len(np.where(bounds[:,2]=='s')[0])==len(bounds):
+                    speechinds.append(i)
+    return speechinds
+
+
+def get_speech_meta(df):
+    genres = np.array(df["Genre_Album"].get_values(), dtype=str)
+    speechinds_genre = []
+    invalid_genres = ["Spoken Word", "Language Instruction", "Classical",
+                      "Poetry", "Nature|Sounds", "Music Instruction",
+                      "Soundtracks &", "Contemporary &", "Jazz &",
+                      "Sounds", "Ragtime", "Nature", "Electronic",
+                      "African American Spoken", "Blues", "Gospel",
+                      "Psychology &"]
+    for i in range(len(genres)):
+        genre = genres[i]
+        #if genre in invalid_genres:
+        if any(x in genre for x in invalid_genres):
+            speechinds_genre.append(i)
+    return speechinds_genre
+
+
+def get_missing_csv(df):
+    nfiles = len(df)
+    missing_csv = []
+    for i in range(nfiles):
+        if not (os.path.exists(df["Melspec"].iloc[i]) and os.path.exists(df["Chroma"].iloc[i]) and os.path.exists(df["Melodia"].iloc[i])):
+            missing_csv.append(i)
+    return missing_csv
+
+
+def get_missing_country_meta(df):
+    nfiles = len(df)
+    missing_country = []
+    country_labels = np.array(df['Country'].get_values(), dtype=str)
+    invalid_countries = ['Unidentified', 'unknown', 'nan',
+                         'Yugoslavia (former)', 'Pathian village Wangulei ',
+                         'Joulouloum either Senegal or The Gambia ']
+    for i in range(nfiles):
+        country = country_labels[i]
+        if country in invalid_countries:
+            missing_country.append(i)
+    return missing_country
+
+
+def remove_missing_data(df):
+    speechinds_vamp = get_speech_vamp(df)
+    speechinds_genre = get_speech_meta(df)
+    speechinds = set(speechinds_vamp) | set(speechinds_genre)
+    missing = set(get_missing_csv(df))
+    missing_country = set(get_missing_country_meta(df))
+    selectinds = np.asarray(list(set(range(len(df))) - (missing | speechinds | missing_country)))
+
+    df = df.iloc[selectinds, :]
+    return df
\ No newline at end of file
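Reviewer note: a sketch of how the new module is consumed, mirroring the call in load_dataset.py above. It assumes the metadata CSV carries, per recording, a Country label, a Genre_Album string, and file paths in the Speech, Melspec, Chroma and Melodia columns; the CSV path is the one used in this changeset:

import pandas as pd
import util_filter_dataset

df = pd.read_csv('data/metadata_BLSM_language_all.csv')
# Drops rows flagged as speech (by the Vamp segmentation in the Speech
# file or by Genre_Album), rows whose Melspec/Chroma/Melodia feature
# files are missing on disk, and rows with unidentified Country labels.
df = util_filter_dataset.remove_missing_data(df)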