Mercurial > hg > plosone_underreview
diff scripts/load_dataset.py @ 15:9847b954c217 branch-tests
added sensitivity experiment
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Tue, 12 Sep 2017 19:03:48 +0100 |
parents | 98718fdd8326 |
children | 65b9330afdd8 |
line wrap: on
line diff
```diff
--- a/scripts/load_dataset.py Tue Sep 12 18:03:56 2017 +0100
+++ b/scripts/load_dataset.py Tue Sep 12 19:03:48 2017 +0100
@@ -17,7 +17,7 @@
 #METADATA_FILE = 'sample_dataset/metadata.csv'
 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
 WIN_SIZE = 8
-METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
+METADATA_FILE = '../data/metadata_BLSM_language_all.csv'
 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle',
                 '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle',
                 '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
@@ -89,7 +89,7 @@
 
 
 def extract_features(df, win2sec=8.0):
-    """Extract features from melspec and chroma.
+    """ Extract features from melspec and chroma.
 
     Parameters
     ----------
@@ -116,27 +116,53 @@
     return X, Y, Y_audio
 
 
-if __name__ == '__main__':
-    # load dataset
-    df = pd.read_csv(METADATA_FILE)
+def sample_dataset(csv_file):
+    """ Load data from csv and select min 10 - max 100 recs from each country.
+
+    Parameters
+    ----------
+    csv_file : str
+        The path to the csv file containing the metadata (including country) of the tracks.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The metadata for the selected subset of tracks.
+    """
+    df = pd.read_csv(csv_file)
     df = util_filter_dataset.remove_missing_data(df)
     subset_idx = subset_labels(df['Country'].get_values())
     df = df.iloc[subset_idx, :]
-    X, Y = np.arange(len(df)), df['Country'].get_values()
+    return df
 
-    # split in train, val, test set
-    train_set, val_set, test_set = get_train_val_test_idx(X, Y)
-
-    # extract features and write output
+
+def features_for_train_test_sets(df, write_output=False):
+    """Split in train/val/test sets, extract features and write output files.
+
+    Parameters
+    -------
+    df : pd.DataFrame
+        The metadata for the selected subset of tracks.
+    write_output : boolean
+        Whether to write files with the extracted features for train/val/test sets.
+    """
+    X_idx, Y = np.arange(len(df)), df['Country'].get_values()
+    train_set, val_set, test_set = get_train_val_test_idx(X_idx, Y)
     X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[0], 'wb') as f:
-        pickle.dump([X_train, Y_train, Y_audio_train], f)
-
-    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[1], 'wb') as f:
-        pickle.dump([X_val, Y_val, Y_audio_val], f)
-
+    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
     X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[2], 'wb') as f:
-        pickle.dump([X_test, Y_test, Y_audio_test], f)
+
+    if write_output:
+        with open(OUTPUT_FILES[0], 'wb') as f:
+            pickle.dump([X_train, Y_train, Y_audio_train], f)
+        with open(OUTPUT_FILES[1], 'wb') as f:
+            pickle.dump([X_val, Y_val, Y_audio_val], f)
+        with open(OUTPUT_FILES[2], 'wb') as f:
+            pickle.dump([X_test, Y_test, Y_audio_test], f)
+
 
+if __name__ == '__main__':
+    # load dataset
+    df = sample_dataset(csv_file=METADATA_FILE)
+    features_for_train_test_sets(df, write_output=True)
+
```