Mercurial > hg > plosone_underreview
diff scripts/load_dataset.py @ 4:e50c63cf96be branch-tests
rearranging folders
author | Maria Panteli |
---|---|
date | Mon, 11 Sep 2017 11:51:50 +0100 |
parents | |
children | 98718fdd8326 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/load_dataset.py Mon Sep 11 11:51:50 2017 +0100 @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Mar 15 22:52:57 2017 + +@author: mariapanteli +""" + +import numpy as np +import pandas as pd +import pickle + +import load_features +import util_dataset +import util_filter_dataset + + +#METADATA_FILE = 'sample_dataset/metadata.csv' +#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle'] +WIN_SIZE = 2 +METADATA_FILE = 'data/metadata_BLSM_language_all.csv' +#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle'] +#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle'] +OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', + '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', + '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle'] + +def extract_features(df, win2sec=8.0): + """Extract features from melspec and chroma. + + Parameters + ---------- + df : pd.DataFrame + Metadata including class label and path to audio, melspec, chroma + win2sec : float + The window size for the second frame decomposition of the features + + Returns + ------- + X : np.array + The features for every frame x every audio file in the dataset + Y : np.array + The class labels for every frame in the dataset + Y_audio : np.array + The audio labels + """ + feat_loader = load_features.FeatureLoader(win2sec=win2sec) + frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df) + print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape + X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1) + Y = Y_df.get_values() + Y_audio = Y_audio_df.get_values() + return X, Y, Y_audio + + +if __name__ == '__main__': + # load dataset + df = pd.read_csv(METADATA_FILE) + df = util_filter_dataset.remove_missing_data(df) + subset_idx = util_dataset.subset_labels(df['Country'].get_values()) + df = df.iloc[subset_idx, :] + X, Y = np.arange(len(df)), df['Country'].get_values() + + # split in train, val, test set + train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y) + + # extract features and write output + X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE) + with open(OUTPUT_FILES[0], 'wb') as f: + pickle.dump([X_train, Y_train, Y_audio_train], f) + + X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE) + with open(OUTPUT_FILES[1], 'wb') as f: + pickle.dump([X_val, Y_val, Y_audio_val], f) + + X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE) + with open(OUTPUT_FILES[2], 'wb') as f: + pickle.dump([X_test, Y_test, Y_audio_test], f) + +#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle' +# pickle.dump([X_test, Y_test, Y_audio_test], f) +#with open(out_file, 'wb') as f: