# -*- coding: utf-8 -*-
"""
Created on Wed Mar 15 22:52:57 2017

@author: mariapanteli

Split the metadata into train/validation/test sets, extract frame-level
features for each split and pickle the result to ``OUTPUT_FILES``.
"""

import numpy as np
import pandas as pd
import pickle

import load_features
import util_dataset
import util_filter_dataset


# Alternative configurations kept for reference:
#METADATA_FILE = 'sample_dataset/metadata.csv'
#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']

# Window size passed as ``win2sec`` to the feature loader; it is also part of
# the output file names so runs with different windows do not overwrite
# each other.
WIN_SIZE = 2
METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
# One output pickle per split, in the order train, val, test.
OUTPUT_FILES = ['/import/c4dm-04/mariap/%s_data_melodia_%s.pickle' % (split_name, WIN_SIZE)
                for split_name in ('train', 'val', 'test')]


def extract_features(df, win2sec=8.0):
    """Extract and concatenate the frame-level features for a set of recordings.

    The rhythm, mel, MFCC and chroma frames returned by
    ``load_features.FeatureLoader.get_features`` are concatenated, in that
    order, along axis 1.

    Parameters
    ----------
    df : pd.DataFrame
        Metadata including class label and path to audio, melspec, chroma
    win2sec : float
        The window size for the second frame decomposition of the features

    Returns
    -------
    X : np.array
        The features for every frame x every audio file in the dataset
    Y : np.array
        The class labels for every frame in the dataset
    Y_audio : np.array
        The audio labels
    """
    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
    (frames_rhy, frames_mfcc, frames_chroma, frames_mel,
     Y_df, Y_audio_df) = feat_loader.get_features(df)
    # A single pre-formatted string prints identically on Python 2 and 3.
    print(' '.join(str(frames.shape) for frames in
                   (frames_rhy, frames_mel, frames_mfcc, frames_chroma)))
    # NOTE: the column order (rhythm, mel, mfcc, chroma) differs from the
    # order in which get_features returns the arrays; consumers of the pickles
    # presumably rely on this layout, so it is kept as is.
    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
    # ``.values`` replaces ``get_values()``, which was removed in pandas 1.0.
    Y = Y_df.values
    Y_audio = Y_audio_df.values
    return X, Y, Y_audio


def main():
    """Load the metadata, split it and write one feature pickle per split."""
    # load dataset
    df = pd.read_csv(METADATA_FILE)
    df = util_filter_dataset.remove_missing_data(df)
    subset_idx = util_dataset.subset_labels(df['Country'].values)
    df = df.iloc[subset_idx, :]
    # X holds row positions only, so the split helper returns positions that
    # can be fed straight back into ``df.iloc``.
    X, Y = np.arange(len(df)), df['Country'].values

    # split in train, val, test set
    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y)

    # extract features and write output, one split at a time
    # (element 0 of each split is used as the row positions of that split --
    # presumably the X part of an (X, Y) pair; confirm against util_dataset)
    for split, out_file in zip((train_set, val_set, test_set), OUTPUT_FILES):
        X_split, Y_split, Y_audio_split = extract_features(df.iloc[split[0], :],
                                                           win2sec=WIN_SIZE)
        with open(out_file, 'wb') as f:
            pickle.dump([X_split, Y_split, Y_audio_split], f)

    # Debugging alternative: write the test split to a separate file.
    #out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
    #with open(out_file, 'wb') as f:
    #    pickle.dump([X_test, Y_test, Y_audio_test], f)


if __name__ == '__main__':
    main()