diff scripts/load_dataset.py @ 4:e50c63cf96be branch-tests

rearranging folders
author Maria Panteli
date Mon, 11 Sep 2017 11:51:50 +0100
parents
children 98718fdd8326
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/load_dataset.py	Mon Sep 11 11:51:50 2017 +0100
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Mar 15 22:52:57 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pandas as pd
+import pickle
+
+import load_features
+import util_dataset
+import util_filter_dataset
+
+
+#METADATA_FILE = 'sample_dataset/metadata.csv'
+#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
+WIN_SIZE = 2  # passed as win2sec to extract_features and embedded in the output file names below
+METADATA_FILE = 'data/metadata_BLSM_language_all.csv'  # CSV read with pd.read_csv in the __main__ block
+#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
+#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']
+OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle',  # [0]: pickle written for the train split
+                '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle',  # [1]: pickle written for the val split
+                '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']  # [2]: pickle written for the test split
+
+def extract_features(df, win2sec=8.0):
+    """Extract features from melspec and chroma.
+    
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Metadata including class label and path to audio, melspec, chroma
+    win2sec : float
+        The window size for the second frame decomposition of the features
+        
+    Returns
+    -------
+    X : np.array
+        The features for every frame x every audio file in the dataset
+    Y : np.array
+        The class labels for every frame in the dataset
+    Y_audio : np.array
+        The audio labels
+    """
+    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
+    frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df)
+    print frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape
+    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
+    Y = Y_df.get_values()
+    Y_audio = Y_audio_df.get_values()
+    return X, Y, Y_audio
+
+
+if __name__ == '__main__':
+    # load dataset
+    df = pd.read_csv(METADATA_FILE)
+    df = util_filter_dataset.remove_missing_data(df)
+    subset_idx = util_dataset.subset_labels(df['Country'].get_values())
+    df = df.iloc[subset_idx, :]
+    X, Y = np.arange(len(df)), df['Country'].get_values()
+    
+    # split in train, val, test set
+    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y) 
+    
+    # extract features and write output
+    X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
+    with open(OUTPUT_FILES[0], 'wb') as f:
+        pickle.dump([X_train, Y_train, Y_audio_train], f)
+        
+    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
+    with open(OUTPUT_FILES[1], 'wb') as f:
+        pickle.dump([X_val, Y_val, Y_audio_val], f)
+        
+    X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
+    with open(OUTPUT_FILES[2], 'wb') as f:
+        pickle.dump([X_test, Y_test, Y_audio_test], f)
+
+#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
+#    pickle.dump([X_test, Y_test, Y_audio_test], f)
+#with open(out_file, 'wb') as f: