diff scripts/load_dataset.py @ 15:9847b954c217 branch-tests

added sensitivity experiment
author Maria Panteli <m.x.panteli@gmail.com>
date Tue, 12 Sep 2017 19:03:48 +0100
parents 98718fdd8326
children 65b9330afdd8
line wrap: on
line diff
--- a/scripts/load_dataset.py	Tue Sep 12 18:03:56 2017 +0100
+++ b/scripts/load_dataset.py	Tue Sep 12 19:03:48 2017 +0100
@@ -17,7 +17,7 @@
 #METADATA_FILE = 'sample_dataset/metadata.csv'
 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
 WIN_SIZE = 8
-METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
+METADATA_FILE = '../data/metadata_BLSM_language_all.csv'
 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', 
                 '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', 
                 '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
@@ -89,7 +89,7 @@
 
 
 def extract_features(df, win2sec=8.0):
-    """Extract features from melspec and chroma.
+    """ Extract features from melspec and chroma.
     
     Parameters
     ----------
@@ -116,27 +116,53 @@
     return X, Y, Y_audio
 
 
-if __name__ == '__main__':
-    # load dataset
-    df = pd.read_csv(METADATA_FILE)
+def sample_dataset(csv_file):
+    """ Load data from csv and select a minimum of 10 and a maximum of 100 recordings from each country.
+
+    Parameters
+    ----------
+    csv_file : str
+        The path to the csv file containing the metadata (including country) of the tracks.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The metadata for the selected subset of tracks.
+    """
+    df = pd.read_csv(csv_file)
     df = util_filter_dataset.remove_missing_data(df)
     subset_idx = subset_labels(df['Country'].get_values())
     df = df.iloc[subset_idx, :]
-    X, Y = np.arange(len(df)), df['Country'].get_values()
+    return df
     
-    # split in train, val, test set
-    train_set, val_set, test_set = get_train_val_test_idx(X, Y) 
-    
-    # extract features and write output
+
+def features_for_train_test_sets(df, write_output=False):
+    """Split in train/val/test sets, extract features and optionally write output files.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The metadata for the selected subset of tracks.
+    write_output : boolean
+        Whether to write files with the extracted features for train/val/test sets.
+    """
+    X_idx, Y = np.arange(len(df)), df['Country'].get_values()
+    train_set, val_set, test_set = get_train_val_test_idx(X_idx, Y)
     X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[0], 'wb') as f:
-        pickle.dump([X_train, Y_train, Y_audio_train], f)
-        
-    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[1], 'wb') as f:
-        pickle.dump([X_val, Y_val, Y_audio_val], f)
-        
+    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)   
     X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
-    with open(OUTPUT_FILES[2], 'wb') as f:
-        pickle.dump([X_test, Y_test, Y_audio_test], f)
+   
+    if write_output:
+        with open(OUTPUT_FILES[0], 'wb') as f:
+            pickle.dump([X_train, Y_train, Y_audio_train], f)            
+        with open(OUTPUT_FILES[1], 'wb') as f:
+            pickle.dump([X_val, Y_val, Y_audio_val], f)
+        with open(OUTPUT_FILES[2], 'wb') as f:
+            pickle.dump([X_test, Y_test, Y_audio_test], f)
 
+
+if __name__ == '__main__':
+    # load dataset
+    df = sample_dataset(csv_file=METADATA_FILE)
+    features_for_train_test_sets(df, write_output=True)
+