changeset 14:088b5547e094 branch-tests
Merge
author:   Maria Panteli <m.x.panteli@gmail.com>
date:     Tue, 12 Sep 2017 18:03:56 +0100
parents:  98718fdd8326 (diff) ff18f364bbac (current diff)
children: 9847b954c217
diffstat: 5 files changed, 147 insertions(+), 86 deletions(-)
--- a/scripts/load_dataset.py Tue Sep 12 18:02:43 2017 +0100
+++ b/scripts/load_dataset.py Tue Sep 12 18:03:56 2017 +0100
@@ -8,22 +8,86 @@
 import numpy as np
 import pandas as pd
 import pickle
+from sklearn.model_selection import train_test_split
 
 import load_features
-import util_dataset
 import util_filter_dataset
 
 
 #METADATA_FILE = 'sample_dataset/metadata.csv'
 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
 
-WIN_SIZE = 2
+WIN_SIZE = 8
 METADATA_FILE = 'data/metadata_BLSM_language_all.csv'
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
-#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']
 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
+
+
+def get_train_val_test_idx(X, Y, seed=None):
+    """ Split in train, validation, test sets.
+
+    Parameters
+    ----------
+    X : np.array
+        Data or indices.
+    Y : np.array
+        Class labels for data in X.
+    seed: int
+        Random seed.
+    Returns
+    -------
+    (X_train, Y_train) : tuple
+        Data X and labels y for the train set
+    (X_val, Y_val) : tuple
+        Data X and labels y for the validation set
+    (X_test, Y_test) : tuple
+        Data X and labels y for the test set
+
+    """
+    X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y)
+    X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test)
+    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
+
+
+def subset_labels(Y, N_min=10, N_max=100, seed=None):
+    """ Subset dataset to contain minimum N_min and maximum N_max instances
+    per class. Return indices for this subset.
+
+    Parameters
+    ----------
+    Y : np.array
+        Class labels
+    N_min : int
+        Minimum instances per class
+    N_max : int
+        Maximum instances per class
+    seed: int
+        Random seed.
+
+    Returns
+    -------
+    subset_idx : np.array
+        Indices for a subset with classes of size bounded by N_min, N_max
+
+    """
+    np.random.seed(seed=seed)
+    subset_idx = []
+    labels = np.unique(Y)
+    for label in labels:
+        label_idx = np.where(Y==label)[0]
+        counts = len(label_idx)
+        if counts>=N_max:
+            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
+        elif counts>=N_min and counts<N_max:
+            subset_idx.append(label_idx)
+        else:
+            # not enough samples for this class, skip
+            continue
+    if len(subset_idx)>0:
+        subset_idx = np.concatenate(subset_idx, axis=0)
+    return subset_idx
+
+
 def extract_features(df, win2sec=8.0):
     """Extract features from melspec and chroma.
@@ -56,12 +120,12 @@
     # load dataset
     df = pd.read_csv(METADATA_FILE)
     df = util_filter_dataset.remove_missing_data(df)
-    subset_idx = util_dataset.subset_labels(df['Country'].get_values())
+    subset_idx = subset_labels(df['Country'].get_values())
     df = df.iloc[subset_idx, :]
     X, Y = np.arange(len(df)), df['Country'].get_values()
 
     # split in train, val, test set
-    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y)
+    train_set, val_set, test_set = get_train_val_test_idx(X, Y)
 
     # extract features and write output
     X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
@@ -76,6 +140,3 @@
 
     with open(OUTPUT_FILES[2], 'wb') as f:
         pickle.dump([X_test, Y_test, Y_audio_test], f)
-#out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle'
-# pickle.dump([X_test, Y_test, Y_audio_test], f)
-#with open(out_file, 'wb') as f:
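Reviewer note: the two helpers above are moved from scripts/util_dataset.py (deleted below), with one behavioural change: subset_labels now calls np.random.seed(seed=seed), making the per-class subsampling reproducible. A minimal usage sketch, not part of the changeset; the toy labels, the seed, and the assumption that scripts/ is on the import path are invented for illustration:

# Hypothetical check of the relocated helpers (run from scripts/).
import numpy as np
from load_dataset import subset_labels, get_train_val_test_idx

Y = np.repeat(['DE', 'FR', 'GR'], 100)   # 3 fake country labels x 100 tracks
X = np.arange(len(Y))                    # indices stand in for the data

# Cap classes at N_max=100, drop classes below N_min=10 (all pass here)
idx = subset_labels(Y, N_min=10, N_max=100, seed=42)

# Stratified 60/20/20 split: 60% train, then the remaining 40% is halved
train, val, test = get_train_val_test_idx(X[idx], Y[idx], seed=42)
print(len(train[0]), len(val[0]), len(test[0]))  # -> 180 60 60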
--- a/scripts/results.py Tue Sep 12 18:02:43 2017 +0100
+++ b/scripts/results.py Tue Sep 12 18:03:56 2017 +0100
@@ -88,8 +88,7 @@
     ddf = load_metadata(Yaudio, metadata_file='data/metadata.csv')
     w, data_countries = utils_spatial.get_neighbors_for_countries_in_dataset(Y)
     w_dict = utils_spatial.from_weights_to_dict(w, data_countries)
-    Xrhy, Xmel, Xmfc, Xchr = X_list
-    X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)
+    X = np.concatenate(X_list, axis=1)
 
     # global outliers
     df_global, threshold, MD = get_outliers_df(X, Y, chi2thr=0.999)
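Reviewer note: this change replaces the hard-coded four-way unpack with a concatenation over however many feature blocks X_list happens to hold. A tiny sketch with made-up shapes:

import numpy as np

# Three toy feature blocks sharing the same number of rows (recordings)
X_list = [np.ones((5, 2)), np.zeros((5, 3)), np.ones((5, 4))]
X = np.concatenate(X_list, axis=1)
print(X.shape)  # (5, 9): features side by side, for any number of blocks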
--- a/scripts/util_dataset.py Tue Sep 12 18:02:43 2017 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Mar 15 23:04:24 2017
-
-@author: mariapanteli
-"""
-
-import numpy as np
-from sklearn.model_selection import train_test_split
-
-
-def get_train_val_test_idx(X, Y, seed=None):
-    """ Split in train, validation, test sets.
-
-    Parameters
-    ----------
-    X : np.array
-        Data or indices.
-    Y : np.array
-        Class labels for data in X.
-    seed: int
-        Random seed.
-    Returns
-    -------
-    (X_train, Y_train) : tuple
-        Data X and labels y for the train set
-    (X_val, Y_val) : tuple
-        Data X and labels y for the validation set
-    (X_test, Y_test) : tuple
-        Data X and labels y for the test set
-
-    """
-    X_train, X_val_test, Y_train, Y_val_test = train_test_split(X, Y, train_size=0.6, random_state=seed, stratify=Y)
-    X_val, X_test, Y_val, Y_test = train_test_split(X_val_test, Y_val_test, train_size=0.5, random_state=seed, stratify=Y_val_test)
-    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
-
-
-def subset_labels(Y, N_min=10, N_max=100, seed=None):
-    """ Subset dataset to contain minimum N_min and maximum N_max instances
-    per class. Return indices for this subset.
-
-    Parameters
-    ----------
-    Y : np.array
-        Class labels
-    N_min : int
-        Minimum instances per class
-    N_max : int
-        Maximum instances per class
-    seed: int
-        Random seed.
-
-    Returns
-    -------
-    subset_idx : np.array
-        Indices for a subset with classes of size bounded by N_min, N_max
-
-    """
-    subset_idx = []
-    labels = np.unique(Y)
-    for label in labels:
-        label_idx = np.where(Y==label)[0]
-        counts = len(label_idx)
-        if counts>=N_max:
-            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
-        elif counts>=N_min and counts<N_max:
-            subset_idx.append(label_idx)
-        else:
-            # not enough samples for this class, skip
-            continue
-    if len(subset_idx)>0:
-        subset_idx = np.concatenate(subset_idx, axis=0)
-    return subset_idx
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/util_filter_dataset.py Tue Sep 12 18:03:56 2017 +0100
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 10 17:44:48 2017
+
+@author: mariapanteli
+"""
+
+import os
+import numpy as np
+import pandas as pd
+
+
+def get_speech_vamp(df):
+    jspeech = df.columns.get_loc("Speech")
+    nfiles = len(df)
+    speechinds = []
+    for i in range(nfiles):
+        #print i
+        if os.path.exists(df.iat[i, jspeech]) and os.path.getsize(df.iat[i, jspeech])>0:
+            bounds = pd.read_csv(df.iat[i, jspeech], header=None, delimiter='\t').get_values()
+            if len(bounds)>0:
+                if len(np.where(bounds[:,2]=='m')[0])==0 or len(np.where(bounds[:,2]=='s')[0])==len(bounds):
+                    speechinds.append(i)
+    return speechinds
+
+
+def get_speech_meta(df):
+    genres = np.array(df["Genre_Album"].get_values(), dtype=str)
+    speechinds_genre = []
+    invalid_genres = ["Spoken Word", "Language Instruction", "Classical",
+                      "Poetry", "Nature|Sounds", "Music Instruction",
+                      "Soundtracks &", "Contemporary &", "Jazz &",
+                      "Sounds", "Ragtime", "Nature", "Electronic",
+                      "African American Spoken", "Blues", "Gospel",
+                      "Psychology &"]
+    for i in range(len(genres)):
+        genre = genres[i]
+        #if genre in invalid_genres:
+        if any(x in genre for x in invalid_genres):
+            speechinds_genre.append(i)
+    return speechinds_genre
+
+
+def get_missing_csv(df):
+    nfiles = len(df)
+    missing_csv = []
+    for i in range(nfiles):
+        if not (os.path.exists(df["Melspec"].iloc[i]) and os.path.exists(df["Chroma"].iloc[i]) and os.path.exists(df["Melodia"].iloc[i])):
+            missing_csv.append(i)
+    return missing_csv
+
+
+def get_missing_country_meta(df):
+    nfiles = len(df)
+    missing_country = []
+    country_labels = np.array(df['Country'].get_values(), dtype=str)
+    invalid_countries = ['Unidentified', 'unknown', 'nan',
+                         'Yugoslavia (former)', 'Pathian village Wangulei ',
+                         'Joulouloum either Senegal or The Gambia ']
+    for i in range(nfiles):
+        country = country_labels[i]
+        if country in invalid_countries:
+            missing_country.append(i)
+    return missing_country
+
+
+def remove_missing_data(df):
+    speechinds_vamp = get_speech_vamp(df)
+    speechinds_genre = get_speech_meta(df)
+    speechinds = set(speechinds_vamp) | set(speechinds_genre)
+    missing = set(get_missing_csv(df))
+    missing_country = set(get_missing_country_meta(df))
+    selectinds = np.asarray(list(set(range(len(df))) - (missing | speechinds | missing_country)))
+
+    df = df.iloc[selectinds, :]
+    return df
\ No newline at end of file
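Reviewer note: a sketch of how the new module is consumed, mirroring the call in load_dataset.py above. It assumes the metadata CSV carries, per recording, a Country label, a Genre_Album string, and file paths in the Speech, Melspec, Chroma and Melodia columns; the CSV path is the one used in this changeset:

import pandas as pd
import util_filter_dataset

df = pd.read_csv('data/metadata_BLSM_language_all.csv')
# Drops rows flagged as speech (by the Vamp segmentation in the Speech
# file or by Genre_Album), rows whose Melspec/Chroma/Melodia feature
# files are missing on disk, and rows with unidentified Country labels.
df = util_filter_dataset.remove_missing_data(df)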