# -*- coding: utf-8 -*-
"""
Created on Wed Mar 15 22:52:57 2017

@author: mariapanteli

Prepare train/validation/test feature sets: load track metadata, subsample
to a bounded number of recordings per country, extract frame-level audio
features, split stratified by country, and optionally pickle the results.
"""

import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import load_features
import util_filter_dataset


# Alternative local sample-dataset paths (kept for reference):
# METADATA_FILE = 'sample_dataset/metadata.csv'
# OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
WIN_SIZE = 8  # second-frame window size (seconds) for feature extraction
METADATA_FILE = '../data/metadata_BLSM_language_all.csv'
OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_' + str(WIN_SIZE) + '.pickle',
                '/import/c4dm-04/mariap/val_data_melodia_' + str(WIN_SIZE) + '.pickle',
                '/import/c4dm-04/mariap/test_data_melodia_' + str(WIN_SIZE) + '.pickle']


def get_train_val_test_idx(X, Y, seed=None):
    """ Split in train, validation, test sets.

    The split is 60% train, 20% validation, 20% test, stratified by the
    class labels Y at both split stages.

    Parameters
    ----------
    X : np.array
        Data or indices.
    Y : np.array
        Class labels for data in X.
    seed : int
        Random seed.

    Returns
    -------
    (X_train, Y_train) : tuple
        Data X and labels y for the train set
    (X_val, Y_val) : tuple
        Data X and labels y for the validation set
    (X_test, Y_test) : tuple
        Data X and labels y for the test set
    """
    # First take 60% for training; the remaining 40% is halved into
    # validation and test, keeping class proportions at each stage.
    X_train, X_val_test, Y_train, Y_val_test = train_test_split(
        X, Y, train_size=0.6, random_state=seed, stratify=Y)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_val_test, Y_val_test, train_size=0.5, random_state=seed,
        stratify=Y_val_test)
    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)


def subset_labels(Y, N_min=10, N_max=100, seed=None):
    """ Subset dataset to contain minimum N_min and maximum N_max instances
    per class. Return indices for this subset.

    Classes with fewer than N_min instances are dropped entirely; classes
    with more than N_max instances are randomly downsampled to N_max.

    Parameters
    ----------
    Y : np.array
        Class labels
    N_min : int
        Minimum instances per class
    N_max : int
        Maximum instances per class
    seed : int
        Random seed.

    Returns
    -------
    subset_idx : np.array
        Indices for a subset with classes of size bounded by N_min, N_max
    """
    np.random.seed(seed=seed)
    subset_idx = []
    labels = np.unique(Y)
    for label in labels:
        label_idx = np.where(Y == label)[0]
        counts = len(label_idx)
        if counts >= N_max:
            # oversized class: keep a random sample of exactly N_max
            subset_idx.append(np.random.choice(label_idx, N_max, replace=False))
        elif counts >= N_min:
            # class within bounds: keep all its instances
            subset_idx.append(label_idx)
        # classes smaller than N_min are skipped (dropped from the subset)
    if len(subset_idx) > 0:
        subset_idx = np.concatenate(subset_idx, axis=0)
    return subset_idx


def extract_features(df, win2sec=8.0):
    """ Extract features from melspec and chroma.

    Parameters
    ----------
    df : pd.DataFrame
        Metadata including class label and path to audio, melspec, chroma
    win2sec : float
        The window size for the second frame decomposition of the features

    Returns
    -------
    X : np.array
        The features for every frame x every audio file in the dataset
    Y : np.array
        The class labels for every frame in the dataset
    Y_audio : np.array
        The audio labels
    """
    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
    frames_rhy, frames_mfcc, frames_chroma, frames_mel, Y_df, Y_audio_df = feat_loader.get_features(df)
    print(frames_rhy.shape, frames_mel.shape, frames_mfcc.shape, frames_chroma.shape)
    # Feature order in X: rhythm, mel, mfcc, chroma (concatenated per frame).
    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
    # .values replaces Series.get_values(), removed in pandas 1.0
    Y = Y_df.values
    Y_audio = Y_audio_df.values
    return X, Y, Y_audio


def sample_dataset(csv_file):
    """ Load data from csv and select min 10 - max 100 recs from each
    country.

    Parameters
    ----------
    csv_file : str
        The path to the csv file containing the metadata (including country) of the tracks.

    Returns
    -------
    df : pd.DataFrame
        The metadata for the selected subset of tracks.
    """
    df = pd.read_csv(csv_file)
    df = util_filter_dataset.remove_missing_data(df)
    # default bounds of subset_labels: min 10 / max 100 recordings per country
    subset_idx = subset_labels(df['Country'].values)
    df = df.iloc[subset_idx, :]
    return df


def features_for_train_test_sets(df, write_output=False):
    """Split in train/val/test sets, extract features and write output files.

    Parameters
    ----------
    df : pd.DataFrame
        The metadata for the selected subset of tracks.
    write_output : boolean
        Whether to write files with the extracted features for train/val/test sets.

    Returns
    -------
    train, val, test : list
        Each a [X, Y, Y_audio] list for the corresponding split.
    """
    # Split row indices (not the data itself) stratified by country.
    X_idx, Y = np.arange(len(df)), df['Country'].values
    train_set, val_set, test_set = get_train_val_test_idx(X_idx, Y)
    # *_set[0] holds the row indices of each split.
    X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
    X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
    X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)

    train = [X_train, Y_train, Y_audio_train]
    val = [X_val, Y_val, Y_audio_val]
    test = [X_test, Y_test, Y_audio_test]
    if write_output:
        with open(OUTPUT_FILES[0], 'wb') as f:
            pickle.dump(train, f)
        with open(OUTPUT_FILES[1], 'wb') as f:
            pickle.dump(val, f)
        with open(OUTPUT_FILES[2], 'wb') as f:
            pickle.dump(test, f)
    return train, val, test


if __name__ == '__main__':
    # load dataset
    df = sample_dataset(csv_file=METADATA_FILE)
    train, val, test = features_for_train_test_sets(df, write_output=True)