Mercurial > hg > plosone_underreview
comparison scripts/load_dataset.py @ 4:e50c63cf96be branch-tests
rearranging folders
author | Maria Panteli |
---|---|
date | Mon, 11 Sep 2017 11:51:50 +0100 |
parents | |
children | 98718fdd8326 |
comparison
equal
deleted
inserted
replaced
3:230a0cf17de0 | 4:e50c63cf96be |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Created on Wed Mar 15 22:52:57 2017 | |
4 | |
5 @author: mariapanteli | |
6 """ | |
7 | |
8 import numpy as np | |
9 import pandas as pd | |
10 import pickle | |
11 | |
12 import load_features | |
13 import util_dataset | |
14 import util_filter_dataset | |
15 | |
16 | |
# Earlier configurations, kept for reference:
#METADATA_FILE = 'sample_dataset/metadata.csv'
#OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf.pickle', '/import/c4dm-04/mariap/val_data_cf.pickle', '/import/c4dm-04/mariap/test_data_cf.pickle']
#OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_cf_4.pickle', '/import/c4dm-04/mariap/val_data_cf_4.pickle', '/import/c4dm-04/mariap/test_data_cf_4.pickle']

# Window size handed to extract_features() as `win2sec`; it is also embedded
# in the output file names so runs with different windows do not collide.
WIN_SIZE = 2

# CSV with one row per recording; read with pd.read_csv in the main script.
METADATA_FILE = 'data/metadata_BLSM_language_all.csv'

# Pickle destinations, in the order [train, validation, test].
_OUTPUT_DIR = '/import/c4dm-04/mariap/'
_OUTPUT_SUFFIX = str(WIN_SIZE) + '.pickle'
OUTPUT_FILES = [
    _OUTPUT_DIR + 'train_data_melodia_' + _OUTPUT_SUFFIX,
    _OUTPUT_DIR + 'val_data_melodia_' + _OUTPUT_SUFFIX,
    _OUTPUT_DIR + 'test_data_melodia_' + _OUTPUT_SUFFIX,
]
26 | |
def extract_features(df, win2sec=8.0):
    """Load frame-level features for the recordings listed in `df`.

    The rhythm, mel, MFCC and chroma frames returned by
    `load_features.FeatureLoader.get_features` are concatenated column-wise
    (in that order) into a single feature matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Metadata including class label and path to audio, melspec, chroma
    win2sec : float
        The window size for the second frame decomposition of the features;
        forwarded unchanged to `load_features.FeatureLoader`.

    Returns
    -------
    X : np.array
        The features for every frame x every audio file in the dataset
    Y : np.array
        The class labels for every frame in the dataset
    Y_audio : np.array
        The audio labels
    """
    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
    (frames_rhy, frames_mfcc, frames_chroma, frames_mel,
     Y_df, Y_audio_df) = feat_loader.get_features(df)

    # %-formatting keeps this line valid (and its output unchanged) under both
    # the Python 2 print statement and the Python 3 print function.
    print('%s %s %s %s' % (frames_rhy.shape, frames_mel.shape,
                           frames_mfcc.shape, frames_chroma.shape))

    # All four blocks must share the same number of rows (frames).
    X = np.concatenate((frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)

    # np.asarray replaces .get_values(), which was deprecated in pandas 0.25
    # and removed in pandas 1.0; it works on every pandas version.
    Y = np.asarray(Y_df)
    Y_audio = np.asarray(Y_audio_df)
    return X, Y, Y_audio
53 | |
54 | |
if __name__ == '__main__':
    # load dataset
    df = pd.read_csv(METADATA_FILE)
    df = util_filter_dataset.remove_missing_data(df)
    # np.asarray replaces .get_values(), which was deprecated in pandas 0.25
    # and removed in pandas 1.0.
    subset_idx = util_dataset.subset_labels(np.asarray(df['Country']))
    df = df.iloc[subset_idx, :]
    # X holds positional row indices into df (not features), so the split
    # below returns positions usable with df.iloc.
    X, Y = np.arange(len(df)), np.asarray(df['Country'])

    # split in train, val, test set
    train_set, val_set, test_set = util_dataset.get_train_val_test_idx(X, Y)

    # extract features and write output; OUTPUT_FILES is ordered
    # [train, val, test], matching the tuple of splits.
    for split, out_file in zip((train_set, val_set, test_set), OUTPUT_FILES):
        # split[0] is used as the row positions of this subset
        # (presumably split[1] holds the matching labels -- verify against
        # util_dataset.get_train_val_test_idx).
        X_split, Y_split, Y_audio_split = extract_features(df.iloc[split[0], :],
                                                           win2sec=WIN_SIZE)
        with open(out_file, 'wb') as f:
            pickle.dump([X_split, Y_split, Y_audio_split], f)
78 | |
79 #out_file = '/import/c4dm-04/mariap/test_data_melodia_1_test.pickle' | |
80 # pickle.dump([X_test, Y_test, Y_audio_test], f) | |
81 #with open(out_file, 'wb') as f: |