Maria@4
|
1 # -*- coding: utf-8 -*-
|
Maria@4
|
2 """
|
Maria@4
|
3 Created on Wed Mar 15 22:52:57 2017
|
Maria@4
|
4
|
Maria@4
|
5 @author: mariapanteli
|
Maria@4
|
6 """
|
Maria@4
|
7
|
Maria@4
|
8 import numpy as np
|
Maria@4
|
9 import pandas as pd
|
Maria@4
|
10 import pickle
|
m@13
|
11 from sklearn.model_selection import train_test_split
|
Maria@4
|
12
|
Maria@4
|
13 import load_features
|
Maria@4
|
14 import util_filter_dataset
|
Maria@4
|
15
|
Maria@4
|
16
|
Maria@4
|
17 #METADATA_FILE = 'sample_dataset/metadata.csv'
|
Maria@4
|
18 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
|
m@13
|
# Window size passed as ``win2sec`` to extract_features; it is also embedded
# in the names of the output files below.
WIN_SIZE = 8
# CSV file with the track metadata read by sample_dataset.
METADATA_FILE = '../data/metadata_BLSM_language_all.csv'
# Pickle destinations for the train, validation and test sets, in that order.
OUTPUT_FILES = [
    ''.join(('/import/c4dm-04/mariap/train_data_melodia_', str(WIN_SIZE), '.pickle')),
    ''.join(('/import/c4dm-04/mariap/val_data_melodia_', str(WIN_SIZE), '.pickle')),
    ''.join(('/import/c4dm-04/mariap/test_data_melodia_', str(WIN_SIZE), '.pickle')),
]
|
Maria@4
|
24
|
m@13
|
25
|
m@13
|
def get_train_val_test_idx(X, Y, seed=None):
    """ Make a stratified 60/20/20 split into train, validation and test sets.

    Parameters
    ----------
    X : np.array
        Data or indices.
    Y : np.array
        Class labels for data in X.
    seed: int
        Random seed, used as random_state for both splits.

    Returns
    -------
    (X_train, Y_train) : tuple
        Data X and labels y for the train set
    (X_val, Y_val) : tuple
        Data X and labels y for the validation set
    (X_test, Y_test) : tuple
        Data X and labels y for the test set

    """
    # First cut: 60% goes to train, the remaining 40% is held out.
    first_cut = train_test_split(
        X, Y, train_size=0.6, random_state=seed, stratify=Y)
    X_train, X_held_out, Y_train, Y_held_out = first_cut

    # Second cut: the held-out part is halved into validation and test.
    second_cut = train_test_split(
        X_held_out, Y_held_out, train_size=0.5, random_state=seed,
        stratify=Y_held_out)
    X_val, X_test, Y_val, Y_test = second_cut

    train_set = (X_train, Y_train)
    val_set = (X_val, Y_val)
    test_set = (X_test, Y_test)
    return train_set, val_set, test_set
|
m@13
|
50
|
m@13
|
51
|
m@13
|
def subset_labels(Y, N_min=10, N_max=100, seed=None):
    """ Subset dataset to contain minimum N_min and maximum N_max instances
    per class. Return indices for this subset.

    Classes with fewer than N_min instances are dropped. Classes with N_max
    or more instances are randomly down-sampled (without replacement) to
    exactly N_max. All other classes are kept whole.

    Parameters
    ----------
    Y : np.array
        Class labels
    N_min : int
        Minimum instances per class
    N_max : int
        Maximum instances per class
    seed: int
        Random seed. Note that it is applied to numpy's global random state.

    Returns
    -------
    subset_idx : np.array
        Indices for a subset with classes of size bounded by N_min, N_max.
        An empty integer array is returned when no class qualifies.

    """
    # Seeds the global generator (not a local one) so that the indices drawn
    # for a given seed stay the same as in earlier versions of this function.
    np.random.seed(seed=seed)
    # Accept plain sequences too; the elementwise comparison needs an array.
    Y = np.asarray(Y)
    kept = []
    for label in np.unique(Y):
        label_idx = np.where(Y == label)[0]
        counts = len(label_idx)
        if counts >= N_max:
            # too many samples for this class, draw N_max of them
            kept.append(np.random.choice(label_idx, N_max, replace=False))
        elif counts >= N_min:
            kept.append(label_idx)
        # otherwise: not enough samples for this class, skip
    if not kept:
        # Keep the return type consistent with the non-empty case.
        return np.array([], dtype=np.intp)
    return np.concatenate(kept, axis=0)
|
m@13
|
89
|
m@13
|
90
|
Maria@4
|
def extract_features(df, win2sec=8.0):
    """ Load frame-level features for the tracks in df and stack them.

    The feature matrices returned by FeatureLoader.get_features are joined
    column-wise in the order rhythm, mel, mfcc, chroma.

    Parameters
    ----------
    df : pd.DataFrame
        Metadata including class label and path to audio, melspec, chroma
    win2sec : float
        The window size for the second frame decomposition of the features

    Returns
    -------
    X : np.array
        The features for every frame x every audio file in the dataset
    Y : np.array
        The class labels for every frame in the dataset
    Y_audio : np.array
        The audio labels
    """
    feat_loader = load_features.FeatureLoader(win2sec=win2sec)
    (frames_rhy, frames_mfcc, frames_chroma, frames_mel,
     Y_df, Y_audio_df) = feat_loader.get_features(df)
    # A single pre-formatted string prints the same text under Python 2 and 3
    # (the bare print statement is a syntax error on Python 3).
    print('%s %s %s %s' % (frames_rhy.shape, frames_mel.shape,
                           frames_mfcc.shape, frames_chroma.shape))
    X = np.concatenate(
        (frames_rhy, frames_mel, frames_mfcc, frames_chroma), axis=1)
    # .values replaces get_values(), which was removed in pandas 1.0.
    # Assumes Y_df and Y_audio_df are pandas objects - TODO confirm against
    # load_features.
    Y = Y_df.values
    Y_audio = Y_audio_df.values
    return X, Y, Y_audio
|
Maria@4
|
117
|
Maria@4
|
118
|
m@15
|
def sample_dataset(csv_file):
    """ Load data from csv and select min 10 - max 100 recs from each country.

    The bounds are the defaults of subset_labels. Rows are first filtered by
    util_filter_dataset.remove_missing_data.

    Parameters
    ----------
    csv_file : str
        The path to the csv file containing the metadata (including country) of the tracks.

    Returns
    -------
    df : pd.DataFrame
        The metadata for the selected subset of tracks.
    """
    df = pd.read_csv(csv_file)
    df = util_filter_dataset.remove_missing_data(df)
    # .values replaces Series.get_values(), which was removed in pandas 1.0.
    subset_idx = subset_labels(df['Country'].values)
    # subset_idx holds row positions, not index labels, hence iloc.
    df = df.iloc[subset_idx, :]
    return df
|
Maria@4
|
137
|
m@15
|
138
|
m@15
|
def features_for_train_test_sets(df, write_output=False):
    """Split in train/val/test sets, extract features and write output files.

    The split is stratified on the 'Country' column and is made without a
    fixed seed, so it can differ between runs.

    Parameters
    ----------
    df : pd.DataFrame
        The metadata for the selected subset of tracks.
    write_output : boolean
        Whether to write files with the extracted features for train/val/test sets.

    Returns
    -------
    train : list
        [X, Y, Y_audio] as returned by extract_features for the train set.
    val : list
        [X, Y, Y_audio] for the validation set.
    test : list
        [X, Y, Y_audio] for the test set.
    """
    # Split row positions rather than the data itself.
    # .values replaces Series.get_values(), which was removed in pandas 1.0.
    X_idx, Y = np.arange(len(df)), df['Country'].values
    train_set, val_set, test_set = get_train_val_test_idx(X_idx, Y)

    # The first element of each split tuple holds the row positions.
    train = list(extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE))
    val = list(extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE))
    test = list(extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE))

    if write_output:
        # OUTPUT_FILES is ordered train, val, test.
        for path, dataset in zip(OUTPUT_FILES, (train, val, test)):
            with open(path, 'wb') as f:
                pickle.dump(dataset, f)
    return train, val, test
|
Maria@4
|
166
|
m@15
|
167
|
m@15
|
if __name__ == '__main__':
    # Sample the tracks listed in the metadata file, then extract features
    # for each split and pickle them to OUTPUT_FILES.
    df = sample_dataset(METADATA_FILE)
    train, val, test = features_for_train_test_sets(df, write_output=True)
|
m@15
|
172
|