Mercurial > hg > plosone_underreview
comparison scripts/load_dataset.py @ 15:9847b954c217 branch-tests
added sensitivity experiment
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Tue, 12 Sep 2017 19:03:48 +0100 |
parents | 98718fdd8326 |
children | 65b9330afdd8 |
comparison
equal
deleted
inserted
replaced
14:088b5547e094 | 15:9847b954c217 |
---|---|
15 | 15 |
16 | 16 |
17 #METADATA_FILE = 'sample_dataset/metadata.csv' | 17 #METADATA_FILE = 'sample_dataset/metadata.csv' |
18 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle'] | 18 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle'] |
19 WIN_SIZE = 8 | 19 WIN_SIZE = 8 |
20 METADATA_FILE = 'data/metadata_BLSM_language_all.csv' | 20 METADATA_FILE = '../data/metadata_BLSM_language_all.csv' |
21 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', | 21 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', |
22 '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', | 22 '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', |
23 '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle'] | 23 '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle'] |
24 | 24 |
25 | 25 |
87 subset_idx = np.concatenate(subset_idx, axis=0) | 87 subset_idx = np.concatenate(subset_idx, axis=0) |
88 return subset_idx | 88 return subset_idx |
89 | 89 |
90 | 90 |
91 def extract_features(df, win2sec=8.0): | 91 def extract_features(df, win2sec=8.0): |
92 """Extract features from melspec and chroma. | 92 """ Extract features from melspec and chroma. |
93 | 93 |
94 Parameters | 94 Parameters |
95 ---------- | 95 ---------- |
96 df : pd.DataFrame | 96 df : pd.DataFrame |
97 Metadata including class label and path to audio, melspec, chroma | 97 Metadata including class label and path to audio, melspec, chroma |
114 Y = Y_df.get_values() | 114 Y = Y_df.get_values() |
115 Y_audio = Y_audio_df.get_values() | 115 Y_audio = Y_audio_df.get_values() |
116 return X, Y, Y_audio | 116 return X, Y, Y_audio |
117 | 117 |
118 | 118 |
119 if __name__ == '__main__': | 119 def sample_dataset(csv_file): |
120 # load dataset | 120 """ Load data from csv and select min 10 - max 100 recs from each country. |
121 df = pd.read_csv(METADATA_FILE) | 121 |
122 Parameters | |
123 ---------- | |
124 csv_file : str | |
125 The path to the csv file containing the metadata (including country) of the tracks. | |
126 | |
127 Returns | |
128 ------- | |
129 df : pd.DataFrame | |
130 The metadata for the selected subset of tracks. | |
131 """ | |
132 df = pd.read_csv(csv_file) | |
122 df = util_filter_dataset.remove_missing_data(df) | 133 df = util_filter_dataset.remove_missing_data(df) |
123 subset_idx = subset_labels(df['Country'].get_values()) | 134 subset_idx = subset_labels(df['Country'].get_values()) |
124 df = df.iloc[subset_idx, :] | 135 df = df.iloc[subset_idx, :] |
125 X, Y = np.arange(len(df)), df['Country'].get_values() | 136 return df |
126 | 137 |
127 # split in train, val, test set | 138 |
128 train_set, val_set, test_set = get_train_val_test_idx(X, Y) | 139 def features_for_train_test_sets(df, write_output=False): |
129 | 140 """Split in train/val/test sets, extract features and write output files. |
130 # extract features and write output | 141 |
142 Parameters | |
143 ------- | |
144 df : pd.DataFrame | |
145 The metadata for the selected subset of tracks. | |
146 write_output : boolean | |
147 Whether to write files with the extracted features for train/val/test sets. | |
148 """ | |
149 X_idx, Y = np.arange(len(df)), df['Country'].get_values() | |
150 train_set, val_set, test_set = get_train_val_test_idx(X_idx, Y) | |
131 X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE) | 151 X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE) |
132 with open(OUTPUT_FILES[0], 'wb') as f: | 152 X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE) |
133 pickle.dump([X_train, Y_train, Y_audio_train], f) | |
134 | |
135 X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE) | |
136 with open(OUTPUT_FILES[1], 'wb') as f: | |
137 pickle.dump([X_val, Y_val, Y_audio_val], f) | |
138 | |
139 X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE) | 153 X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE) |
140 with open(OUTPUT_FILES[2], 'wb') as f: | 154 |
141 pickle.dump([X_test, Y_test, Y_audio_test], f) | 155 if write_output: |
156 with open(OUTPUT_FILES[0], 'wb') as f: | |
157 pickle.dump([X_train, Y_train, Y_audio_train], f) | |
158 with open(OUTPUT_FILES[1], 'wb') as f: | |
159 pickle.dump([X_val, Y_val, Y_audio_val], f) | |
160 with open(OUTPUT_FILES[2], 'wb') as f: | |
161 pickle.dump([X_test, Y_test, Y_audio_test], f) | |
142 | 162 |
163 | |
164 if __name__ == '__main__': | |
165 # load dataset | |
166 df = sample_dataset(csv_file=METADATA_FILE) | |
167 features_for_train_test_sets(df, write_output=True) | |
168 |