comparison scripts/load_dataset.py @ 15:9847b954c217 branch-tests

added sensitivity experiment
author Maria Panteli <m.x.panteli@gmail.com>
date Tue, 12 Sep 2017 19:03:48 +0100
parents 98718fdd8326
children 65b9330afdd8
comparison
equal deleted inserted replaced
14:088b5547e094 15:9847b954c217
15 15
16 16
17 #METADATA_FILE = 'sample_dataset/metadata.csv' 17 #METADATA_FILE = 'sample_dataset/metadata.csv'
18 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle'] 18 #OUTPUT_FILES = ['sample_dataset/train_data.pickle', 'sample_dataset/val_data.pickle', 'sample_dataset/test_data.pickle']
19 WIN_SIZE = 8 19 WIN_SIZE = 8
20 METADATA_FILE = 'data/metadata_BLSM_language_all.csv' 20 METADATA_FILE = '../data/metadata_BLSM_language_all.csv'
21 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle', 21 OUTPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_'+str(WIN_SIZE)+'.pickle',
22 '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle', 22 '/import/c4dm-04/mariap/val_data_melodia_'+str(WIN_SIZE)+'.pickle',
23 '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle'] 23 '/import/c4dm-04/mariap/test_data_melodia_'+str(WIN_SIZE)+'.pickle']
24 24
25 25
87 subset_idx = np.concatenate(subset_idx, axis=0) 87 subset_idx = np.concatenate(subset_idx, axis=0)
88 return subset_idx 88 return subset_idx
89 89
90 90
91 def extract_features(df, win2sec=8.0): 91 def extract_features(df, win2sec=8.0):
92 """Extract features from melspec and chroma. 92 """ Extract features from melspec and chroma.
93 93
94 Parameters 94 Parameters
95 ---------- 95 ----------
96 df : pd.DataFrame 96 df : pd.DataFrame
97 Metadata including class label and path to audio, melspec, chroma 97 Metadata including class label and path to audio, melspec, chroma
114 Y = Y_df.get_values() 114 Y = Y_df.get_values()
115 Y_audio = Y_audio_df.get_values() 115 Y_audio = Y_audio_df.get_values()
116 return X, Y, Y_audio 116 return X, Y, Y_audio
117 117
118 118
119 if __name__ == '__main__': 119 def sample_dataset(csv_file):
120 # load dataset 120 """ Load data from csv and select a minimum of 10 and a maximum of 100 recordings from each country.
121 df = pd.read_csv(METADATA_FILE) 121
122 Parameters
123 ----------
124 csv_file : str
125 The path to the csv file containing the metadata (including country) of the tracks.
126
127 Returns
128 -------
129 df : pd.DataFrame
130 The metadata for the selected subset of tracks.
131 """
132 df = pd.read_csv(csv_file)
122 df = util_filter_dataset.remove_missing_data(df) 133 df = util_filter_dataset.remove_missing_data(df)
123 subset_idx = subset_labels(df['Country'].get_values()) 134 subset_idx = subset_labels(df['Country'].get_values())
124 df = df.iloc[subset_idx, :] 135 df = df.iloc[subset_idx, :]
125 X, Y = np.arange(len(df)), df['Country'].get_values() 136 return df
126 137
127 # split in train, val, test set 138
128 train_set, val_set, test_set = get_train_val_test_idx(X, Y) 139 def features_for_train_test_sets(df, write_output=False):
129 140 """Split in train/val/test sets, extract features and optionally write output files.
130 # extract features and write output 141
142 Parameters
143 ----------
144 df : pd.DataFrame
145 The metadata for the selected subset of tracks.
146 write_output : boolean
147 Whether to write files with the extracted features for train/val/test sets.
148 """
149 X_idx, Y = np.arange(len(df)), df['Country'].get_values()
150 train_set, val_set, test_set = get_train_val_test_idx(X_idx, Y)
131 X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE) 151 X_train, Y_train, Y_audio_train = extract_features(df.iloc[train_set[0], :], win2sec=WIN_SIZE)
132 with open(OUTPUT_FILES[0], 'wb') as f: 152 X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
133 pickle.dump([X_train, Y_train, Y_audio_train], f)
134
135 X_val, Y_val, Y_audio_val = extract_features(df.iloc[val_set[0], :], win2sec=WIN_SIZE)
136 with open(OUTPUT_FILES[1], 'wb') as f:
137 pickle.dump([X_val, Y_val, Y_audio_val], f)
138
139 X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE) 153 X_test, Y_test, Y_audio_test = extract_features(df.iloc[test_set[0], :], win2sec=WIN_SIZE)
140 with open(OUTPUT_FILES[2], 'wb') as f: 154
141 pickle.dump([X_test, Y_test, Y_audio_test], f) 155 if write_output:
156 with open(OUTPUT_FILES[0], 'wb') as f:
157 pickle.dump([X_train, Y_train, Y_audio_train], f)
158 with open(OUTPUT_FILES[1], 'wb') as f:
159 pickle.dump([X_val, Y_val, Y_audio_val], f)
160 with open(OUTPUT_FILES[2], 'wb') as f:
161 pickle.dump([X_test, Y_test, Y_audio_test], f)
142 162
163
164 if __name__ == '__main__':
165 # load dataset
166 df = sample_dataset(csv_file=METADATA_FILE)
167 features_for_train_test_sets(df, write_output=True)
168