comparison scripts/output_metadata.py @ 98:5eba53437755 branch-tests

notebooks for publication
author Maria Panteli <m.x.panteli@gmail.com>
date Tue, 03 Oct 2017 15:55:35 +0100
parents 4aa0763bf8d8
children 192259977b50
comparison
equal deleted inserted replaced
97:68ec8699e32a 98:5eba53437755
1 import numpy as np 1 import numpy as np
2 import pandas as pd 2 import pandas as pd
3 import pickle 3 import pickle
4 import os 4 import os
5
6 %load_ext autoreload
7 %autoreload 2
8 5
9 import sys 6 import sys
10 sys.path.append('../') 7 sys.path.append('../')
11 import scripts.outliers as outliers 8 import scripts.outliers as outliers
12 import scripts.utils as utils 9 import scripts.utils as utils
27 X_list, Y, Yaudio = dataset 24 X_list, Y, Yaudio = dataset
28 X = np.concatenate(X_list, axis=1) 25 X = np.concatenate(X_list, axis=1)
29 26
30 cols_to_keep = ['Country', 'continent', 'REGION', 'LocDetails', 27 cols_to_keep = ['Country', 'continent', 'REGION', 'LocDetails',
31 'Language', 'Language_iso3', 'Culture', 'Genre_Album', 'Year', 'Decade', 28 'Language', 'Language_iso3', 'Culture', 'Genre_Album', 'Year', 'Decade',
32 'songurls_Album', 'Speech', 'Melspec', 'Chroma', 'Melodia'] 29 'songurls_Album', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']
33 cols_rename = ['Country', 'Continent', 'Region', 'Location_details', 30 cols_rename = ['Country', 'Continent', 'Region', 'Location_details',
34 'Language', 'Language_iso3', 'Culture', 'Genre', 'Year', 'Decade', 31 'Language', 'Language_iso3', 'Culture', 'Genre', 'Year', 'Decade',
35 'Url', 'Speech', 'Melspec', 'Chroma', 'Melodia'] 32 'Url', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']
36 col_idx = [] 33 col_idx = []
37 for column in cols_to_keep: 34 for column in cols_to_keep:
38 if column not in ddf.columns: 35 if column not in ddf.columns:
39 print column 36 print column
40 col_idx.append(np.where(ddf.columns==column)[0]) 37 col_idx.append(np.where(ddf.columns==column)[0])
43 ddf_new.columns = cols_rename 40 ddf_new.columns = cols_rename
44 41
45 for i, yy in enumerate(ddf_new['Audio']): 42 for i, yy in enumerate(ddf_new['Audio']):
46 new_name = os.path.split(os.path.split(yy)[0])[-1] + '_' + os.path.split(yy)[-1] 43 new_name = os.path.split(os.path.split(yy)[0])[-1] + '_' + os.path.split(yy)[-1]
47 new_csv_name = new_name.split('.')[0]+'.csv' 44 new_csv_name = new_name.split('.')[0]+'.csv'
48 ddf_new['Audio'].iloc[i] = os.path.join('Audio', new_name) 45 ddf_new.loc[i, 'Audio'] = new_name
49 ddf_new['Chroma'].iloc[i] = os.path.join('Chroma', new_csv_name) 46 ddf_new.loc[i, 'Chroma'] = os.path.join('Chroma', new_csv_name)
50 ddf_new['Melspec'].iloc[i] = os.path.join('Melspec', new_csv_name) 47 ddf_new.loc[i, 'Melspec'] = os.path.join('Melspec', new_csv_name)
51 ddf_new['Melodia'].iloc[i] = os.path.join('Melodia', new_csv_name) 48 ddf_new.loc[i, 'Melodia'] = os.path.join('Melodia', new_csv_name)
52 ddf_new['Speech'].iloc[i] = os.path.join('Speech', new_csv_name) 49 ddf_new.loc[i, 'Speech'] = os.path.join('Speech', new_csv_name)
53 Yaudio[i] = new_name 50 Yaudio[i] = new_name
51
52 print ddf_new.head()
53 print Yaudio[:20]
54 54
55 ddf_new.to_csv('../data/metadata.csv', index=False) 55 ddf_new.to_csv('../data/metadata.csv', index=False)
56 pickle.dump([X_list, Y, Yaudio], open('../data/lda_data_8.pickle', 'wb')) 56 pickle.dump([X_list, Y, Yaudio], open('../data/lda_data_8.pickle', 'wb'))