m@95
|
1 import numpy as np
|
m@95
|
2 import pandas as pd
|
m@95
|
3 import pickle
|
m@95
|
4 import os
|
m@95
|
5
|
m@95
|
6 import sys
|
m@95
|
7 sys.path.append('../')
|
m@95
|
8 import scripts.outliers as outliers
|
m@95
|
9 import scripts.utils as utils
|
m@95
|
10
|
# Input locations: pickled feature data and the metadata table.
DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'
# NOTE: machine-specific absolute path; change it when running elsewhere.
METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'
#METADATA_FILE = '../data/metadata.csv'

dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)

# correct BL urls:
# Rows without a 'BuyLinkTrackDownload' value get a sounds.bl.uk url built
# from their 'Folder' and 'MetaFile' entries (file extension dropped).
# pd.isnull is used instead of np.isnan so the check also works when the
# column has object dtype (np.isnan raises TypeError there).
bl_inds = np.where(pd.isnull(ddf['BuyLinkTrackDownload']))[0]
# Write through a single positional indexer on the frame itself; the chained
# form ddf[col].iloc[row] = ... may assign to a temporary copy and leave ddf
# unchanged.
url_col = ddf.columns.get_loc('songurls_Album')
for bl_ind in bl_inds:
    ddf.iloc[bl_ind, url_col] = ('https://sounds.bl.uk/World-and-traditional-music/' +
                                 ddf['Folder'].iloc[bl_ind] + '/' +
                                 ddf['MetaFile'].iloc[bl_ind].split('.')[0])
m@95
|
23
|
X_list, Y, Yaudio = dataset
# X is not used further down in this script; the concatenation is kept as in
# the original (NOTE(review): presumably a sanity check that the feature
# blocks line up - confirm before removing).
X = np.concatenate(X_list, axis=1)

# Metadata columns to export and, position by position, their output names.
cols_to_keep = ['Country', 'continent', 'REGION', 'LocDetails',
                'Language', 'Language_iso3', 'Culture', 'Genre_Album', 'Year', 'Decade',
                'songurls_Album', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']
cols_rename = ['Country', 'Continent', 'Region', 'Location_details',
               'Language', 'Language_iso3', 'Culture', 'Genre', 'Year', 'Decade',
               'Url', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']

# Fail early and name the missing columns; otherwise the rename below would
# fail with an unrelated length-mismatch error.
missing_cols = [column for column in cols_to_keep if column not in ddf.columns]
if missing_cols:
    raise KeyError('metadata columns not found: ' + ', '.join(missing_cols))

# Explicit copy: ddf_new is modified later on and must not alias ddf.
ddf_new = ddf[cols_to_keep].copy()
ddf_new.columns = cols_rename
m@95
|
41
|
# Replace every audio path by '<parent folder>_<file name>' and point the
# feature columns at '<Feature>/<that name, cut at the first dot>.csv'.
# The new values are collected first and assigned as whole columns, so the
# assignment is positional (row i gets value i) whatever index ddf_new
# carries, and the 'Audio' column is not modified while it is iterated.
audio_names = []
csv_names = []
for i, yy in enumerate(ddf_new['Audio']):
    parent_dir, file_name = os.path.split(yy)
    new_name = os.path.split(parent_dir)[-1] + '_' + file_name
    new_csv_name = new_name.split('.')[0]+'.csv'
    audio_names.append(new_name)
    csv_names.append(new_csv_name)
    # Keep the audio labels stored next to the features in step with the table.
    Yaudio[i] = new_name

ddf_new['Audio'] = audio_names
ddf_new['Chroma'] = [os.path.join('Chroma', csv_name) for csv_name in csv_names]
ddf_new['Melspec'] = [os.path.join('Melspec', csv_name) for csv_name in csv_names]
ddf_new['Melodia'] = [os.path.join('Melodia', csv_name) for csv_name in csv_names]
ddf_new['Speech'] = [os.path.join('Speech', csv_name) for csv_name in csv_names]
m@95
|
51
|
# Quick visual check of the result before writing it out.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(ddf_new.head())
print(Yaudio[:20])

# Write the cleaned metadata table and the features with the renamed audio
# labels. The with-block closes the pickle file even if dumping fails.
ddf_new.to_csv('../data/metadata.csv', index=False)
with open('../data/lda_data_8.pickle', 'wb') as pickle_file:
    pickle.dump([X_list, Y, Yaudio], pickle_file)
m@99
|
57
|
m@99
|
58 #old_pickle_files = ['/import/c4dm-04/mariap/lda_data_melodia_8_30sec.pickle',
|
m@99
|
59 # '/import/c4dm-04/mariap/pca_data_melodia_8_30sec.pickle',
|
m@99
|
60 # '/import/c4dm-04/mariap/nmf_data_melodia_8_30sec.pickle',
|
m@99
|
61 # '/import/c4dm-04/mariap/ssnmf_data_melodia_8_30sec.pickle',
|
m@99
|
62 # '/import/c4dm-04/mariap/train_data_melodia_8_30sec.pickle',
|
m@99
|
63 # '/import/c4dm-04/mariap/val_data_melodia_8_30sec.pickle',
|
m@99
|
64 # '/import/c4dm-04/mariap/test_data_melodia_8_30sec.pickle']
|
m@99
|
65
|
m@99
|
66 #new_pickle_files = ['../data/lda_data_8.pickle',
|
m@99
|
67 # '../data/pca_data_8.pickle',
|
m@99
|
68 # '../data/nmf_data_8.pickle',
|
m@99
|
69 # '../data/ssnmf_data_8.pickle',
|
m@99
|
70 # '../data/train_data_8.pickle',
|
m@99
|
71 # '../data/val_data_8.pickle',
|
m@99
|
72 # '../data/test_data_8.pickle']
|
m@99
|
73
|
m@99
|
74 #for old_pickle_file, new_pickle_file in zip(old_pickle_files, new_pickle_files):
|
m@99
|
75 # X_list, Y, Yaudio = pickle.load(open(old_pickle_file,'rb'))
|
m@99
|
76 # for i, yy in enumerate(Yaudio):
|
m@99
|
77 # new_name = os.path.split(os.path.split(yy)[0])[-1] + '_' + os.path.split(yy)[-1]
|
m@99
|
78 # Yaudio[i] = new_name
|
m@99
|
79 # pickle.dump([X_list, Y, Yaudio], open(new_pickle_file, 'wb')) |