"""Prepare the public metadata CSV and LDA data pickle.

Steps performed, in order:

1. Load the dataset and its metadata frame via ``outliers.load_data``.
2. Fill in the album URL for rows that have no ``BuyLinkTrackDownload``
   value (the "BL" recordings; presumably British Library, given the
   sounds.bl.uk URL prefix).
3. Keep a fixed subset of metadata columns and rename them.
4. Replace each audio path by ``<parent directory>_<file name>`` and point
   the feature columns at ``<Feature>/<name>.csv``.
5. Write ``../data/metadata.csv`` and ``../data/lda_data_8.pickle``.
"""
import numpy as np
import pandas as pd
import pickle
import os

import sys
sys.path.append('../')
import scripts.outliers as outliers
import scripts.utils as utils

DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'
#METADATA_FILE = '../data/metadata.csv'

BL_URL_PREFIX = 'https://sounds.bl.uk/World-and-traditional-music/'
FEATURE_COLUMNS = ('Chroma', 'Melspec', 'Melodia', 'Speech')


def _flatten_audio_name(audio_path):
    """Return '<parent directory>_<file name>' for the given audio path."""
    parent_dir, file_name = os.path.split(audio_path)
    return os.path.split(parent_dir)[-1] + '_' + file_name


dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)

# correct BL urls:
# pd.isnull also copes with object-dtype columns, where np.isnan raises.
bl_inds = np.where(pd.isnull(ddf['BuyLinkTrackDownload']).values)[0]
url_col = ddf.columns.get_loc('songurls_Album')
for bl_ind in bl_inds:
    # Single positional write: chained `ddf[col].iloc[i] = ...` may update a
    # temporary copy and leave `ddf` untouched.
    ddf.iloc[bl_ind, url_col] = (BL_URL_PREFIX +
                                 ddf['Folder'].iloc[bl_ind] + '/' +
                                 ddf['MetaFile'].iloc[bl_ind].split('.')[0])

X_list, Y, Yaudio = dataset

cols_to_keep = ['Country', 'continent', 'REGION', 'LocDetails',
                'Language', 'Language_iso3', 'Culture', 'Genre_Album', 'Year', 'Decade',
                'songurls_Album', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']
cols_rename = ['Country', 'Continent', 'Region', 'Location_details',
               'Language', 'Language_iso3', 'Culture', 'Genre', 'Year', 'Decade',
               'Url', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']

# Fail early with the offending names rather than with a length mismatch
# when the renamed header is assigned.
missing_cols = [column for column in cols_to_keep if column not in ddf.columns]
if missing_cols:
    raise KeyError('metadata is missing expected columns: %s'
                   % ', '.join(missing_cols))

# Explicit copy so the edits below modify `ddf_new` itself, not a view of `ddf`.
ddf_new = ddf[cols_to_keep].copy()
ddf_new.columns = cols_rename

# Build the new values positionally and assign whole columns: writing through
# `.loc[i, ...]` with an enumerate counter is only correct when the index
# happens to be 0..n-1.
new_names = [_flatten_audio_name(audio_path) for audio_path in ddf_new['Audio']]
# The stem is everything before the FIRST dot of the flattened name; kept
# as-is because the feature file names are assumed to follow the same rule.
new_csv_names = [new_name.split('.')[0] + '.csv' for new_name in new_names]

ddf_new['Audio'] = new_names
for feature in FEATURE_COLUMNS:
    ddf_new[feature] = [os.path.join(feature, csv_name)
                        for csv_name in new_csv_names]

# Assumes Yaudio is ordered like the metadata rows -- TODO confirm.
for i, new_name in enumerate(new_names):
    Yaudio[i] = new_name

print(ddf_new.head())
print(Yaudio[:20])

ddf_new.to_csv('../data/metadata.csv', index=False)
with open('../data/lda_data_8.pickle', 'wb') as pickle_file:
    pickle.dump([X_list, Y, Yaudio], pickle_file)

# Recipe used to apply the same audio renaming to the other pickle files
# (kept for reference, not executed):
#
#old_pickle_files = ['/import/c4dm-04/mariap/lda_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/pca_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/nmf_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/ssnmf_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/train_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/val_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/test_data_melodia_8_30sec.pickle']

#new_pickle_files = ['../data/lda_data_8.pickle',
#                    '../data/pca_data_8.pickle',
#                    '../data/nmf_data_8.pickle',
#                    '../data/ssnmf_data_8.pickle',
#                    '../data/train_data_8.pickle',
#                    '../data/val_data_8.pickle',
#                    '../data/test_data_8.pickle']

#for old_pickle_file, new_pickle_file in zip(old_pickle_files, new_pickle_files):
#    X_list, Y, Yaudio = pickle.load(open(old_pickle_file,'rb'))
#    for i, yy in enumerate(Yaudio):
#        new_name = os.path.split(os.path.split(yy)[0])[-1] + '_' + os.path.split(yy)[-1]
#        Yaudio[i] = new_name
#    pickle.dump([X_list, Y, Yaudio], open(new_pickle_file, 'wb'))