view scripts/output_metadata.py @ 105:edd82eb89b4b branch-tests tip

Merge
author Maria Panteli
date Sun, 15 Oct 2017 13:36:59 +0100
parents 192259977b50
children
line wrap: on
line source
import numpy as np
import pandas as pd
import pickle 
import os

import sys
sys.path.append('../')
import scripts.outliers as outliers
import scripts.utils as utils

# Input locations handed to outliers.load_data below.
# DATA_FILE is a relative path, so it is resolved against the current working
# directory when the script is run.
DATA_FILE = '../data/lda_data_melodia_8_30sec.pickle'
# NOTE(review): absolute path on the original author's machine; the script
# cannot run elsewhere without editing this constant.
METADATA_FILE = '/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/MergeBL-Smith/data/metadata_BLSM_language_all.csv'
# Disabled alternative. Note this is the same path the script writes its
# cleaned metadata to at the end, so enabling it would make the script read
# its own output.
#METADATA_FILE = '../data/metadata.csv'

# load_data returns a 3-tuple: `dataset` is unpacked further down into
# (X_list, Y, Yaudio), `ddf` is used as a pandas DataFrame of per-recording
# metadata, and `w_dict` is not used anywhere in the visible script.
dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)

# correct BL urls:
# Rows without a 'BuyLinkTrackDownload' value are the BL recordings; their
# album url is rebuilt from the 'Folder' and 'MetaFile' columns so that it
# points at the sounds.bl.uk page of the recording.
# isnull() is used instead of np.isnan so the test also works when the column
# holds strings (object dtype), where np.isnan raises TypeError.
bl_inds = np.where(ddf['BuyLinkTrackDownload'].isnull().values)[0]
# Write through a single positional indexer. The chained form
# ddf['songurls_Album'].iloc[i] = ... may assign into a temporary copy and
# leave ddf itself unchanged.
url_col = ddf.columns.get_loc('songurls_Album')
for bl_ind in bl_inds:
    ddf.iloc[bl_ind, url_col] = ('https://sounds.bl.uk/World-and-traditional-music/' +
                                 ddf['Folder'].iloc[bl_ind] + '/' +
                                 # keep only the part of the file name before the first '.'
                                 ddf['MetaFile'].iloc[bl_ind].split('.')[0])

# Unpack the loaded dataset triple. Yaudio is overwritten element by element
# later in the script, and [X_list, Y, Yaudio] is re-pickled at the end.
X_list, Y, Yaudio = dataset
# Join the arrays of X_list along axis 1.
# NOTE(review): X is not referenced anywhere else in the visible script.
X = np.concatenate(X_list, axis=1)

# Columns of the raw metadata to export, and the public names they get in the
# exported table. The two lists are matched by position and must stay aligned.
cols_to_keep = ['Country', 'continent', 'REGION', 'LocDetails',
                'Language', 'Language_iso3', 'Culture', 'Genre_Album', 'Year', 'Decade',
                'songurls_Album', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']
cols_rename = ['Country', 'Continent', 'Region', 'Location_details',
               'Language', 'Language_iso3', 'Culture', 'Genre', 'Year', 'Decade',
               'Url', 'Audio', 'Speech', 'Melspec', 'Chroma', 'Melodia']

# Resolve every requested column to its position(s) in ddf, in export order.
col_idx = []
missing_cols = []
for column in cols_to_keep:
    if column not in ddf.columns:
        print(column)
        missing_cols.append(column)
    col_idx.append(np.where(ddf.columns==column)[0])
if missing_cols:
    # Without this check the script still fails, but only at the rename below
    # with a generic length-mismatch ValueError that does not say which
    # column is absent.
    raise ValueError('Metadata is missing required column(s): ' +
                     ', '.join(missing_cols))
col_idx = np.concatenate(col_idx)
# Take an explicit copy so that later edits of ddf_new are made on an
# independent frame rather than on a flagged slice of ddf.
ddf_new = ddf.iloc[:, col_idx].copy()
ddf_new.columns = cols_rename

def _flatten_audio_path(audio_path):
    """Return '<parent directory name>_<file name>' for *audio_path*."""
    parent_dir, file_name = os.path.split(audio_path)
    return os.path.basename(parent_dir) + '_' + file_name


# Replace every audio path by a flat file name, and point the feature columns
# at '<feature>/<flat name>.csv'.
new_names = [_flatten_audio_path(yy) for yy in ddf_new['Audio']]
# NOTE(review): split('.')[0] keeps only the text before the FIRST dot, so a
# dot inside the directory or file stem shortens the csv name; kept as is so
# the names stay identical to the ones produced before.
new_csv_names = [new_name.split('.')[0] + '.csv' for new_name in new_names]

# Assign whole columns by position. Writing cell by cell with
# ddf_new.loc[i, ...] uses the enumerate counter as an index *label*, which
# hits the wrong row (or silently appends a new one) whenever the frame does
# not carry the default 0..n-1 index.
ddf_new['Audio'] = new_names
for feature in ('Chroma', 'Melspec', 'Melodia', 'Speech'):
    ddf_new[feature] = [os.path.join(feature, new_csv_name)
                        for new_csv_name in new_csv_names]

# Keep the audio labels of the dataset in step with the metadata table
# (assumes Yaudio is ordered like the metadata rows -- TODO confirm).
for i, new_name in enumerate(new_names):
    Yaudio[i] = new_name

# Quick visual check of the cleaned table and of the renamed audio labels.
# The parenthesised single-argument form prints the same under Python 2 and 3.
print(ddf_new.head())
print(Yaudio[:20])

# Export the cleaned metadata (without the index column) and re-pickle the
# dataset together with the renamed audio labels.
ddf_new.to_csv('../data/metadata.csv', index=False)
# The with-block guarantees the pickle file is flushed and closed even if
# pickle.dump raises.
with open('../data/lda_data_8.pickle', 'wb') as pickle_file:
    pickle.dump([X_list, Y, Yaudio], pickle_file)

#old_pickle_files = ['/import/c4dm-04/mariap/lda_data_melodia_8_30sec.pickle', 
#                    '/import/c4dm-04/mariap/pca_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/nmf_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/ssnmf_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/train_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/val_data_melodia_8_30sec.pickle',
#                    '/import/c4dm-04/mariap/test_data_melodia_8_30sec.pickle']

#new_pickle_files = ['../data/lda_data_8.pickle', 
#                    '../data/pca_data_8.pickle',
#                    '../data/nmf_data_8.pickle',
#                    '../data/ssnmf_data_8.pickle',
#                    '../data/train_data_8.pickle',
#                    '../data/val_data_8.pickle',
#                    '../data/test_data_8.pickle']

#for old_pickle_file, new_pickle_file in zip(old_pickle_files, new_pickle_files):
#    X_list, Y, Yaudio = pickle.load(open(old_pickle_file,'rb'))
#    for i, yy in enumerate(Yaudio):
#        new_name = os.path.split(os.path.split(yy)[0])[-1] + '_' + os.path.split(yy)[-1]
#        Yaudio[i] = new_name
#    pickle.dump([X_list, Y, Yaudio], open(new_pickle_file, 'wb'))