#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Clean the Taste Profile subset of the Million Song Dataset.

@author: paulochiliguano
"""

import os
import zipfile
import time
import pandas as pd


# Unzip Taste Profile subset
def unzip_tasteprofile(zippedfile):
    print("Unzipping Taste Profile subset...")
    uncompressedFilename = os.path.splitext(zippedfile)[0]
    with zipfile.ZipFile(zippedfile) as myzip:
        myzip.extract(uncompressedFilename)
    return uncompressedFilename


# Read songIDs from the Million Song Dataset songID-trackID mismatch list
def read_songid_mismatches(filename):
    print("Reading songID mismatches...")
    songIdMismatches = set()
    with open(filename, 'r') as f:
        for line in f:
            # The 18-character songID starts at column 8 of each mismatch line
            songIdMismatches.add(line[8:26])
    return songIdMismatches


# Read songIDs of the audio clips available from 7digital
def read_available_songid(filename):
    print("Reading available songIDs...")
    songIdAvailable = set()
    with open(filename, 'r') as f:
        for line in f:
            # Each line starts with the 18-character songID
            songIdAvailable.add(line[0:18])
    return songIdAvailable


def delete_triplets(zippedfile='train_triplets.txt.zip',
                    mismatchesfile='sid_mismatches.txt'):
    """
    Delete triplets with songID mismatches and without audio clips available
    from 7digital (UK).

    This is applied to the Taste Profile subset.

    :type zippedfile: string
    :param zippedfile: filename of the downloaded subset

    :type mismatchesfile: string
    :param mismatchesfile: filename of the downloaded list of mismatches
    """
    tripletsfile = unzip_tasteprofile(zippedfile)
    mismatches = read_songid_mismatches(mismatchesfile)
    print("There are %d songId-trackId mismatches." % len(mismatches))
    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
    print("There are %d audio clips available." % len(availableClips))
    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
    print("Deleting triplets with mismatches and unavailable songs...")
    for chunk in pd.read_table(
            tripletsfile,
            header=None,
            names=['userId', 'songId', 'numPlays'],
            chunksize=100 * len(mismatches)):
        # Keep only triplets whose songID is not a known mismatch and whose
        # audio clip is available from 7digital
        chunk = chunk[~chunk.songId.isin(mismatches)]
        chunk = chunk[chunk.songId.isin(availableClips)]
        #chunk.to_csv(filename_out, mode='a', header=False, index=False)
        # Append the filtered chunk to a compressed HDF5 store
        chunk.to_hdf(
            cleanfile,
            'triplets',
            mode='a',
            format='table',
            append=True,
            complevel=9,
            complib='zlib',
            fletcher32=True)
    # Delete the large uncompressed text file
    os.remove(tripletsfile)
    print("Triplets without mismatches saved in %s" % cleanfile)


if __name__ == '__main__':
    #if len(sys.argv) < 1:
    #    print("Not enough arguments %s" % sys.argv[0])
    #    sys.exit()
    dataset_path = os.path.join(os.path.split(os.getcwd())[0], 'dataset')
    os.chdir(dataset_path)
    start_time = time.time()
    delete_triplets()
    elapsed_time = time.time() - start_time
    print("Execution time: %.2f minutes" % (elapsed_time / 60))

#a = pd.read_hdf('../train_triplets_clean.h5', 'triplets')

#played_songs = 1000
#df = pd.read_csv(
#    filename_out,
#    delim_whitespace=False,
#    header=None,
#    names=['user', 'song', 'plays'])
#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
#df_active.to_pickle('../dataset/CF_dataset.pkl')

#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
#CF_dataset.pkl', 'wb')
#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
#f.close()

# Select most frequent songs
#frequent_songs = 1500
#print("Selecting %d frequent songs..." % frequent_songs)
#counts = df_active['song'].value_counts().head(frequent_songs)
#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
#print("Saving Echonest songID list...")
#filename = '../dataset/CF_dataset_songID.txt'
#with open(filename, 'wb') as f:
#    for item in counts.index.tolist():
#        f.write("%s\n" % item)

#important
#df['user'].value_counts().head(50)

#ddf = df.drop_duplicates(subset='song')
#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
#           columns=['song'],
#           header=False,
#           index=False)
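# --- Usage sketch (not part of the original script) ---
# A minimal, hedged example of how the cleaned triplets written by
# delete_triplets() could be read back and filtered for active users,
# tying together the commented-out fragments above. The store name
# 'train_triplets.h5' follows from `cleanfile` above; the threshold of
# 1000 played songs is an assumption taken from the `played_songs`
# fragment, not a confirmed value. Kept commented out, matching the
# scratch-code convention of this file.
#
#def load_active_users(storefile='train_triplets.h5', min_songs=1000):
#    triplets = pd.read_hdf(storefile, 'triplets')
#    # Keep users with more than `min_songs` listened songs
#    return triplets.groupby('userId').filter(lambda x: len(x) > min_songs)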