hybrid-music-recommender-using-content-based-and-social-information
changeset 39:7e5bda3feaa3
Taste Profile cleaning in Python 3.5
| | |
|---|---|
| author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
| date | Sun, 20 Nov 2016 03:15:25 -0500 |
| parents | f6d568782b49 |
| children | ba2a7e3eb418 |
| files | Code/taste_profile_cleaning.py |
| diffstat | 1 files changed, 130 insertions(+), 112 deletions(-) |
```diff
--- a/Code/taste_profile_cleaning.py	Thu Oct 20 00:43:25 2016 -0500
+++ b/Code/taste_profile_cleaning.py	Sun Nov 20 03:15:25 2016 -0500
@@ -1,112 +1,130 @@
-
-
-
-
-
-
-
-
-import time
-import pandas as pd
-#import cPickle as pickle
-
-# Read songIDs from Million Song Dataset songID-trackID mismatches
-start_time = time.time()
-print 'Reading songID mismatches...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-sid_mismatches.txt'
-with open(filename, 'rb') as f:
-    mismatches = set()
-    next = f.readline()
-    while next != "":
-        songID = next[8:26]
-        mismatches.add(songID)
-        #print(next[8:26])
-        next = f.readline()
-
-# Delete rows with songIDs mismatches from Taste Profile Subset
-print 'Reading Taste Profile subset...'
-result = pd.DataFrame()
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-/train_triplets.txt'
-filename_out = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-/train_triplets_wo_mismatches.csv'
-for chunk in pd.read_csv(
-        filename,
-        low_memory=False,
-        delim_whitespace=True,
-        chunksize=20000,
-        names=['user', 'song', 'plays'],
-        header=None):
-    chunk = chunk[~chunk.song.isin(mismatches)]
-    chunk.to_csv(filename_out, mode='a', header=False, index=False)
-    #result = result.append(chunk, ignore_index=True)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
-#result.to_pickle('/homes/pchilguano/dataset/train_triplets_wo_mismatch.pkl')
-
-# Select most active users
-start_time = time.time()
-played_songs = 1000
-print 'Reading (filtered) Taste Profile subset...'
-df = pd.read_csv(
-    filename_out,
-    delim_whitespace=False,
-    header=None,
-    names=['user','song','plays'])
-print 'Selecting most active users (> %d ratings)...' % played_songs
-df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
-
-print 'Saving user-item matrix as dataframe...'
-df_active.to_pickle('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl')
-
-'''f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl', 'wb')
-pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
-f.close()'''
-
-# Select most frequent songs
-frequent_songs = 1500
-print 'Selecting %d frequent songs...' % frequent_songs
-counts = df_active['song'].value_counts().head(frequent_songs)
-#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
-print 'Saving Echonest songID list...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset_songID.txt'
-with open(filename, 'wb') as f:
-    for item in counts.index.tolist():
-        f.write("%s\n" % item)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
-
-'''
-#important
-#df['user'].value_counts().head(50)
-
-ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',columns=['song'], header=False, index=False)
-
-
-
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
-
-# Save Taste Profile dataset without SongID mismatches
-mdf = df[df.song.isin(l3)]
-mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', header=False, index=False)
-
-result = pd.DataFrame()
-for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory = False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None):
-    chunk = chunk[chunk.song.isin(l3)]
-    result = result.append(chunk.pivot(index='user', columns='song', values='plays')
-        , ignore_index=True)
-    print (result.shape)
-'''
\ No newline at end of file
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+
+
+@author: paulochiliguano
+"""
+
+import sys
+import os
+import zipfile
+import time
+import pandas as pd
+
+# Unzip Taste Profile subset
+def unzip_tasteprofile(zippedfile):
+    print("Unzipping Taste Profile subset...")
+    uncompressedFilename = os.path.splitext(zippedfile)[0]
+    with zipfile.ZipFile(zippedfile) as myzip:
+        myzip.extract(uncompressedFilename)
+    return uncompressedFilename
+
+# Read songIDs from Million Song Dataset songID-trackID mismatches
+def read_songid_mismatches(filename):
+    print("Reading songID mismatches...")
+    with open(filename, 'r+') as f:
+        songIdMismatches = set()
+        for line in f:
+            songIdMismatches.add(line[8:26])
+    return songIdMismatches
+
+def read_available_songid(filename):
+    print("Reading available songIDs...")
+    with open(filename, 'r+') as f:
+        songIdAvailable = set()
+        for line in f:
+            songIdAvailable.add(line[0:18])
+    return songIdAvailable
+
+def delete_triplets(zippedfile='train_triplets.txt.zip',
+                    mismatchesfile='sid_mismatches.txt'):
+    """
+    Delete triplets with songIDs mismatches and unavailable audio clips from
+    7Digital (UK)
+
+    This is applied on Taste Profile subset.
+
+    :type zippedfile: string
+    :param zippedfile: filename of the downloaded subset
+
+    :type mismatchesfile: string
+    :param mismatchesfile: filename of the downloaded list of mismatches
+
+    """
+    tripletsfile = unzip_tasteprofile(zippedfile)
+    mismatches = read_songid_mismatches(mismatchesfile)
+    print("There are %d songId-trackId mismatches." % len(mismatches))
+    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
+    print("There are %d audio clips available."
+          % len(availableClips))
+    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
+    print("Deleting triplets with mismatches and unavailable songs...")
+    for chunk in pd.read_table(
+            tripletsfile,
+            header=None,
+            names=['userId', 'songId', 'numPlays'],
+            chunksize=100*len(mismatches),
+            ):
+        chunk = chunk[~chunk.songId.isin(mismatches)]
+        chunk = chunk[chunk.songId.isin(availableClips)]
+        #chunk.to_csv(filename_out, mode='a', header=False, index=False)
+        chunk.to_hdf(
+            cleanfile,
+            'triplets',
+            mode='a',
+            format='table',
+            append=True,
+            complevel=9,
+            complib='zlib',
+            fletcher32=True
+        )
+    # Delete the large text file!
+    os.remove(tripletsfile)
+    print("Triplets without mismatches saved in %s" % cleanfile)
+
+if __name__ == '__main__':
+    #if len(sys.argv) < 1:
+        #print("Not enough arguments %s" % sys.argv[0])
+        #sys.exit()
+    dataset_path = os.path.join(os.path.split(os.getcwd())[0], 'dataset')
+    os.chdir(dataset_path)
+    start_time = time.time()
+    delete_triplets()
+    elapsed_time = time.time() - start_time
+    print("Execution time: %.2f minutes" % (elapsed_time/60))
+
+#a=pd.read_hdf('../train_triplets_clean.h5', 'triplets')
+
+#played_songs = 1000
+#df = pd.read_csv(
+    #filename_out,
+    #delim_whitespace=False,
+    #header=None,
+    #names=['user','song','plays'])
+#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
+#df_active.to_pickle('../dataset/CF_dataset.pkl')
+
+#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+#CF_dataset.pkl', 'wb')
+#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
+#f.close()
+
+# Select most frequent songs
+#frequent_songs = 1500
+#print("Selecting %d frequent songs..." % frequent_songs)
+#counts = df_active['song'].value_counts().head(frequent_songs)
+#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
+#print("Saving Echonest songID list...")
+#filename = '../dataset/CF_dataset_songID.txt'
+#with open(filename, 'wb') as f:
+    #for item in counts.index.tolist():
+        #f.write("%s\n" % item)
+
+#important
+#df['user'].value_counts().head(50)
+
+#ddf = df.drop_duplicates(subset = 'song')
+#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
+    #columns=['song'],
+    #header=False,
+    #index=False)
\ No newline at end of file
```
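The two new reader helpers rely on fixed column offsets rather than real parsing: `read_songid_mismatches` slices `line[8:26]` because each line of the published `sid_mismatches.txt` begins with the 8-character prefix `ERROR: <` followed by an 18-character Echo Nest songID, and `read_available_songid` takes the first 18 characters of each line. A minimal sketch of that assumption (the sample line is illustrative, not taken from the dataset):

```python
# Illustrative line in the sid_mismatches.txt layout: "ERROR: <songID trackID> ..."
sample = "ERROR: <SOUMNSI12AB0182807 TRMMGKQ128F9325E10> artist - title mismatch"

song_id = sample[8:26]  # skip the 8-char 'ERROR: <' prefix, keep the 18-char songID
assert song_id == "SOUMNSI12AB0182807"
```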
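The commented-out tail of the new revision hints at the next stage of the pipeline: reading the cleaned triplets back and keeping only the most active users and most frequent songs. A hedged Python 3 port of that logic, assuming `delete_triplets()` has produced `train_triplets_clean.h5` under the key `'triplets'` (path and key taken from the `#a=pd.read_hdf(...)` comment; the thresholds of 1000 plays per user and 1500 songs come from the old revision):

```python
import pandas as pd

# Load the cleaned user-song-plays triplets written by delete_triplets().
df = pd.read_hdf('../train_triplets_clean.h5', 'triplets')

# Keep users with more than 1000 listening events (old revision's threshold).
active = df.groupby('userId').filter(lambda x: len(x) > 1000)

# Keep the 1500 most frequently played songs (old revision's threshold).
top_songs = active['songId'].value_counts().head(1500)
active = active[active['songId'].isin(top_songs.index)]

print(active.shape)
```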