hybrid-music-recommender-using-content-based-and-social-information
view Code/taste_profile_cleaning.py @ 47:b0186d4a4496 tip
Move 7Digital dataset to Downloads
| author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
| --- | --- |
| date | Sat, 09 Jul 2022 00:50:43 -0500 |
| parents | a4dfbc53a162 |
| children | |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: paulochiliguano
"""

import os
import zipfile
import time
import pandas as pd


# Unzip Taste Profile subset
def unzip_tasteprofile(zippedfile):
    print("Unzipping Taste Profile subset...")
    uncompressedFilename = os.path.splitext(zippedfile)[0]
    with zipfile.ZipFile(zippedfile) as myzip:
        myzip.extract(uncompressedFilename)
    return uncompressedFilename


# Read songIDs from Million Song Dataset songID-trackID mismatches
def read_songid_mismatches(filename):
    print("Reading songID mismatches...")
    with open(filename, 'r') as f:
        songIdMismatches = set()
        for line in f:
            songIdMismatches.add(line[8:26])
    return songIdMismatches


# Read songIDs of the audio clips available from 7digital
def read_available_songid(filename):
    print("Reading available songIDs...")
    with open(filename, 'r') as f:
        songIdAvailable = set()
        for line in f:
            songIdAvailable.add(line[0:18])
    return songIdAvailable


def delete_triplets(zippedfile='train_triplets.txt.zip',
                    mismatchesfile='sid_mismatches.txt'):
    """Delete triplets with songID mismatches and with audio clips
    unavailable from 7digital (UK).

    This is applied to the Taste Profile subset.

    :type zippedfile: string
    :param zippedfile: filename of the downloaded subset
    :type mismatchesfile: string
    :param mismatchesfile: filename of the downloaded list of mismatches
    """
    tripletsfile = unzip_tasteprofile(zippedfile)
    mismatches = read_songid_mismatches(mismatchesfile)
    print("There are %d songId-trackId mismatches." % len(mismatches))
    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
    print("There are %d audio clips available." % len(availableClips))
    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
    print("Deleting triplets with mismatches and unavailable songs...")
    for chunk in pd.read_table(
            tripletsfile,
            header=None,
            names=['userId', 'songId', 'numPlays'],
            chunksize=100 * len(mismatches)):
        # Drop triplets whose songID is a known mismatch, then keep only
        # songs with an available audio clip
        chunk = chunk[~chunk.songId.isin(mismatches)]
        chunk = chunk[chunk.songId.isin(availableClips)]
        #chunk.to_csv(filename_out, mode='a', header=False, index=False)
        chunk.to_hdf(
            cleanfile,
            'triplets',
            mode='a',
            format='table',
            append=True,
            complevel=9,
            complib='zlib',
            fletcher32=True)
    # Delete the large text file
    os.remove(tripletsfile)
    print("Triplets without mismatches saved in %s" % cleanfile)


if __name__ == '__main__':
    #if len(sys.argv) < 1:
        #print("Not enough arguments %s" % sys.argv[0])
        #sys.exit()
    dataset_path = os.path.join(os.path.split(os.getcwd())[0], 'dataset')
    os.chdir(dataset_path)
    start_time = time.time()
    delete_triplets()
    elapsed_time = time.time() - start_time
    print("Execution time: %.2f minutes" % (elapsed_time / 60))

#a = pd.read_hdf('../train_triplets_clean.h5', 'triplets')
#played_songs = 1000
#df = pd.read_csv(
#    filename_out,
#    delim_whitespace=False,
#    header=None,
#    names=['user', 'song', 'plays'])
#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
#df_active.to_pickle('../dataset/CF_dataset.pkl')
#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
#CF_dataset.pkl', 'wb')
#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
#f.close()

# Select most frequent songs
#frequent_songs = 1500
#print("Selecting %d frequent songs..." % frequent_songs)
#counts = df_active['song'].value_counts().head(frequent_songs)
#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
#print("Saving Echonest songID list...")
#filename = '../dataset/CF_dataset_songID.txt'
#with open(filename, 'wb') as f:
#    for item in counts.index.tolist():
#        f.write("%s\n" % item)

#important
#df['user'].value_counts().head(50)
#ddf = df.drop_duplicates(subset='song')
#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
#           columns=['song'],
#           header=False,
#           index=False)
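The commented-out block at the end of the file sketches the intended next stage of the pipeline: load the cleaned triplets, keep only active users, then keep only the most frequently played songs. A minimal runnable sketch of that stage follows, assuming the cleaned file is `train_triplets.h5` (the name `delete_triplets` produces when given the default `train_triplets.txt.zip`) with the column names `userId`/`songId` used above, and reusing the thresholds and output filenames from the comments (users with more than 1000 triplets, the 1500 most frequent songs, `CF_dataset.pkl`, `CF_dataset_songID.txt`); paths would need adjusting to the actual dataset layout.

```python
import pandas as pd

# Load the cleaned user-song-plays triplets written by delete_triplets()
df = pd.read_hdf('train_triplets.h5', 'triplets')

# Keep only active users: those with more than 1000 triplets
played_songs = 1000
df_active = df.groupby('userId').filter(lambda x: len(x) > played_songs)

# Keep only the 1500 most frequent songs among the active users
frequent_songs = 1500
counts = df_active['songId'].value_counts().head(frequent_songs)
df_active = df_active[df_active['songId'].isin(counts.index)]

# Persist the collaborative-filtering dataset and the songID list
df_active.to_pickle('CF_dataset.pkl')
with open('CF_dataset_songID.txt', 'w') as f:
    for item in counts.index.tolist():
        f.write("%s\n" % item)
```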