annotate Code/taste_profile_cleaning.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents a4dfbc53a162
children
rev   line source
p@39 1 #!/usr/bin/env python
p@39 2 # -*- coding: utf-8 -*-
p@39 3 """
p@39 4
p@39 5
p@39 6 @author: paulochiliguano
p@39 7 """
p@39 8
p@39 9 import os
p@39 10 import zipfile
p@39 11 import time
p@39 12 import pandas as pd
p@39 13
p@39 14 # Unzip Taste Profile subset
def unzip_tasteprofile(zippedfile):
    """Extract the Taste Profile triplets file from its zip archive.

    The archive is expected to contain a single member named like the
    archive itself minus its ``.zip`` extension (e.g.
    ``train_triplets.txt.zip`` -> ``train_triplets.txt``).

    :type zippedfile: string
    :param zippedfile: filename of the downloaded subset archive

    :return: name of the extracted (uncompressed) file
    """
    print("Unzipping Taste Profile subset...")
    member, _ = os.path.splitext(zippedfile)
    with zipfile.ZipFile(zippedfile) as archive:
        archive.extract(member)
    return member
p@39 21
def read_songid_mismatches(filename):
    """Collect songIDs from the MSD songID-trackID mismatch list.

    Each line of the file looks like ``ERROR: <songID trackID> ...``;
    the 18-character songID occupies columns 8-25 inclusive, which is
    what the ``line[8:26]`` slice extracts.

    :type filename: string
    :param filename: filename of the downloaded list of mismatches

    :return: set of mismatched songIDs
    """
    print("Reading songID mismatches...")
    # Open read-only: the original 'r+' mode needlessly required write
    # permission on a file that is only ever scanned.
    with open(filename, 'r') as f:
        return {line[8:26] for line in f}
p@39 30
def read_available_songid(filename):
    """Collect songIDs whose audio clip is available from 7Digital.

    Each line of the file starts with an 18-character songID, which is
    what the ``line[0:18]`` slice extracts.

    :type filename: string
    :param filename: filename of the 7Digital availability list

    :return: set of available songIDs
    """
    print("Reading available songIDs...")
    # Open read-only: the original 'r+' mode needlessly required write
    # permission on a file that is only ever scanned.
    with open(filename, 'r') as f:
        return {line[0:18] for line in f}
p@39 38
def delete_triplets(zippedfile='train_triplets.txt.zip',
                    mismatchesfile='sid_mismatches.txt'):
    """
    Delete triplets with songID mismatches and unavailable audio clips from
    7Digital (UK).

    This is applied on the Taste Profile subset. The surviving
    (userId, songId, numPlays) triplets are appended to a compressed
    HDF5 store named after the triplets file, and the large uncompressed
    text file is removed afterwards.

    :type zippedfile: string
    :param zippedfile: filename of the downloaded subset

    :type mismatchesfile: string
    :param mismatchesfile: filename of the downloaded list of mismatches
    """
    tripletsfile = unzip_tasteprofile(zippedfile)
    mismatches = read_songid_mismatches(mismatchesfile)
    print("There are %d songId-trackId mismatches." % len(mismatches))
    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
    print("There are %d audio clips available." % len(availableClips))
    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
    print("Deleting triplets with mismatches and unavailable songs...")
    # chunksize must be a positive integer: guard against an empty
    # mismatch list, which would make 100 * len(mismatches) == 0 and
    # crash pd.read_table.
    chunksize = max(100 * len(mismatches), 1000)
    for chunk in pd.read_table(
        tripletsfile,
        header=None,
        names=['userId', 'songId', 'numPlays'],
        chunksize=chunksize,
    ):
        # Keep only triplets whose song is NOT a known mismatch AND has
        # an audio clip available (single combined boolean mask).
        keep = ~chunk.songId.isin(mismatches) & chunk.songId.isin(availableClips)
        chunk[keep].to_hdf(
            cleanfile,
            'triplets',
            mode='a',
            format='table',
            append=True,
            complevel=9,
            complib='zlib',
            fletcher32=True
        )
    # Delete the large uncompressed text file to reclaim disk space.
    os.remove(tripletsfile)
    print("Triplets without mismatches saved in %s" % cleanfile)
p@39 83
if __name__ == '__main__':
    # Run the cleaning step from the sibling 'dataset' directory of the
    # current working directory, timing the whole operation.
    os.chdir(os.path.join(os.path.dirname(os.getcwd()), 'dataset'))
    t_start = time.time()
    delete_triplets()
    elapsed = time.time() - t_start
    print("Execution time: %.2f minutes" % (elapsed/60))
p@39 94
p@39 95 #a=pd.read_hdf('../train_triplets_clean.h5', 'triplets')
p@39 96
p@39 97 #played_songs = 1000
p@39 98 #df = pd.read_csv(
p@39 99 #filename_out,
p@39 100 #delim_whitespace=False,
p@39 101 #header=None,
p@39 102 #names=['user','song','plays'])
p@39 103 #df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
p@39 104 #df_active.to_pickle('../dataset/CF_dataset.pkl')
p@39 105
p@39 106 #f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
p@39 107 #CF_dataset.pkl', 'wb')
p@39 108 #pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
p@39 109 #f.close()
p@39 110
p@39 111 # Select most frequent songs
p@39 112 #frequent_songs = 1500
p@39 113 #print("Selecting %d frequent songs..." % frequent_songs)
p@39 114 #counts = df_active['song'].value_counts().head(frequent_songs)
p@39 115 #df_active = df_active.loc[df_active['song'].isin(counts.index), :]
p@39 116 #print("Saving Echonest songID list...")
p@39 117 #filename = '../dataset/CF_dataset_songID.txt'
p@39 118 #with open(filename, 'wb') as f:
p@39 119 #for item in counts.index.tolist():
p@39 120 #f.write("%s\n" % item)
p@39 121
p@39 122 #important
p@39 123 #df['user'].value_counts().head(50)
p@39 124
p@39 125 #ddf = df.drop_duplicates(subset = 'song')
p@39 126 #ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
p@39 127 #columns=['song'],
p@39 128 #header=False,
p@39 129 #index=False)