annotate Code/taste_profile_cleaning.py @ 39:7e5bda3feaa3

Taste Profile cleaning in Python 3.5
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sun, 20 Nov 2016 03:15:25 -0500
parents fafc0b249a73
children 1de207a22c1a
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Clean the Taste Profile subset: remove triplets with songID-trackID
mismatches and triplets whose audio clips are unavailable from 7Digital.

@author: paulochiliguano
"""

import sys
import os
import zipfile
import time
import pandas as pd

# Unzip Taste Profile subset
def unzip_tasteprofile(zippedfile):
    print("Unzipping Taste Profile subset...")
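    # Assume the archive holds a single member named like the zip file
    # minus its .zip extension (e.g. train_triplets.txt.zip contains
    # train_triplets.txt)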
    uncompressedFilename = os.path.splitext(zippedfile)[0]
    with zipfile.ZipFile(zippedfile) as myzip:
        myzip.extract(uncompressedFilename)
    return uncompressedFilename

# Read songIDs from Million Song Dataset songID-trackID mismatches
def read_songid_mismatches(filename):
    print("Reading songID mismatches...")
    with open(filename, 'r') as f:
        songIdMismatches = set()
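        # Each line of sid_mismatches.txt starts "ERROR: <songID trackID>",
        # so the 18-character songID occupies columns 8-25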
        for line in f:
            songIdMismatches.add(line[8:26])
    return songIdMismatches

def read_available_songid(filename):
    print("Reading available songIDs...")
    with open(filename, 'r') as f:
        songIdAvailable = set()
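        # The 18-character songID is the first field on each line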
        for line in f:
            songIdAvailable.add(line[0:18])
    return songIdAvailable

def delete_triplets(zippedfile='train_triplets.txt.zip',
                    mismatchesfile='sid_mismatches.txt'):
p@39 42 """
p@39 43 Delete triplets with songIDs mismatches and unavailable audio clips from
p@39 44 7Digital (UK)
p@39 45
p@39 46 This is applied on Taste Profile subset.
p@39 47
p@39 48 :type zippedfile: string
p@39 49 :param zippedfile: filename of the downloaded subset
p@39 50
p@39 51 :type mismatchesfile: string
p@39 52 :param mismatchesfile: filename of the downloaded list of mismatches
p@39 53
p@39 54 """
    tripletsfile = unzip_tasteprofile(zippedfile)
    mismatches = read_songid_mismatches(mismatchesfile)
    print("There are %d songId-trackId mismatches." % len(mismatches))
    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
    print("There are %d audio clips available." % len(availableClips))
    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
    print("Deleting triplets with mismatches and unavailable songs...")
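    # Stream the tab-separated triplets in chunks so the large text file
    # never has to be loaded into memory at once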
    for chunk in pd.read_table(
            tripletsfile,
            header=None,
            names=['userId', 'songId', 'numPlays'],
            chunksize=100*len(mismatches),
            ):
        chunk = chunk[~chunk.songId.isin(mismatches)]
        chunk = chunk[chunk.songId.isin(availableClips)]
        #chunk.to_csv(filename_out, mode='a', header=False, index=False)
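        # Append each filtered chunk to a single compressed HDF5 table;
        # fletcher32 adds a checksum to every data block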
        chunk.to_hdf(
            cleanfile,
            'triplets',
            mode='a',
            format='table',
            append=True,
            complevel=9,
            complib='zlib',
            fletcher32=True
        )
    # Delete the large text file!
    os.remove(tripletsfile)
    print("Triplets without mismatches saved in %s" % cleanfile)

if __name__ == '__main__':
    #if len(sys.argv) < 1:
        #print("Not enough arguments %s" % sys.argv[0])
        #sys.exit()
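    # Expect a sibling 'dataset' directory (../dataset) holding the zipped
    # subset, the mismatches list and the 7digital clip list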
    dataset_path = os.path.join(os.path.split(os.getcwd())[0], 'dataset')
    os.chdir(dataset_path)
    start_time = time.time()
    delete_triplets()
    elapsed_time = time.time() - start_time
    print("Execution time: %.2f minutes" % (elapsed_time/60))

#a = pd.read_hdf('../train_triplets_clean.h5', 'triplets')

#played_songs = 1000
#df = pd.read_csv(
#    filename_out,
#    delim_whitespace=False,
#    header=None,
#    names=['user', 'song', 'plays'])
#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
#df_active.to_pickle('../dataset/CF_dataset.pkl')

#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
#CF_dataset.pkl', 'wb')
#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
#f.close()

# Select most frequent songs
#frequent_songs = 1500
#print("Selecting %d frequent songs..." % frequent_songs)
#counts = df_active['song'].value_counts().head(frequent_songs)
#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
#print("Saving Echonest songID list...")
#filename = '../dataset/CF_dataset_songID.txt'
#with open(filename, 'wb') as f:
#    for item in counts.index.tolist():
#        f.write("%s\n" % item)

#important
#df['user'].value_counts().head(50)

#ddf = df.drop_duplicates(subset='song')
#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
#    columns=['song'],
#    header=False,
#    index=False)