comparison Code/taste_profile_cleaning.py @ 39:7e5bda3feaa3

Taste Profile cleaning in Python 3.5
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 20 Nov 2016 03:15:25 -0500
parents fafc0b249a73
children 1de207a22c1a
diff -r f6d568782b49 -r 7e5bda3feaa3 Code/taste_profile_cleaning.py
--- a/Code/taste_profile_cleaning.py
+++ b/Code/taste_profile_cleaning.py
@@ -1,112 +1,130 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
 
 
+@author: paulochiliguano
+"""
 
 import sys
 import os
 import zipfile
-
-
 import time
 import pandas as pd
-#import cPickle as pickle
 
+# Unzip Taste Profile subset
+def unzip_tasteprofile(zippedfile):
+    print("Unzipping Taste Profile subset...")
+    uncompressedFilename = os.path.splitext(zippedfile)[0]
+    with zipfile.ZipFile(zippedfile) as myzip:
+        myzip.extract(uncompressedFilename)
+    return uncompressedFilename
+
 # Read songIDs from Million Song Dataset songID-trackID mismatches
-start_time = time.time()
-print 'Reading songID mismatches...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-sid_mismatches.txt'
-with open(filename, 'rb') as f:
-    mismatches = set()
-    next = f.readline()
-    while next != "":
-        songID = next[8:26]
-        mismatches.add(songID)
-        #print(next[8:26])
-        next = f.readline()
+def read_songid_mismatches(filename):
+    print("Reading songID mismatches...")
+    with open(filename, 'r+') as f:
+        songIdMismatches = set()
+        for line in f:
+            songIdMismatches.add(line[8:26])
+    return songIdMismatches
 
-# Delete rows with songIDs mismatches from Taste Profile Subset
-print 'Reading Taste Profile subset...'
-result = pd.DataFrame()
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-/train_triplets.txt'
-filename_out = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-/train_triplets_wo_mismatches.csv'
-for chunk in pd.read_csv(
-        filename,
-        low_memory=False,
-        delim_whitespace=True,
-        chunksize=20000,
-        names=['user', 'song', 'plays'],
-        header=None):
-    chunk = chunk[~chunk.song.isin(mismatches)]
-    chunk.to_csv(filename_out, mode='a', header=False, index=False)
-    #result = result.append(chunk, ignore_index=True)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
-#result.to_pickle('/homes/pchilguano/dataset/train_triplets_wo_mismatch.pkl')
+def read_available_songid(filename):
+    print("Reading available songIDs...")
+    with open(filename, 'r+') as f:
+        songIdAvailable = set()
+        for line in f:
+            songIdAvailable.add(line[0:18])
+    return songIdAvailable
 
-# Select most active users
-start_time = time.time()
-played_songs = 1000
-print 'Reading (filtered) Taste Profile subset...'
-df = pd.read_csv(
-    filename_out,
-    delim_whitespace=False,
-    header=None,
-    names=['user','song','plays'])
-print 'Selecting most active users (> %d ratings)...' % played_songs
-df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
+def delete_triplets(zippedfile='train_triplets.txt.zip',
+                    mismatchesfile='sid_mismatches.txt'):
+    """
+    Delete triplets with songIDs mismatches and unavailable audio clips from
+    7Digital (UK)
+
+    This is applied on Taste Profile subset.
+
+    :type zippedfile: string
+    :param zippedfile: filename of the downloaded subset
+
+    :type mismatchesfile: string
+    :param mismatchesfile: filename of the downloaded list of mismatches
+
+    """
+    tripletsfile = unzip_tasteprofile(zippedfile)
+    mismatches = read_songid_mismatches(mismatchesfile)
+    print("There are %d songId-trackId mismatches." % len(mismatches))
+    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
+    print("There are %d audio clips available." % len(availableClips))
+    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
+    print("Deleting triplets with mismatches and unavailable songs...")
+    for chunk in pd.read_table(
+            tripletsfile,
+            header=None,
+            names=['userId', 'songId', 'numPlays'],
+            chunksize=100*len(mismatches),
+            ):
+        chunk = chunk[~chunk.songId.isin(mismatches)]
+        chunk = chunk[chunk.songId.isin(availableClips)]
+        #chunk.to_csv(filename_out, mode='a', header=False, index=False)
+        chunk.to_hdf(
+            cleanfile,
+            'triplets',
+            mode='a',
+            format='table',
+            append=True,
+            complevel=9,
+            complib='zlib',
+            fletcher32=True
+        )
+    # Delete the large text file!
+    os.remove(tripletsfile)
+    print("Triplets without mismatches saved in %s" % cleanfile)
 
-print 'Saving user-item matrix as dataframe...'
-df_active.to_pickle('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl')
+if __name__ == '__main__':
+    #if len(sys.argv) < 1:
+        #print("Not enough arguments %s" % sys.argv[0])
+        #sys.exit()
+    dataset_path = os.path.join(os.path.split(os.getcwd())[0],'dataset')
+    os.chdir(dataset_path)
+    start_time = time.time()
+    delete_triplets()
+    elapsed_time = time.time() - start_time
+    print("Execution time: %.2f minutes" % (elapsed_time/60))
 
-'''f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl', 'wb')
-pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
-f.close()'''
+#a=pd.read_hdf('../train_triplets_clean.h5', 'triplets')
+
+#played_songs = 1000
+#df = pd.read_csv(
+    #filename_out,
+    #delim_whitespace=False,
+    #header=None,
+    #names=['user','song','plays'])
+#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
+#df_active.to_pickle('../dataset/CF_dataset.pkl')
+
+#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+#CF_dataset.pkl', 'wb')
+#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
+#f.close()
 
 # Select most frequent songs
-frequent_songs = 1500
-print 'Selecting %d frequent songs...' % frequent_songs
-counts = df_active['song'].value_counts().head(frequent_songs)
-#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
-print 'Saving Echonest songID list...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset_songID.txt'
-with open(filename, 'wb') as f:
-    for item in counts.index.tolist():
-        f.write("%s\n" % item)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
+#frequent_songs = 1500
+#print("Selecting %d frequent songs..." % frequent_songs)
+#counts = df_active['song'].value_counts().head(frequent_songs)
+#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
+#print("Saving Echonest songID list...")
+#filename = '../dataset/CF_dataset_songID.txt'
+#with open(filename, 'wb') as f:
+    #for item in counts.index.tolist():
+        #f.write("%s\n" % item)
 
-'''
 #important
 #df['user'].value_counts().head(50)
 
-ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',columns=['song'], header=False, index=False)
-
-
-
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
-
-# Save Taste Profile dataset without SongID mismatches
-mdf = df[df.song.isin(l3)]
-mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', header=False, index=False)
-
-result = pd.DataFrame()
-for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory = False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None):
-    chunk = chunk[chunk.song.isin(l3)]
-    result = result.append(chunk.pivot(index='user', columns='song', values='plays')
-    , ignore_index=True)
-print (result.shape)
-'''
+#ddf = df.drop_duplicates(subset = 'song')
+#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
+            #columns=['song'],
            #header=False,
            #index=False)
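
Note on the fixed-width slices in the new revision: read_songid_mismatches() takes line[8:26] from each line of sid_mismatches.txt, which matches the published Million Song Dataset mismatch format, where an 18-character Echo Nest song ID follows the 8-character "ERROR: <" prefix; read_available_songid() likewise assumes each line of the 7digital list begins with a song ID. A minimal sketch with a made-up line (the exact file contents are an assumption here):

    # One line in the expected sid_mismatches.txt layout:
    # indices 0-7 hold "ERROR: <", indices 8-25 hold the 18-char song ID.
    line = "ERROR: <SOAAAAA12AB0181234 TRAAAAA128F9301234> artist - title"
    song_id = line[8:26]  # -> "SOAAAAA12AB0181234"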
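The cleaned triplets are appended to an HDF5 table, so they can be loaded back with pandas for the user-filtering step that is still commented out at the bottom of the file. A minimal sketch, assuming delete_triplets() ran with its defaults (it derives the store name train_triplets.h5 from the unzipped filename) and reusing the played_songs = 1000 threshold from the commented-out code:

    import pandas as pd

    # Load the cleaned (userId, songId, numPlays) triplets written by delete_triplets().
    triplets = pd.read_hdf('train_triplets.h5', 'triplets')

    # Keep only the most active users (> 1000 triplets each), mirroring the
    # commented-out groupby/filter step in the script.
    df_active = triplets.groupby('userId').filter(lambda x: len(x) > 1000)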