changeset 39:7e5bda3feaa3

Taste Profile cleaning in Python 3.5
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sun, 20 Nov 2016 03:15:25 -0500
parents f6d568782b49
children ba2a7e3eb418
files Code/taste_profile_cleaning.py
diffstat 1 files changed, 130 insertions(+), 112 deletions(-) [+]
line wrap: on
line diff
--- a/Code/taste_profile_cleaning.py	Thu Oct 20 00:43:25 2016 -0500
+++ b/Code/taste_profile_cleaning.py	Sun Nov 20 03:15:25 2016 -0500
@@ -1,112 +1,130 @@
-
-
-
-
-
-
-
-
-import time
-import pandas as pd
-#import cPickle as pickle
-
-# Read songIDs from Million Song Dataset songID-trackID mismatches
-start_time = time.time()
-print 'Reading songID mismatches...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-sid_mismatches.txt'
-with open(filename, 'rb') as f:
-    mismatches = set()    
-    next = f.readline()
-    while next != "":
-        songID = next[8:26]
-        mismatches.add(songID)
-        #print(next[8:26])
-        next = f.readline()
-
-# Delete rows with songIDs mismatches from Taste Profile Subset
-print 'Reading Taste Profile subset...'
-result = pd.DataFrame()
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-/train_triplets.txt'
-filename_out = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-/train_triplets_wo_mismatches.csv'
-for chunk in pd.read_csv(
-        filename,
-        low_memory=False,
-        delim_whitespace=True, 
-        chunksize=20000,
-        names=['user', 'song', 'plays'],
-        header=None):
-    chunk = chunk[~chunk.song.isin(mismatches)]
-    chunk.to_csv(filename_out, mode='a', header=False, index=False)
-    #result = result.append(chunk, ignore_index=True)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
-#result.to_pickle('/homes/pchilguano/dataset/train_triplets_wo_mismatch.pkl')
-
-# Select most active users
-start_time = time.time()
-played_songs = 1000
-print 'Reading (filtered) Taste Profile subset...'
-df = pd.read_csv(
-    filename_out,
-    delim_whitespace=False,
-    header=None,
-    names=['user','song','plays'])
-print 'Selecting most active users (> %d ratings)...' % played_songs
-df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
-
-print 'Saving user-item matrix as dataframe...'
-df_active.to_pickle('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl')
-
-'''f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl', 'wb')
-pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
-f.close()'''
-
-# Select most frequent songs
-frequent_songs = 1500
-print 'Selecting %d frequent songs...' % frequent_songs
-counts = df_active['song'].value_counts().head(frequent_songs)
-#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
-print 'Saving Echonest songID list...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset_songID.txt'
-with open(filename, 'wb') as f:
-    for item in counts.index.tolist():
-        f.write("%s\n" % item)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
-
-'''
-#important
-#df['user'].value_counts().head(50)
-
-ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',columns=['song'], header=False, index=False)
-
-
-
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
-
-# Save Taste Profile dataset without SongID mismatches
-mdf = df[df.song.isin(l3)]
-mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', header=False, index=False)
-
-result = pd.DataFrame()
-for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory = False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None):
-    chunk = chunk[chunk.song.isin(l3)]    
-    result = result.append(chunk.pivot(index='user', columns='song', values='plays')    
-    , ignore_index=True)
-    print (result.shape)
-'''
\ No newline at end of file
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+
+
+@author: paulochiliguano
+"""
+
+import sys
+import os
+import zipfile
+import time
+import pandas as pd
+
+# Unzip Taste Profile subset
+def unzip_tasteprofile(zippedfile):
+    print("Unzipping Taste Profile subset...")
+    uncompressedFilename = os.path.splitext(zippedfile)[0]
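+    # Assumes the archive holds a single member named after the zip file
+    # minus its ".zip" extension (e.g. train_triplets.txt).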
+    with zipfile.ZipFile(zippedfile) as myzip:
+        myzip.extract(uncompressedFilename)
+    return uncompressedFilename
+
+# Read songIDs from Million Song Dataset songID-trackID mismatches
+def read_songid_mismatches(filename):
+    print("Reading songID mismatches...")
+    with open(filename, 'r') as f:
+        songIdMismatches = set()
+        for line in f:
+            songIdMismatches.add(line[8:26])
+    return songIdMismatches
+
+def read_available_songid(filename):
+    print("Reading available songIDs...")
+    with open(filename, 'r') as f:
+        songIdAvailable = set()
+        for line in f:
+            songIdAvailable.add(line[0:18])
+    return songIdAvailable
+
+def delete_triplets(zippedfile='train_triplets.txt.zip',
+                    mismatchesfile='sid_mismatches.txt'):
+    """
+    Delete triplets with songIDs mismatches and unavailable audio clips from
+    7Digital (UK)
+    
+    This is applied on Taste Profile subset.
+    
+    :type zippedfile: string
+    :param zippedfile: filename of the downloaded subset
+    
+    :type mismatchesfile: string
+    :param mismatchesfile: filename of the downloaded list of mismatches
+    
+    """
+    tripletsfile = unzip_tasteprofile(zippedfile)
+    mismatches = read_songid_mismatches(mismatchesfile)
+    print("There are %d songId-trackId mismatches." % len(mismatches))
+    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
+    print("There are %d audio clips available." % len(availableClips))
+    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
+    print("Deleting triplets with mismatches and unavailable songs...")
+    for chunk in pd.read_table(
+            tripletsfile,
+            header=None,
+            names=['userId', 'songId', 'numPlays'],
+            chunksize=100*len(mismatches),
+            ):
+        chunk = chunk[~chunk.songId.isin(mismatches)]
+        chunk = chunk[chunk.songId.isin(availableClips)]
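+        # Append each filtered chunk to a zlib-compressed HDF5 table;
+        # format='table' is required for append=True, and fletcher32
+        # adds checksums to the stored data.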
+        chunk.to_hdf(
+                cleanfile,
+                'triplets',
+                mode='a',
+                format='table',
+                append=True,
+                complevel=9,
+                complib='zlib',
+                fletcher32=True
+                )
+    # Delete the large text file!
+    os.remove(tripletsfile)
+    print("Triplets without mismatches saved in %s" % cleanfile)
+
+if __name__ == '__main__':
+    #if len(sys.argv) < 2:
+        #print("Not enough arguments %s" % sys.argv[0])
+        #sys.exit()
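+    # Work from the sibling "dataset" directory so the default filenames
+    # in delete_triplets() resolve correctly.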
+    dataset_path = os.path.join(os.path.split(os.getcwd())[0], 'dataset')
+    os.chdir(dataset_path)
+    start_time = time.time()
+    delete_triplets()
+    elapsed_time = time.time() - start_time
+    print("Execution time: %.2f minutes" % (elapsed_time/60))
+
+#a = pd.read_hdf('train_triplets.h5', 'triplets')
+
+#played_songs = 1000
+#df = pd.read_csv(
+    #filename_out,
+    #delim_whitespace=False,
+    #header=None,
+    #names=['user','song','plays'])
+#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
+#df_active.to_pickle('../dataset/CF_dataset.pkl')
+
+#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+#CF_dataset.pkl', 'wb')
+#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
+#f.close()
+
+# Select most frequent songs
+#frequent_songs = 1500
+#print("Selecting %d frequent songs..." % frequent_songs)
+#counts = df_active['song'].value_counts().head(frequent_songs)
+#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
+#print("Saving Echonest songID list...")
+#filename = '../dataset/CF_dataset_songID.txt'
+#with open(filename, 'wb') as f:
+    #for item in counts.index.tolist():
+       #f.write("%s\n" % item)
+
+#important
+#df['user'].value_counts().head(50)
+
+#ddf = df.drop_duplicates(subset = 'song')
+#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
+           #columns=['song'],
+           #header=False,
+           #index=False)
\ No newline at end of file