comparison Code/taste_profile_cleaning.py @ 39:7e5bda3feaa3

Taste Profile cleaning in Python 3.5
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 20 Nov 2016 03:15:25 -0500
parents fafc0b249a73
children 1de207a22c1a
diff -r f6d568782b49 -r 7e5bda3feaa3 Code/taste_profile_cleaning.py
--- a/Code/taste_profile_cleaning.py
+++ b/Code/taste_profile_cleaning.py
@@ -1,112 +1,130 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
 
 
+@author: paulochiliguano
+"""
 
 import sys
 import os
 import zipfile
-
-
 import time
 import pandas as pd
-#import cPickle as pickle
 
+# Unzip Taste Profile subset
+def unzip_tasteprofile(zippedfile):
+    print("Unzipping Taste Profile subset...")
+    uncompressedFilename = os.path.splitext(zippedfile)[0]
+    with zipfile.ZipFile(zippedfile) as myzip:
+        myzip.extract(uncompressedFilename)
+    return uncompressedFilename
+
 # Read songIDs from Million Song Dataset songID-trackID mismatches
-start_time = time.time()
-print 'Reading songID mismatches...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-sid_mismatches.txt'
-with open(filename, 'rb') as f:
-    mismatches = set()
-    next = f.readline()
-    while next != "":
-        songID = next[8:26]
-        mismatches.add(songID)
-        #print(next[8:26])
-        next = f.readline()
+def read_songid_mismatches(filename):
+    print("Reading songID mismatches...")
+    with open(filename, 'r+') as f:
+        songIdMismatches = set()
+        for line in f:
+            songIdMismatches.add(line[8:26])
+    return songIdMismatches
 
-# Delete rows with songIDs mismatches from Taste Profile Subset
-print 'Reading Taste Profile subset...'
-result = pd.DataFrame()
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\
-/train_triplets.txt'
-filename_out = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-/train_triplets_wo_mismatches.csv'
-for chunk in pd.read_csv(
-        filename,
-        low_memory=False,
-        delim_whitespace=True,
-        chunksize=20000,
-        names=['user', 'song', 'plays'],
-        header=None):
-    chunk = chunk[~chunk.song.isin(mismatches)]
-    chunk.to_csv(filename_out, mode='a', header=False, index=False)
-    #result = result.append(chunk, ignore_index=True)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
-#result.to_pickle('/homes/pchilguano/dataset/train_triplets_wo_mismatch.pkl')
+def read_available_songid(filename):
+    print("Reading available songIDs...")
+    with open(filename, 'r+') as f:
+        songIdAvailable = set()
+        for line in f:
+            songIdAvailable.add(line[0:18])
+    return songIdAvailable
 
-# Select most active users
-start_time = time.time()
-played_songs = 1000
-print 'Reading (filtered) Taste Profile subset...'
-df = pd.read_csv(
-    filename_out,
-    delim_whitespace=False,
-    header=None,
-    names=['user','song','plays'])
-print 'Selecting most active users (> %d ratings)...' % played_songs
-df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
+def delete_triplets(zippedfile='train_triplets.txt.zip',
+                    mismatchesfile='sid_mismatches.txt'):
+    """
+    Delete triplets with songIDs mismatches and unavailable audio clips from
+    7Digital (UK)
+
+    This is applied on Taste Profile subset.
+
+    :type zippedfile: string
+    :param zippedfile: filename of the downloaded subset
+
+    :type mismatchesfile: string
+    :param mismatchesfile: filename of the downloaded list of mismatches
+
+    """
+    tripletsfile = unzip_tasteprofile(zippedfile)
+    mismatches = read_songid_mismatches(mismatchesfile)
+    print("There are %d songId-trackId mismatches." % len(mismatches))
+    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
+    print("There are %d audio clips available." % len(availableClips))
+    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
+    print("Deleting triplets with mismatches and unavailable songs...")
+    for chunk in pd.read_table(
+            tripletsfile,
+            header=None,
+            names=['userId', 'songId', 'numPlays'],
+            chunksize=100*len(mismatches),
+            ):
+        chunk = chunk[~chunk.songId.isin(mismatches)]
+        chunk = chunk[chunk.songId.isin(availableClips)]
+        #chunk.to_csv(filename_out, mode='a', header=False, index=False)
+        chunk.to_hdf(
+            cleanfile,
+            'triplets',
+            mode='a',
+            format='table',
+            append=True,
+            complevel=9,
+            complib='zlib',
+            fletcher32=True
+        )
+    # Delete the large text file!
+    os.remove(tripletsfile)
+    print("Triplets without mismatches saved in %s" % cleanfile)
 
-print 'Saving user-item matrix as dataframe...'
-df_active.to_pickle('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl')
+if __name__ == '__main__':
+    #if len(sys.argv) < 1:
+        #print("Not enough arguments %s" % sys.argv[0])
+        #sys.exit()
+    dataset_path = os.path.join(os.path.split(os.getcwd())[0],'dataset')
+    os.chdir(dataset_path)
+    start_time = time.time()
+    delete_triplets()
+    elapsed_time = time.time() - start_time
+    print("Execution time: %.2f minutes" % (elapsed_time/60))
 
-'''f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset.pkl', 'wb')
-pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
-f.close()'''
+#a=pd.read_hdf('../train_triplets_clean.h5', 'triplets')
+
+#played_songs = 1000
+#df = pd.read_csv(
+    #filename_out,
+    #delim_whitespace=False,
+    #header=None,
+    #names=['user','song','plays'])
+#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
+#df_active.to_pickle('../dataset/CF_dataset.pkl')
+
+#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+#CF_dataset.pkl', 'wb')
+#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
+#f.close()
 
 # Select most frequent songs
-frequent_songs = 1500
-print 'Selecting %d frequent songs...' % frequent_songs
-counts = df_active['song'].value_counts().head(frequent_songs)
-#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
-print 'Saving Echonest songID list...'
-filename = '/Users/paulochiliguano/Documents/msc-project/dataset/\
-CF_dataset_songID.txt'
-with open(filename, 'wb') as f:
-    for item in counts.index.tolist():
-        f.write("%s\n" % item)
-elapsed_time = time.time() - start_time
-print 'Execution time: %.3f seconds' % elapsed_time
+#frequent_songs = 1500
+#print("Selecting %d frequent songs..." % frequent_songs)
+#counts = df_active['song'].value_counts().head(frequent_songs)
+#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
+#print("Saving Echonest songID list...")
+#filename = '../dataset/CF_dataset_songID.txt'
+#with open(filename, 'wb') as f:
+    #for item in counts.index.tolist():
+        #f.write("%s\n" % item)
 
-'''
 #important
 #df['user'].value_counts().head(50)
 
-ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',columns=['song'], header=False, index=False)
-
-
-
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
-
-# Save Taste Profile dataset without SongID mismatches
-mdf = df[df.song.isin(l3)]
-mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', header=False, index=False)
-
-result = pd.DataFrame()
-for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory = False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None):
-    chunk = chunk[chunk.song.isin(l3)]
-    result = result.append(chunk.pivot(index='user', columns='song', values='plays')
-    , ignore_index=True)
-print (result.shape)
-'''
+#ddf = df.drop_duplicates(subset = 'song')
+#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
+            #columns=['song'],
            #header=False,
            #index=False)
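
Note on the fixed-width slices in the new revision: read_songid_mismatches() takes line[8:26] from each line of sid_mismatches.txt, which matches the published Million Song Dataset mismatch format, where an 18-character Echo Nest song ID follows the 8-character "ERROR: <" prefix; read_available_songid() likewise assumes each line of the 7digital list begins with a song ID. A minimal sketch with a made-up line (the exact file contents are an assumption here):

    # One line in the expected sid_mismatches.txt layout:
    # indices 0-7 hold "ERROR: <", indices 8-25 hold the 18-char song ID.
    line = "ERROR: <SOAAAAA12AB0181234 TRAAAAA128F9301234> artist - title"
    song_id = line[8:26]  # -> "SOAAAAA12AB0181234"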
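The cleaned triplets are appended to an HDF5 table, so they can be loaded back with pandas for the user-filtering step that is still commented out at the bottom of the file. A minimal sketch, assuming delete_triplets() ran with its defaults (it derives the store name train_triplets.h5 from the unzipped filename) and reusing the played_songs = 1000 threshold from the commented-out code:

    import pandas as pd

    # Load the cleaned (userId, songId, numPlays) triplets written by delete_triplets().
    triplets = pd.read_hdf('train_triplets.h5', 'triplets')

    # Keep only the most active users (> 1000 triplets each), mirroring the
    # commented-out groupby/filter step in the script.
    df_active = triplets.groupby('userId').filter(lambda x: len(x) > 1000)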