Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
comparison Code/taste_profile_cleaning.py @ 39:7e5bda3feaa3
Taste Profile cleaning in Python 3.5
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Sun, 20 Nov 2016 03:15:25 -0500 |
parents | fafc0b249a73 |
children | 1de207a22c1a |
comparison
equal
deleted
inserted
replaced
38:f6d568782b49 | 39:7e5bda3feaa3 |
---|---|
1 #!/usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 """ | |
1 | 4 |
2 | 5 |
6 @author: paulochiliguano | |
7 """ | |
3 | 8 |
4 | 9 import sys |
5 | 10 import os |
6 | 11 import zipfile |
7 | |
8 | |
9 import time | 12 import time |
10 import pandas as pd | 13 import pandas as pd |
11 #import cPickle as pickle | 14 |
15 # Unzip Taste Profile subset | |
16 def unzip_tasteprofile(zippedfile): | |
17 print("Unzipping Taste Profile subset...") | |
18 uncompressedFilename = os.path.splitext(zippedfile)[0] | |
19 with zipfile.ZipFile(zippedfile) as myzip: | |
20 myzip.extract(uncompressedFilename) | |
21 return uncompressedFilename | |
12 | 22 |
13 # Read songIDs from Million Song Dataset songID-trackID mismatches | 23 # Read songIDs from Million Song Dataset songID-trackID mismatches |
14 start_time = time.time() | 24 def read_songid_mismatches(filename): |
15 print 'Reading songID mismatches...' | 25 print("Reading songID mismatches...") |
16 filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\ | 26 with open(filename, 'r+') as f: |
17 sid_mismatches.txt' | 27 songIdMismatches = set() |
18 with open(filename, 'rb') as f: | 28 for line in f: |
19 mismatches = set() | 29 songIdMismatches.add(line[8:26]) |
20 next = f.readline() | 30 return songIdMismatches |
21 while next != "": | |
22 songID = next[8:26] | |
23 mismatches.add(songID) | |
24 #print(next[8:26]) | |
25 next = f.readline() | |
26 | 31 |
27 # Delete rows with songIDs mismatches from Taste Profile Subset | 32 def read_available_songid(filename): |
28 print 'Reading Taste Profile subset...' | 33 print("Reading available songIDs...") |
29 result = pd.DataFrame() | 34 with open(filename, 'r+') as f: |
30 filename = '/Users/paulochiliguano/Documents/msc-project/dataset/taste_profile\ | 35 songIdAvailable = set() |
31 /train_triplets.txt' | 36 for line in f: |
32 filename_out = '/Users/paulochiliguano/Documents/msc-project/dataset/\ | 37 songIdAvailable.add(line[0:18]) |
33 /train_triplets_wo_mismatches.csv' | 38 return songIdAvailable |
34 for chunk in pd.read_csv( | |
35 filename, | |
36 low_memory=False, | |
37 delim_whitespace=True, | |
38 chunksize=20000, | |
39 names=['user', 'song', 'plays'], | |
40 header=None): | |
41 chunk = chunk[~chunk.song.isin(mismatches)] | |
42 chunk.to_csv(filename_out, mode='a', header=False, index=False) | |
43 #result = result.append(chunk, ignore_index=True) | |
44 elapsed_time = time.time() - start_time | |
45 print 'Execution time: %.3f seconds' % elapsed_time | |
46 #result.to_pickle('/homes/pchilguano/dataset/train_triplets_wo_mismatch.pkl') | |
47 | 39 |
48 # Select most active users | 40 def delete_triplets(zippedfile='train_triplets.txt.zip', |
49 start_time = time.time() | 41 mismatchesfile='sid_mismatches.txt'): |
50 played_songs = 1000 | 42 """ |
51 print 'Reading (filtered) Taste Profile subset...' | 43 Delete triplets with songIDs mismatches and unavailable audio clips from |
52 df = pd.read_csv( | 44 7Digital (UK) |
53 filename_out, | 45 |
54 delim_whitespace=False, | 46 This is applied on Taste Profile subset. |
55 header=None, | 47 |
56 names=['user','song','plays']) | 48 :type zippedfile: string |
57 print 'Selecting most active users (> %d ratings)...' % played_songs | 49 :param zippedfile: filename of the downloaded subset |
58 df_active = df.groupby('user').filter(lambda x: len(x) > played_songs) | 50 |
51 :type mismatchesfile: string | |
52 :param mismatchesfile: filename of the downloaded list of mismatches | |
53 | |
54 """ | |
55 tripletsfile = unzip_tasteprofile(zippedfile) | |
56 mismatches = read_songid_mismatches(mismatchesfile) | |
57 print("There are %d songId-trackId mismatches." % len(mismatches)) | |
58 availableClips = read_available_songid('7digital/CF_dataset_7digital.txt') | |
59 print("There are %d audio clips available." % len(availableClips)) | |
60 cleanfile = os.path.splitext(tripletsfile)[0] + '.h5' | |
61 print("Deleting triplets with mismatches and unavailable songs...") | |
62 for chunk in pd.read_table( | |
63 tripletsfile, | |
64 header=None, | |
65 names=['userId', 'songId', 'numPlays'], | |
66 chunksize=100*len(mismatches), | |
67 ): | |
68 chunk = chunk[~chunk.songId.isin(mismatches)] | |
69 chunk = chunk[chunk.songId.isin(availableClips)] | |
70 #chunk.to_csv(filename_out, mode='a', header=False, index=False) | |
71 chunk.to_hdf( | |
72 cleanfile, | |
73 'triplets', | |
74 mode='a', | |
75 format='table', | |
76 append=True, | |
77 complevel=9, | |
78 complib='zlib', | |
79 fletcher32=True | |
80 ) | |
81 # Delete the large text file! | |
82 os.remove(tripletsfile) | |
83 print("Triplets without mismatches saved in %s" % cleanfile) | |
59 | 84 |
60 print 'Saving user-item matrix as dataframe...' | 85 if __name__ == '__main__': |
61 df_active.to_pickle('/Users/paulochiliguano/Documents/msc-project/dataset/\ | 86 #if len(sys.argv) < 1: |
62 CF_dataset.pkl') | 87 #print("Not enough arguments %s" % sys.argv[0]) |
88 #sys.exit() | |
89 dataset_path = os.path.join(os.path.split(os.getcwd())[0],'dataset') | |
90 os.chdir(dataset_path) | |
91 start_time = time.time() | |
92 delete_triplets() | |
93 elapsed_time = time.time() - start_time | |
94 print("Execution time: %.2f minutes" % (elapsed_time/60)) | |
63 | 95 |
64 '''f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ | 96 #a=pd.read_hdf('../train_triplets_clean.h5', 'triplets') |
65 CF_dataset.pkl', 'wb') | 97 |
66 pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL) | 98 #played_songs = 1000 |
67 f.close()''' | 99 #df = pd.read_csv( |
100 #filename_out, | |
101 #delim_whitespace=False, | |
102 #header=None, | |
103 #names=['user','song','plays']) | |
104 #df_active = df.groupby('user').filter(lambda x: len(x) > played_songs) | |
105 #df_active.to_pickle('../dataset/CF_dataset.pkl') | |
106 | |
107 #f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ | |
108 #CF_dataset.pkl', 'wb') | |
109 #pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL) | |
110 #f.close() | |
68 | 111 |
69 # Select most frequent songs | 112 # Select most frequent songs |
70 frequent_songs = 1500 | 113 #frequent_songs = 1500 |
71 print 'Selecting %d frequent songs...' % frequent_songs | 114 #print("Selecting %d frequent songs..." % frequent_songs) |
72 counts = df_active['song'].value_counts().head(frequent_songs) | 115 #counts = df_active['song'].value_counts().head(frequent_songs) |
73 #df_active = df_active.loc[df_active['song'].isin(counts.index), :] | 116 #df_active = df_active.loc[df_active['song'].isin(counts.index), :] |
74 print 'Saving Echonest songID list...' | 117 #print("Saving Echonest songID list...") |
75 filename = '/Users/paulochiliguano/Documents/msc-project/dataset/\ | 118 #filename = '../dataset/CF_dataset_songID.txt' |
76 CF_dataset_songID.txt' | 119 #with open(filename, 'wb') as f: |
77 with open(filename, 'wb') as f: | 120 #for item in counts.index.tolist(): |
78 for item in counts.index.tolist(): | 121 #f.write("%s\n" % item) |
79 f.write("%s\n" % item) | |
80 elapsed_time = time.time() - start_time | |
81 print 'Execution time: %.3f seconds' % elapsed_time | |
82 | 122 |
83 ''' | |
84 #important | 123 #important |
85 #df['user'].value_counts().head(50) | 124 #df['user'].value_counts().head(50) |
86 | 125 |
87 ddf = df.drop_duplicates(subset = 'song') | 126 #ddf = df.drop_duplicates(subset = 'song') |
88 ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',columns=['song'], header=False, index=False) | 127 #ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv', |
89 | 128 #columns=['song'], |
90 | 129 #header=False, |
91 | 130 #index=False) |
92 with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile: | |
93 l1 = list(csv.reader(input1)) | |
94 chain1 = list(itertools.chain(*l1)) | |
95 l2 = list(csv.reader(input2)) | |
96 chain2 = list(itertools.chain(*l2)) | |
97 l3 = set(chain2) - set(chain1) | |
98 wr = csv.writer(myfile, delimiter=',') | |
99 for item in l3: | |
100 wr.writerow([item]) | |
101 | |
102 # Save Taste Profile dataset without SongID mismatches | |
103 mdf = df[df.song.isin(l3)] | |
104 mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', header=False, index=False) | |
105 | |
106 result = pd.DataFrame() | |
107 for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory = False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None): | |
108 chunk = chunk[chunk.song.isin(l3)] | |
109 result = result.append(chunk.pivot(index='user', columns='song', values='plays') | |
110 , ignore_index=True) | |
111 print (result.shape) | |
112 ''' |