#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Clean the Taste Profile subset of the Million Song Dataset.

Removes triplets whose songIDs appear in the official songID-trackID
mismatch list, or whose audio clips are unavailable from 7digital (UK),
and saves the remaining triplets to a compressed HDF5 table.

@author: paulochiliguano
"""

import os
import time
import zipfile

import pandas as pd


# Unzip Taste Profile subset
def unzip_tasteprofile(zippedfile):
    print("Unzipping Taste Profile subset...")
    # Assumes the archive contains a member named like the archive minus
    # its .zip extension (true for train_triplets.txt.zip)
    uncompressedFilename = os.path.splitext(zippedfile)[0]
    with zipfile.ZipFile(zippedfile) as myzip:
        myzip.extract(uncompressedFilename)
    return uncompressedFilename


# Read songIDs from Million Song Dataset songID-trackID mismatches
def read_songid_mismatches(filename):
    print("Reading songID mismatches...")
    songIdMismatches = set()
    with open(filename, 'r') as f:
        for line in f:
            songIdMismatches.add(line[8:26])
    return songIdMismatches
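
# Note (added; assumes the published sid_mismatches.txt layout): each line
# begins with "ERROR: <" (8 characters) followed by the 18-character Echo
# Nest songID, which is what the line[8:26] slice above extracts.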


# Read songIDs of audio clips available from 7digital
def read_available_songid(filename):
    print("Reading available songIDs...")
    songIdAvailable = set()
    with open(filename, 'r') as f:
        for line in f:
            songIdAvailable.add(line[0:18])
    return songIdAvailable
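
# Note (added): the line[0:18] slice above assumes each line of the
# 7digital list begins with an 18-character songID.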


def delete_triplets(zippedfile='train_triplets.txt.zip',
                    mismatchesfile='sid_mismatches.txt'):
    """
    Delete triplets whose songIDs appear in the songID-trackID mismatch
    list or whose audio clips are unavailable from 7digital (UK).

    This is applied to the Taste Profile subset.

    :type zippedfile: string
    :param zippedfile: filename of the downloaded subset

    :type mismatchesfile: string
    :param mismatchesfile: filename of the downloaded list of mismatches
    """
    tripletsfile = unzip_tasteprofile(zippedfile)
    mismatches = read_songid_mismatches(mismatchesfile)
    print("There are %d songId-trackId mismatches." % len(mismatches))
    availableClips = read_available_songid('7digital/CF_dataset_7digital.txt')
    print("There are %d audio clips available." % len(availableClips))
    cleanfile = os.path.splitext(tripletsfile)[0] + '.h5'
    if os.path.exists(cleanfile):
        # Remove stale output so repeated runs do not append duplicate rows
        os.remove(cleanfile)
    print("Deleting triplets with mismatches and unavailable songs...")
    for chunk in pd.read_table(
        tripletsfile,
        header=None,
        names=['userId', 'songId', 'numPlays'],
        chunksize=100 * len(mismatches),
    ):
        # Drop triplets whose songID is a known songID-trackID mismatch
        chunk = chunk[~chunk.songId.isin(mismatches)]
        # Keep only triplets whose audio clip is available from 7digital
        chunk = chunk[chunk.songId.isin(availableClips)]
        chunk.to_hdf(
            cleanfile,
            'triplets',
            mode='a',
            format='table',
            append=True,
            complevel=9,
            complib='zlib',
            fletcher32=True,
        )
    # Delete the large text file
    os.remove(tripletsfile)
    print("Triplets without mismatches saved in %s" % cleanfile)
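

# A minimal usage sketch (added; not part of the original script) for reading
# the cleaned triplets back. The filename assumes the default
# 'train_triplets.txt.zip' input, and the key 'triplets' matches the one
# written by delete_triplets() above.
def load_clean_triplets(cleanfile='train_triplets.h5'):
    return pd.read_hdf(cleanfile, 'triplets')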


if __name__ == '__main__':
    dataset_path = os.path.join(os.path.split(os.getcwd())[0], 'dataset')
    os.chdir(dataset_path)
    start_time = time.time()
    delete_triplets()
    elapsed_time = time.time() - start_time
    print("Execution time: %.2f minutes" % (elapsed_time / 60))

# Exploratory scratch from development, kept commented out:

#a = pd.read_hdf('../train_triplets_clean.h5', 'triplets')

#played_songs = 1000
#df = pd.read_csv(
#    filename_out,
#    delim_whitespace=False,
#    header=None,
#    names=['user', 'song', 'plays'])
#df_active = df.groupby('user').filter(lambda x: len(x) > played_songs)
#df_active.to_pickle('../dataset/CF_dataset.pkl')

#f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
#CF_dataset.pkl', 'wb')
#pickle.dump(df_active, f, protocol=pickle.HIGHEST_PROTOCOL)
#f.close()

# Select most frequent songs
#frequent_songs = 1500
#print("Selecting %d frequent songs..." % frequent_songs)
#counts = df_active['song'].value_counts().head(frequent_songs)
#df_active = df_active.loc[df_active['song'].isin(counts.index), :]
#print("Saving Echonest songID list...")
#filename = '../dataset/CF_dataset_songID.txt'
#with open(filename, 'wb') as f:
#    for item in counts.index.tolist():
#        f.write("%s\n" % item)

#important
#df['user'].value_counts().head(50)

#ddf = df.drop_duplicates(subset='song')
#ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
#           columns=['song'],
#           header=False,
#           index=False)
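

# A self-contained sketch (added; not part of the original script) of the
# post-filtering step the scratch above gestures at: keep "active" users,
# then the most frequently played songs. Column names follow the cleaned
# HDF5 output above; the thresholds mirror the scratch values and, like the
# default filename, are assumptions rather than the author's final choices.
def select_frequent_songs(cleanfile='train_triplets.h5',
                          played_songs=1000,
                          frequent_songs=1500):
    df = pd.read_hdf(cleanfile, 'triplets')
    # Keep users with more than played_songs listening triplets
    df_active = df.groupby('userId').filter(lambda x: len(x) > played_songs)
    # Keep the frequent_songs most frequently played songs
    counts = df_active['songId'].value_counts().head(frequent_songs)
    return df_active[df_active['songId'].isin(counts.index)]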