hybrid-music-recommender-using-content-based-and-social-information
changeset 11:38f44dd7e54b
Creating a subset from the Taste Profile dataset.
Retrieving MP3 previews from 7digital successfully.
Some errors occurred due to Unicode encoding of artist and song metadata.
| author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
|---|---|
| date | Fri, 17 Jul 2015 21:52:18 +0100 |
| parents | cc503565339e |
| children | 20589ba1908a |
| files | Code/preview_clip.py Code/read_taste_profile.py |
| diffstat | 2 files changed, 53 insertions(+), 75 deletions(-) |
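
The Unicode errors mentioned in the commit message are Python 2 `UnicodeEncodeError` exceptions: the `csv` module in Python 2 only writes byte strings, so unicode artist and title metadata must be encoded explicitly, as the diff to `preview_clip.py` below now does. A minimal sketch of that pattern, with made-up metadata values:

```python
# Python 2: csv.writer calls str() on unicode values, which raises
# UnicodeEncodeError for non-ASCII characters unless we encode first.
import csv

with open('7digital_url.txt', 'wb') as output:  # output file from this changeset
    writer = csv.writer(output, delimiter=',')
    artist_name = u'Beyonc\xe9'                 # hypothetical example value
    title = u'Halo'                             # hypothetical example value
    # writer.writerow([artist_name, title])     # would raise UnicodeEncodeError
    writer.writerow([artist_name.encode('utf-8'), title.encode('utf-8')])
```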
```diff
--- a/Code/preview_clip.py	Wed Jul 15 19:25:16 2015 +0100
+++ b/Code/preview_clip.py	Fri Jul 17 21:52:18 2015 +0100
@@ -8,12 +8,18 @@
 import csv
 import time
 from pyechonest import song, config
+import oauth2 as oauth
+import urllib2
+
+consumer_key = '7ds28qendsk9'
+consumer_secret = 'm5nsktn3hu6x45cy'
+consumer = oauth.Consumer(consumer_key, consumer_secret)
 
 config.ECHO_NEST_API_KEY="LINDFDUTQZQ781IE8"
-with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as input, open('/homes/pchilguano/dataset/test_echonest_url.txt', 'wb') as output:
+with open('/homes/pchilguano/dataset/test_echonest_songID.txt', 'rb') as input, open('/homes/pchilguano/dataset/7digital_url.txt', 'wb') as output:
     writer = csv.writer(output, delimiter=',')
-    #for i in xrange(1412):
-    #    input.readline()
+    for i in xrange(3867):
+        input.readline()
     next = input.readline()
     while next != "":
         try:
@@ -23,13 +29,36 @@
             next = input.readline()
         else:
             time.sleep(3)
 
-            ss_tracks = s.get_tracks('7digital-UK')
-            time.sleep(3)
-            #print(len(ss_tracks))
-            if len(ss_tracks) != 0:
-                ss_track = ss_tracks[0]
-                preview_url = ss_track.get('preview_url')
-                print([next[:-2], preview_url])
-                writer.writerow([next[:-2], preview_url])
-            next = input.readline()
+            try:
+                ss_tracks = s.get_tracks('7digital-UK')
+            except:
+                time.sleep(3)
+                next = input.readline()
+            else:
+                #print(len(ss_tracks))
+                if len(ss_tracks) != 0:
+                    ss_track = ss_tracks[0]
+                    preview_url = ss_track.get('preview_url')
+
+                    req = oauth.Request(method="GET", url=preview_url, is_form_encoded=True)
+                    req['oauth_timestamp'] = oauth.Request.make_timestamp()
+                    req['oauth_nonce'] = oauth.Request.make_nonce()
+                    req['country'] = "GB"
+                    sig_method = oauth.SignatureMethod_HMAC_SHA1()
+                    req.sign_request(sig_method, consumer, token=None)
+
+                    try:
+                        response = urllib2.urlopen(req.to_url())
+                    except:
+                        time.sleep(22)
+                        print([next[:-2], 'NA'])
+                        writer.writerow([next[:-2], 'NA', s.artist_name, s.title])
+                    else:
+                        time.sleep(22)
+                        print([next[:-2], preview_url, s.artist_name, s.title])
+                        writer.writerow([next[:-2], preview_url, s.artist_name.encode("utf-8"), s.title.encode("utf-8")])
+                        with open(next[:-2]+'.mp3', 'wb') as songfile:
+                            songfile.write(response.read())
+
+                next = input.readline()
\ No newline at end of file
```
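
The core of this change is signing each 7digital preview URL with OAuth 1.0 before downloading it. For reference, the signing flow can be isolated into a small helper; this is a sketch using the same `python-oauth2` calls as the diff above, with placeholder credentials and an illustrative URL rather than the project's real ones:

```python
import oauth2 as oauth
import urllib2

# Placeholder credentials, not the project's real 7digital key/secret.
consumer = oauth.Consumer('YOUR_7DIGITAL_KEY', 'YOUR_7DIGITAL_SECRET')

def fetch_preview(preview_url, country='GB'):
    # Build a GET request and attach the OAuth 1.0 protocol parameters.
    req = oauth.Request(method='GET', url=preview_url, is_form_encoded=True)
    req['oauth_timestamp'] = oauth.Request.make_timestamp()
    req['oauth_nonce'] = oauth.Request.make_nonce()
    req['country'] = country  # 7digital locale parameter, 'GB' as in the diff
    # Sign with HMAC-SHA1; no user token is needed for preview clips.
    req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, token=None)
    # to_url() serialises the signed parameters into the query string.
    return urllib2.urlopen(req.to_url()).read()  # raw MP3 bytes

# Hypothetical usage:
# with open('SOXXXXXX.mp3', 'wb') as f:
#     f.write(fetch_preview('http://previews.7digital.com/clip/12345'))
```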
```diff
--- a/Code/read_taste_profile.py	Wed Jul 15 19:25:16 2015 +0100
+++ b/Code/read_taste_profile.py	Fri Jul 17 21:52:18 2015 +0100
@@ -1,23 +1,9 @@
-import os
 import csv
 import pandas as pd
-import numpy as np
 import itertools
-import time
-
-# List of h5 files (audio streams)
-#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
-#    writer = csv.writer(out, delimiter=',')
-#    for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
-#        for file in files:
-#            if file.endswith(".h5"):
-#                #print(os.path.join(root, file))
-#                track = ''.join(['SO',str(file)[2:-3]])
-#                print(track)
-#                writer.writerow([track])
 
 # Read songIDs from Million Song Dataset songID-trackID mismatches
-with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'wb') as out:
+with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
     writer = csv.writer(out, delimiter=',')
     next = f.readline()
     while next != "":
@@ -29,52 +15,15 @@
 location = r'~/dataset/train_triplets.txt'
 df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'])
 ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('train_triplets_song.csv',columns=['song'], header=False, index=False)
+ddf.to_csv('train_triplets_songID.csv',columns=['song'], header=False, index=False)
 
-with open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_song.csv', 'rb') as input2, open('/homes/pchilguano/dataset/test_echonest1.txt', 'wb') as output:
-    reader1 = csv.reader(input1)
-    reader2 = csv.reader(input2)
-    l1 = list(reader1)
-    l1.sort()
-    l2 = list(reader2)
-    l2.sort()
-    l3 = [x for x in l1 if x not in l2]
-
-
-with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as f:
-    reader = csv.reader(f)
-    your_list = list(reader)
-    your_list.sort()
-    chain = list(itertools.chain(*your_list))
-
-
-#store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
-
-chunksize = 20000
-count = 0
-frame = pd.DataFrame()
-for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
-    chunk = chunk.sort(columns='song')
-    chunk = chunk[chunk['song'].isin(chain)]
-    #frame = chunk.query('song == your_list')
-    frame = frame.append(chunk.pivot(index='user', columns='song', values='plays'), ignore_index=True)
-    count = count + 1
-    print([count, frame.shape])
-    #for item in your_list:
-    #    chunk = chunk[chunk['song'].isin(item)]
-    #store.append('df', chunk[chunk['song'].isin(item)])
-#store.close()
-
-
-
-
-df = store['df']
-chunks = (df.groupby().sum() for df in store.select('df', chunksize=500))
-res = next(chunks) # will raise if there are no chunks!
-for c in chunks:
-    res = res.add(c, fill_value=0)
-
-
-sdf = df.to_sparse()
-
-
+# Delete songIDs mismatches from Taste Profile Dataset
+with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
+    l1 = list(csv.reader(input1))
+    chain1 = list(itertools.chain(*l1))
+    l2 = list(csv.reader(input2))
+    chain2 = list(itertools.chain(*l2))
+    l3 = set(chain2) - set(chain1)
+    wr = csv.writer(myfile, delimiter=',')
+    for item in l3:
+        wr.writerow([item])
```
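
The rewritten block replaces the earlier quadratic list comprehension (`[x for x in l1 if x not in l2]`) with a set difference, which runs in roughly linear time over the two ID lists. A standalone sketch of the same logic, assuming one song ID per CSV row; the paths are shortened for illustration:

```python
import csv
import itertools

# Flatten each one-column CSV into a set of song IDs (paths illustrative).
with open('sid_mismatches_songID.txt', 'rb') as input1, \
     open('train_triplets_songID.csv', 'rb') as input2, \
     open('echonest_songID.txt', 'wb') as out:
    mismatched = set(itertools.chain(*csv.reader(input1)))
    all_songs = set(itertools.chain(*csv.reader(input2)))
    writer = csv.writer(out, delimiter=',')
    for song_id in all_songs - mismatched:  # keep IDs with no known mismatch
        writer.writerow([song_id])
```

As in the committed code, the output order is arbitrary because sets are unordered; that is harmless here, since the surviving song IDs are looked up individually in the next step.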