# HG changeset patch
# User Paulo Chiliguano
# Date 1437166338 -3600
# Node ID 38f44dd7e54bd562d07084435c064f0dd79c6dec
# Parent cc503565339e541d271016aa92c7ad2a43bc0063
Create a subset from the Taste Profile dataset.
Retrieve mp3 previews from 7digital successfully.
Note: some errors remain due to Unicode encoding of artist and song metadata.
diff -r cc503565339e -r 38f44dd7e54b Code/preview_clip.py
--- a/Code/preview_clip.py Wed Jul 15 19:25:16 2015 +0100
+++ b/Code/preview_clip.py Fri Jul 17 21:52:18 2015 +0100
@@ -8,12 +8,18 @@
import csv
import time
from pyechonest import song, config
+import oauth2 as oauth
+import urllib2
+
+consumer_key = '7ds28qendsk9'
+consumer_secret = 'm5nsktn3hu6x45cy'
+consumer = oauth.Consumer(consumer_key, consumer_secret)
config.ECHO_NEST_API_KEY="LINDFDUTQZQ781IE8"
-with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as input, open('/homes/pchilguano/dataset/test_echonest_url.txt', 'wb') as output:
+with open('/homes/pchilguano/dataset/test_echonest_songID.txt', 'rb') as input, open('/homes/pchilguano/dataset/7digital_url.txt', 'wb') as output:
writer = csv.writer(output, delimiter=',')
- #for i in xrange(1412):
- # input.readline()
+ for i in xrange(3867):
+ input.readline()
next = input.readline()
while next != "":
try:
@@ -23,13 +29,36 @@
next = input.readline()
else:
time.sleep(3)
- ss_tracks = s.get_tracks('7digital-UK')
- time.sleep(3)
- #print(len(ss_tracks))
- if len(ss_tracks) != 0:
- ss_track = ss_tracks[0]
- preview_url = ss_track.get('preview_url')
- print([next[:-2], preview_url])
- writer.writerow([next[:-2], preview_url])
- next = input.readline()
+ try:
+ ss_tracks = s.get_tracks('7digital-UK')
+ except:
+ time.sleep(3)
+ next = input.readline()
+ else:
+ #print(len(ss_tracks))
+ if len(ss_tracks) != 0:
+ ss_track = ss_tracks[0]
+ preview_url = ss_track.get('preview_url')
+
+ req = oauth.Request(method="GET", url=preview_url, is_form_encoded=True)
+ req['oauth_timestamp'] = oauth.Request.make_timestamp()
+ req['oauth_nonce'] = oauth.Request.make_nonce()
+ req['country'] = "GB"
+ sig_method = oauth.SignatureMethod_HMAC_SHA1()
+ req.sign_request(sig_method, consumer, token=None)
+
+ try:
+ response = urllib2.urlopen(req.to_url())
+ except:
+ time.sleep(22)
+ print([next[:-2], 'NA'])
+ writer.writerow([next[:-2], 'NA', s.artist_name, s.title])
+ else:
+ time.sleep(22)
+ print([next[:-2], preview_url, s.artist_name, s.title])
+ writer.writerow([next[:-2], preview_url, s.artist_name.encode("utf-8"), s.title.encode("utf-8")])
+ with open(next[:-2]+'.mp3', 'wb') as songfile:
+ songfile.write(response.read())
+
+ next = input.readline()
\ No newline at end of file
diff -r cc503565339e -r 38f44dd7e54b Code/read_taste_profile.py
--- a/Code/read_taste_profile.py Wed Jul 15 19:25:16 2015 +0100
+++ b/Code/read_taste_profile.py Fri Jul 17 21:52:18 2015 +0100
@@ -1,23 +1,9 @@
-import os
import csv
import pandas as pd
-import numpy as np
import itertools
-import time
-
-# List of h5 files (audio streams)
-#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
-# writer = csv.writer(out, delimiter=',')
-# for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
-# for file in files:
-# if file.endswith(".h5"):
-# #print(os.path.join(root, file))
-# track = ''.join(['SO',str(file)[2:-3]])
-# print(track)
-# writer.writerow([track])
# Read songIDs from Million Song Dataset songID-trackID mismatches
-with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'wb') as out:
+with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
writer = csv.writer(out, delimiter=',')
next = f.readline()
while next != "":
@@ -29,52 +15,15 @@
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'])
ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('train_triplets_song.csv',columns=['song'], header=False, index=False)
+ddf.to_csv('train_triplets_songID.csv',columns=['song'], header=False, index=False)
-with open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_song.csv', 'rb') as input2, open('/homes/pchilguano/dataset/test_echonest1.txt', 'wb') as output:
- reader1 = csv.reader(input1)
- reader2 = csv.reader(input2)
- l1 = list(reader1)
- l1.sort()
- l2 = list(reader2)
- l2.sort()
- l3 = [x for x in l1 if x not in l2]
-
-
-with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as f:
- reader = csv.reader(f)
- your_list = list(reader)
- your_list.sort()
- chain = list(itertools.chain(*your_list))
-
-
-#store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
-
-chunksize = 20000
-count = 0
-frame = pd.DataFrame()
-for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
- chunk = chunk.sort(columns='song')
- chunk = chunk[chunk['song'].isin(chain)]
- #frame = chunk.query('song == your_list')
- frame = frame.append(chunk.pivot(index='user', columns='song', values='plays'), ignore_index=True)
- count = count + 1
- print([count, frame.shape])
- #for item in your_list:
- # chunk = chunk[chunk['song'].isin(item)]
- #store.append('df', chunk[chunk['song'].isin(item)])
-#store.close()
-
-
-
-
-df = store['df']
-chunks = (df.groupby().sum() for df in store.select('df', chunksize=500))
-res = next(chunks) # will raise if there are no chunks!
-for c in chunks:
- res = res.add(c, fill_value=0)
-
-
-sdf = df.to_sparse()
-
-
+# Delete songIDs mismatches from Taste Profile Dataset
+with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
+ l1 = list(csv.reader(input1))
+ chain1 = list(itertools.chain(*l1))
+ l2 = list(csv.reader(input2))
+ chain2 = list(itertools.chain(*l2))
+ l3 = set(chain2) - set(chain1)
+ wr = csv.writer(myfile, delimiter=',')
+ for item in l3:
+ wr.writerow([item])