# HG changeset patch
# User Paulo Chiliguano
# Date 1437166338 -3600
# Node ID 38f44dd7e54bd562d07084435c064f0dd79c6dec
# Parent cc503565339e541d271016aa92c7ad2a43bc0063
Create a subset from the Taste Profile dataset.
Retrieve mp3 previews from 7digital successfully.
Note: some errors remain due to Unicode encoding of artist and song metadata.
diff -r cc503565339e -r 38f44dd7e54b Code/preview_clip.py
--- a/Code/preview_clip.py Wed Jul 15 19:25:16 2015 +0100
+++ b/Code/preview_clip.py Fri Jul 17 21:52:18 2015 +0100
@@ -8,12 +8,18 @@
import csv
import time
from pyechonest import song, config
+import oauth2 as oauth
+import urllib2
+
+consumer_key = '7ds28qendsk9'
+consumer_secret = 'm5nsktn3hu6x45cy'
+consumer = oauth.Consumer(consumer_key, consumer_secret)
config.ECHO_NEST_API_KEY="LINDFDUTQZQ781IE8"
-with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as input, open('/homes/pchilguano/dataset/test_echonest_url.txt', 'wb') as output:
+with open('/homes/pchilguano/dataset/test_echonest_songID.txt', 'rb') as input, open('/homes/pchilguano/dataset/7digital_url.txt', 'wb') as output:
writer = csv.writer(output, delimiter=',')
- #for i in xrange(1412):
- # input.readline()
+ for i in xrange(3867):
+ input.readline()
next = input.readline()
while next != "":
try:
@@ -23,13 +29,36 @@
next = input.readline()
else:
time.sleep(3)
- ss_tracks = s.get_tracks('7digital-UK')
- time.sleep(3)
- #print(len(ss_tracks))
- if len(ss_tracks) != 0:
- ss_track = ss_tracks[0]
- preview_url = ss_track.get('preview_url')
- print([next[:-2], preview_url])
- writer.writerow([next[:-2], preview_url])
- next = input.readline()
+ try:
+ ss_tracks = s.get_tracks('7digital-UK')
+ except:
+ time.sleep(3)
+ next = input.readline()
+ else:
+ #print(len(ss_tracks))
+ if len(ss_tracks) != 0:
+ ss_track = ss_tracks[0]
+ preview_url = ss_track.get('preview_url')
+
+ req = oauth.Request(method="GET", url=preview_url, is_form_encoded=True)
+ req['oauth_timestamp'] = oauth.Request.make_timestamp()
+ req['oauth_nonce'] = oauth.Request.make_nonce()
+ req['country'] = "GB"
+ sig_method = oauth.SignatureMethod_HMAC_SHA1()
+ req.sign_request(sig_method, consumer, token=None)
+
+ try:
+ response = urllib2.urlopen(req.to_url())
+ except:
+ time.sleep(22)
+ print([next[:-2], 'NA'])
+ writer.writerow([next[:-2], 'NA', s.artist_name, s.title])
+ else:
+ time.sleep(22)
+ print([next[:-2], preview_url, s.artist_name, s.title])
+ writer.writerow([next[:-2], preview_url, s.artist_name.encode("utf-8"), s.title.encode("utf-8")])
+ with open(next[:-2]+'.mp3', 'wb') as songfile:
+ songfile.write(response.read())
+
+ next = input.readline()
\ No newline at end of file
diff -r cc503565339e -r 38f44dd7e54b Code/read_taste_profile.py
--- a/Code/read_taste_profile.py Wed Jul 15 19:25:16 2015 +0100
+++ b/Code/read_taste_profile.py Fri Jul 17 21:52:18 2015 +0100
@@ -1,23 +1,9 @@
-import os
import csv
import pandas as pd
-import numpy as np
import itertools
-import time
-
-# List of h5 files (audio streams)
-#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
-# writer = csv.writer(out, delimiter=',')
-# for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
-# for file in files:
-# if file.endswith(".h5"):
-# #print(os.path.join(root, file))
-# track = ''.join(['SO',str(file)[2:-3]])
-# print(track)
-# writer.writerow([track])
# Read songIDs from Million Song Dataset songID-trackID mismatches
-with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'wb') as out:
+with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
writer = csv.writer(out, delimiter=',')
next = f.readline()
while next != "":
@@ -29,52 +15,15 @@
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'])
ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('train_triplets_song.csv',columns=['song'], header=False, index=False)
+ddf.to_csv('train_triplets_songID.csv',columns=['song'], header=False, index=False)
-with open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_song.csv', 'rb') as input2, open('/homes/pchilguano/dataset/test_echonest1.txt', 'wb') as output:
- reader1 = csv.reader(input1)
- reader2 = csv.reader(input2)
- l1 = list(reader1)
- l1.sort()
- l2 = list(reader2)
- l2.sort()
- l3 = [x for x in l1 if x not in l2]
-
-
-with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as f:
- reader = csv.reader(f)
- your_list = list(reader)
- your_list.sort()
- chain = list(itertools.chain(*your_list))
-
-
-#store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
-
-chunksize = 20000
-count = 0
-frame = pd.DataFrame()
-for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
- chunk = chunk.sort(columns='song')
- chunk = chunk[chunk['song'].isin(chain)]
- #frame = chunk.query('song == your_list')
- frame = frame.append(chunk.pivot(index='user', columns='song', values='plays'), ignore_index=True)
- count = count + 1
- print([count, frame.shape])
- #for item in your_list:
- # chunk = chunk[chunk['song'].isin(item)]
- #store.append('df', chunk[chunk['song'].isin(item)])
-#store.close()
-
-
-
-
-df = store['df']
-chunks = (df.groupby().sum() for df in store.select('df', chunksize=500))
-res = next(chunks) # will raise if there are no chunks!
-for c in chunks:
- res = res.add(c, fill_value=0)
-
-
-sdf = df.to_sparse()
-
-
+# Delete songIDs mismatches from Taste Profile Dataset
+with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
+ l1 = list(csv.reader(input1))
+ chain1 = list(itertools.chain(*l1))
+ l2 = list(csv.reader(input2))
+ chain2 = list(itertools.chain(*l2))
+ l3 = set(chain2) - set(chain1)
+ wr = csv.writer(myfile, delimiter=',')
+ for item in l3:
+ wr.writerow([item])