changeset 18:c0a08cbdfacd

First script
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Tue, 28 Jul 2015 20:58:57 +0100
parents c63dac455296
children f1504bb2c552
files Code/read_songID.py
diffstat 1 files changed, 42 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/read_songID.py	Tue Jul 28 20:58:57 2015 +0100
@@ -0,0 +1,42 @@
+import csv
+import pandas as pd
+import itertools
+
+# Read songIDs from Million Song Dataset songID-trackID mismatches 
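+# Each line of sid_mismatches.txt is expected to look like "ERROR: <songID trackID> ...",
+# so characters 8:26 hold the 18-character songID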
+with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
+    writer = csv.writer(out, delimiter=',')
+    for line in f:
+        writer.writerow([line[8:26]])
+
+# Read unique songIDs from Taste Profile dataset
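+# train_triplets.txt rows are assumed to be whitespace-separated: userID, songID, play count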
+location = r'~/dataset/train_triplets.txt'
+df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'])
+ddf = df.drop_duplicates(subset='song')
+ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv', columns=['song'], header=False, index=False)
+
+# Remove songID mismatches from the Taste Profile dataset
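+# Keep only the songIDs that appear in the Taste Profile list but not in the mismatch list (set difference)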
+with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
+    l1 = list(csv.reader(input1))
+    chain1 = list(itertools.chain(*l1))
+    l2 = list(csv.reader(input2))
+    chain2 = list(itertools.chain(*l2))
+    l3 = set(chain2) - set(chain1)
+    wr = csv.writer(myfile, delimiter=',')
+    for item in l3:
+        wr.writerow([item])
+
+# Save Taste Profile dataset without songID mismatches
+mdf = df[df.song.isin(l3)]
+mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', header=False, index=False)
+
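+# Build a user-by-song play-count matrix chunk by chunk; appending the pivoted chunks with
+# ignore_index=True drops the user labels, and a user whose triplets span two chunks ends up
+# in separate rows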
+result = pd.DataFrame()
+for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory=False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None):
+    chunk = chunk[chunk.song.isin(l3)]
+    result = result.append(chunk.pivot(index='user', columns='song', values='plays'), ignore_index=True)
+    print(result.shape)
+
+    
\ No newline at end of file
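
A minimal sketch of the same aggregation done in a single pass with pivot, for reference only: it assumes the filtered triplets fit in memory and reuses the paths above; the variable names are illustrative and not part of the changeset.

import pandas as pd

# Load the filtered triplets written by read_songID.py
triplets = pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
                       names=['user', 'song', 'plays'], header=None)

# One row per user, one column per song, play counts as values (NaN where unplayed);
# (user, song) pairs are unique in the triplets, so pivot does not raise on duplicates
matrix = triplets.pivot(index='user', columns='song', values='plays')
print(matrix.shape)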