# HG changeset patch
# User Paulo Chiliguano
# Date 1438113537 -3600
# Node ID c0a08cbdfacd040f68af9183878189a88f1a0cc8
# Parent c63dac455296239fcc5164fb424d286eacd84baf
First script
diff -r c63dac455296 -r c0a08cbdfacd Code/read_songID.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/read_songID.py Tue Jul 28 20:58:57 2015 +0100
@@ -0,0 +1,42 @@
"""Filter Million Song Dataset songID/trackID mismatches out of the
Taste Profile triplets and pivot the result into a user x song
play-count matrix (printed shape only; matrix is not persisted)."""
import csv
import itertools

import pandas as pd

# Extract songIDs from the Million Song Dataset mismatch list.
# Each line embeds the 18-character song ID at character positions 8:26
# (assumes the upstream sid_mismatches.txt line layout — TODO confirm).
# Text mode with newline='' is what the csv module requires; the original
# 'rb'/'wb' handles break csv on Python 3, and the old
# `next = f.readline(); while next != ""` loop both shadowed the builtin
# next() and never terminated on Python 3 (b"" != "" is always True).
with open('/homes/pchilguano/dataset/sid_mismatches.txt') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_songID.txt',
             'w', newline='') as out:
    writer = csv.writer(out, delimiter=',')
    for line in f:
        writer.writerow([line[8:26]])

# Read unique songIDs from the Taste Profile dataset.
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, delim_whitespace=True, header=None,
                 names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
           columns=['song'], header=False, index=False)

# Remove mismatched songIDs from the Taste Profile song list and write
# the surviving IDs. chain.from_iterable flattens the one-column CSV rows
# without building intermediate lists.
with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt',
          newline='') as input1, \
        open('/homes/pchilguano/dataset/train_triplets_songID.csv',
             newline='') as input2, \
        open('/homes/pchilguano/dataset/echonest_songID.txt',
             'w', newline='') as myfile:
    mismatched = set(itertools.chain.from_iterable(csv.reader(input1)))
    all_songs = set(itertools.chain.from_iterable(csv.reader(input2)))
    l3 = all_songs - mismatched  # valid (non-mismatched) songIDs
    wr = csv.writer(myfile, delimiter=',')
    for item in l3:
        wr.writerow([item])

# Save the Taste Profile dataset without songID mismatches.
mdf = df[df.song.isin(l3)]
mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
           header=False, index=False)

# Pivot chunk by chunk into a user x song play-count matrix.
# Collect the pivoted chunks and concat once at the end: the original
# `result = result.append(chunk...)` inside the loop re-copied the
# accumulated frame every iteration (quadratic), and DataFrame.append
# was removed in pandas 2.x.
pivoted_chunks = []
for chunk in pd.read_csv(
        '/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
        low_memory=False, chunksize=10000,
        names=['user', 'song', 'plays'], header=None):
    chunk = chunk[chunk.song.isin(l3)]
    pivoted_chunks.append(
        chunk.pivot(index='user', columns='song', values='plays'))
result = pd.concat(pivoted_chunks, ignore_index=True)
print(result.shape)
\ No newline at end of file