changeset 14:c63dac455296

Forgotten WMF (weighted matrix factorization) code
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Tue, 21 Jul 2015 11:54:30 +0100
parents 0a0d6203638a
children 2e3c57fba632 c0a08cbdfacd
files Code/latent_vectors.py Code/read_taste_profile.py
diffstat 2 files changed, 45 insertions(+), 29 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/latent_vectors.py	Tue Jul 21 11:54:30 2015 +0100
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 20 13:37:43 2015
+
+@author: Paulo
+"""
+
+
+import pandas as pd
+import csv
+import itertools
+import numpy as np
+import wmf
+
+# Read song IDs of the downloaded audio clips
+with open('/homes/pchilguano/dataset/audio_files.txt', 'rb') as input1:
+    available = list(csv.reader(input1))
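+    # Flatten the single-column CSV rows into one list of song IDs.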
+    chain1 = list(itertools.chain(*available))
+
+# Build a sparse user-item play-count matrix
+result = pd.DataFrame()
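+# Stream the triplets file in 10,000-row chunks to bound memory use.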
+for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
+                         low_memory=False, chunksize=10000,
+                         names=['user', 'song', 'plays'], header=None):
+    chunk = chunk[chunk.song.isin(chain1)]
+    result = result.append(chunk.pivot(index='user', columns='song', values='plays'),
+                           ignore_index=True)
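+    # NB: ignore_index discards the user index, so a user whose plays
+    # span two chunks ends up on two separate rows of the result.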
+    print(result.shape)
+sresult = result.to_sparse()
+sresult.to_pickle('/homes/pchilguano/dataset/taste_profile_sparse.pkl')
+
+# Weighted matrix factorization (WMF)
+B = np.load("test_matrix.pkl")
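+# wmf expects a scipy.sparse play-count matrix here, so the pickle
+# presumably holds one in that format.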
+S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)
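+# The surplus confidence is roughly alpha * log(1 + plays / epsilon)
+# applied to the nonzero entries.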
+U, V = wmf.factorize(S, num_factors=40, lambda_reg=1e-5, num_iterations=2,
+                     init_std=0.01, verbose=True, dtype='float32',
+                     recompute_factors=wmf.recompute_factors_bias)
\ No newline at end of file
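
A minimal sketch of how the factors computed above could be used, assuming U and V are the (users x factors) and (songs x factors) arrays returned by wmf.factorize; the names below are illustrative:

    import numpy as np

    # Predicted preference of every user for every song; higher means
    # the model expects more plays.
    scores = np.dot(U, V.T)

    # Indices of the ten highest-scoring songs for the first user.
    top_songs = np.argsort(-scores[0])[:10]
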
--- a/Code/read_taste_profile.py	Tue Jul 21 10:03:34 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,29 +0,0 @@
-import csv
-import pandas as pd
-import itertools
-
-# Read songIDs from Million Song Dataset songID-trackID mismatches 
-with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
-    writer = csv.writer(out, delimiter=',')
-    next = f.readline()
-    while next != "":
-        writer.writerow([next[8:26]])
-        #print(next[8:26])
-        next = f.readline()
-
-# Read unique songIDs from Taste Profile dataset
-location = r'~/dataset/train_triplets.txt'		
-df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'])
-ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('train_triplets_songID.csv',columns=['song'], header=False, index=False)
-
-# Delete songIDs mismatches from Taste Profile Dataset 
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
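
For reference, the removed script boils down to a set difference between the unique Taste Profile song IDs and the known songID-trackID mismatches; a compact sketch of that step under the same file layout (song ID in characters 8-26 of each mismatch line):

    import pandas as pd

    # Song IDs flagged as songID-trackID mismatches.
    with open('/homes/pchilguano/dataset/sid_mismatches.txt') as f:
        mismatched = {line[8:26] for line in f}

    # Unique song IDs appearing in the Taste Profile triplets.
    triplets = pd.read_csv('~/dataset/train_triplets.txt', delim_whitespace=True,
                           header=None, names=['user', 'song', 'plays'])
    clean_ids = set(triplets['song'].unique()) - mismatched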