diff Code/content_based.py @ 26:e4bcfe00abf4

Final version of code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Wed, 26 Aug 2015 02:00:48 +0100
parents fafc0b249a73
children ae650489d3a8
line wrap: on
line diff
--- a/Code/content_based.py	Sun Aug 23 16:47:54 2015 +0100
+++ b/Code/content_based.py	Wed Aug 26 02:00:48 2015 +0100
@@ -9,6 +9,7 @@
 from math import sqrt
 import numpy as np
 import pandas as pd
+import time
 
 # Item-vector dictionary
 f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
@@ -16,6 +17,18 @@
 song_library = pickle.load(f)
 f.close()
 
+# Normalisation
+#test = []
+#for k, v in song_library.iteritems():
+#    test.append(v)
+#test = np.array(test)
+#test_median = np.median(test, axis=0)
+#test_abs = abs(test - test_median)
+#test_asd = test_abs.sum(axis=0) / test.shape[0]
+#for k, v in song_library.iteritems():
+#    modified_standard_score = (np.array(v) - test_median) / test_asd
+#    song_library[k] = modified_standard_score.tolist()
+
 # Load training and test data
 f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
 cross_validation.pkl', 'rb')
@@ -35,45 +48,26 @@
     dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
     return num / (sqrt(dem1) * sqrt(dem2))
 
-def computeNearestNeighbor(itemName, itemVector, items):
-    """creates a sorted list of items based on their distance to item"""
-    similarities = []
-    for otherItem in items:
-        if otherItem != itemName:
-            sim = adj_cos_sim(itemVector, items[otherItem])
-            similarities.append((sim, otherItem))
-    # sort based on distance -- closest first
-    similarities.sort(reverse=True)
-    #if len(similarities) > N:
-        #similarities = similarities[0:N]
-    return similarities
-
-def nearest_neighbours(song, train_songs, N):
-    similarities = []
-    for k in train_songs:
-        sim = adj_cos_sim(song_library[song], song_library[k])
-        similarities.append((sim, k))
-    similarities.sort(reverse=True)
-    #if len(similarities) > N:
-        #similarities = similarities[0:N] 
-    return similarities
-    #return {t[1]: t[0] for t in similarities}
-
-def build_model_cb(song_library, k=30):
-    other_songs = song_library.keys()
+def build_model_cb(train_data, k=30):  # returns {song: [(similarity, other_song), ...]} with at most k entries per song
+    a = []  # song IDs gathered from every user, duplicates included
+    for user, info in train_data.iteritems():  # iterating info yields that user's song IDs (presumably a song -> rating dict; confirm against caller)
+        a.extend([i for i in info])
+    songIDs = list(set(a))  # de-duplicate the collected IDs
+    # Candidates are limited to songs found in train_data (an earlier revision used song_library.keys()); the module-level song_library still supplies the item vectors.
+    
     similarity_matrix = {}
-    for song in song_library:
+    for song in songIDs:
         similarities = []
-        for other in other_songs:
+        for other in songIDs:  # score every other training song against the current one
             if other != song:
                 sim = adj_cos_sim(song_library[song], song_library[other])
                 similarities.append((sim, other))
         similarities.sort(reverse=True)  # highest similarity first; equal scores fall back to comparing song IDs
         similarity_matrix[song] = similarities[0:k]  # keep the k best (fewer if fewer candidates exist)
+    
     return similarity_matrix
         #similarity_rows[song] = {t[1]: t[0] for t in similarities}
 
-
 def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10): 
     candidate = pd.DataFrame()
     entries = song_rating.keys()
@@ -99,7 +93,7 @@
     
     return list(topN.head(N).keys())
 
-def evaluate_cb(topN, test_data, rating_threshold=3):    
+def evaluate_cb(topN, test_data, rating_threshold=2):    
     
     tp = 0.
     fp = 0.
@@ -119,22 +113,35 @@
                 elif rating <= rating_threshold:
                     tn += 1
     print tp, fp, fn, tn
-    precision = tp / (tp + fp)
-    recall = tp / (tp + fn)
-    F1 = 2 * precision * recall / (precision + recall)
+    if tp != 0:
+        precision = tp / (tp + fp)
+        recall = tp / (tp + fn)
+        F1 = 2 * precision * recall / (precision + recall)
+    else:
+        precision = 0
+        recall = 0
+        F1 = 0
+    
     accuracy = (tp + tn) / (tp + fp + tn + fn)
     
     return precision, recall, F1, accuracy
-  
-sim_matrix = build_model_cb(song_library, 30)
+
 p = np.array([])
 f = np.array([])
 r = np.array([])
 a = np.array([])
+
 for i in range(len(users_train)):
+ 
+    start_time = time.time()
+    sim_matrix = build_model_cb(users_train[i])
+    
     topN = {}
     for user, song_rating in users_train[i].iteritems():
         topN[user] = top_n(sim_matrix, user, song_rating)
+    elapsed_time = time.time() - start_time
+    print 'Training execution time: %.3f seconds' % elapsed_time
+        
     pi, ri, fi, ai = evaluate_cb(topN, users_test[i])
     
     p = np.append(p, pi)
@@ -142,7 +149,6 @@
     f = np.append(f, fi)
     a = np.append(a, ai)
     
-    
 print "Precision = %.3f ± %.3f" % (p.mean(), p.std())  # mean and std over the per-split scores; %.3f = three decimals
 print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
 print "F1 = %.3f ± %.3f" % (f.mean(), f.std())