diff Code/content_based.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents
children e4bcfe00abf4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/content_based.py	Sun Aug 23 16:47:54 2015 +0100
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Aug 19 11:58:19 2015
+
+@author: paulochiliguano
+"""
+
+import cPickle as pickle
+from math import sqrt
+import numpy as np
+import pandas as pd
+
+# Item-vector dictionary: song ID -> genre-probability vector
+with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
+          'genre_classification/genre_prob.pkl', 'rb') as f:
+    song_library = pickle.load(f)
+
+# Load training and test data (cross-validation folds)
+with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
+          'cross_validation.pkl', 'rb') as f:
+    users_train, users_test = pickle.load(f)
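+# As used below, each element of users_train/users_test is one
+# cross-validation fold mapping user -> {song: rating}.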
+
+# Adjusted cosine similarity
+def adj_cos_sim(vector_i, vector_j):
+    """Similarity between two item vectors, each centred on its own
+    mean; this equals the Pearson correlation of the two vectors."""
+    avrg_w_i = float(sum(vector_i)) / len(vector_i)
+    avrg_w_j = float(sum(vector_j)) / len(vector_j)
+    num = sum(
+        (w_i - avrg_w_i) * (w_j - avrg_w_j)
+        for w_i, w_j in zip(vector_i, vector_j)
+    )
+    dem1 = sum((w_i - avrg_w_i) ** 2 for w_i in vector_i)
+    dem2 = sum((w_j - avrg_w_j) ** 2 for w_j in vector_j)
+    return num / (sqrt(dem1) * sqrt(dem2))
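+# Quick sanity check with made-up vectors (not from the dataset):
+#   adj_cos_sim([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])  # -> 1.0
+#   adj_cos_sim([1.0, 2.0, 3.0], [3.0, 2.0, 1.0])  # -> -1.0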
+
+def computeNearestNeighbor(itemName, itemVector, items):
+    """Create a list of (similarity, item) pairs sorted most similar
+    first. (Unused helper; build_model_cb supersedes it.)"""
+    similarities = []
+    for otherItem in items:
+        if otherItem != itemName:
+            sim = adj_cos_sim(itemVector, items[otherItem])
+            similarities.append((sim, otherItem))
+    # sort on similarity -- most similar first
+    similarities.sort(reverse=True)
+    return similarities
+
+def nearest_neighbours(song, train_songs, N):
+    """Return (similarity, song) pairs against every song in
+    train_songs, most similar first. (Unused helper.)"""
+    similarities = []
+    for k in train_songs:
+        sim = adj_cos_sim(song_library[song], song_library[k])
+        similarities.append((sim, k))
+    similarities.sort(reverse=True)
+    return similarities
+
+def build_model_cb(song_library, k=30):
+    """Build the item-item model: map each song to its k most similar
+    songs as a list of (similarity, song) pairs, most similar first."""
+    other_songs = song_library.keys()
+    similarity_matrix = {}
+    for song in song_library:
+        similarities = []
+        for other in other_songs:
+            if other != song:
+                sim = adj_cos_sim(song_library[song], song_library[other])
+                similarities.append((sim, other))
+        similarities.sort(reverse=True)
+        similarity_matrix[song] = similarities[0:k]
+    return similarity_matrix
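+# Shape sketch with a toy two-genre library (hypothetical IDs and
+# vectors); each song keeps its k most similar neighbours:
+#   toy = {'a': [0.9, 0.1], 'b': [0.8, 0.2], 'c': [0.1, 0.9]}
+#   build_model_cb(toy, k=1)
+#   # -> {'a': [(1.0, 'b')], 'b': [(1.0, 'a')], 'c': [(-1.0, 'b')]}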
+
+
+def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
+    """Recommend the N candidate songs with the highest summed
+    (normalised) similarity to the songs the user liked."""
+    candidate = pd.DataFrame()
+    entries = song_rating.keys()
+    for song, rating in song_rating.iteritems():
+        if rating > rating_threshold:
+            sim = sim_matrix[song]
+            list_a = [k for v, k in sim]
+            raw = [v for v, k in sim]
+            # Normalise by the largest similarity for this seed song
+            sim_norm = [float(i) / max(raw) for i in raw]
+            the_dict = dict(zip(list_a, sim_norm))
+            # Drop candidates the user has already rated
+            for key in entries:
+                if key in the_dict:
+                    del the_dict[key]
+            candidate_aux = pd.DataFrame(
+                the_dict.items(),
+                columns=['song', 'similarity']
+            )
+            candidate = candidate.append(candidate_aux, ignore_index=True)
+    # Sum each candidate's scores across seed songs and rank descending
+    topN = candidate.groupby('song')['similarity'].sum()
+    topN.sort(ascending=False)
+    return list(topN.head(N).keys())
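+# Ranking intuition (made-up numbers): a candidate neighbouring two
+# liked songs with normalised similarities 0.9 and 0.8 scores 1.7,
+# outranking a candidate seen once with 0.95.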
+
+def evaluate_cb(topN, test_data, rating_threshold=3):
+    """Compute precision, recall, F1 and accuracy of the top-N lists
+    against the held-out test ratings."""
+    tp = 0.
+    fp = 0.
+    fn = 0.
+    tn = 0.
+    for user, song_rating in test_data.iteritems():
+        entries = topN[user]
+        for song, rating in song_rating.iteritems():
+            if song in entries:
+                # Recommended item: hit if the user actually liked it
+                if rating > rating_threshold:
+                    tp += 1
+                else:
+                    fp += 1
+            else:
+                # Non-recommended item: miss if the user liked it
+                if rating > rating_threshold:
+                    fn += 1
+                else:
+                    tn += 1
+    print tp, fp, fn, tn
+    precision = tp / (tp + fp)
+    recall = tp / (tp + fn)
+    F1 = 2 * precision * recall / (precision + recall)
+    accuracy = (tp + tn) / (tp + fp + tn + fn)
+    return precision, recall, F1, accuracy
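+# Worked example with made-up counts: tp=30, fp=10, fn=20, tn=40 gives
+# precision = 30/40 = 0.75, recall = 30/50 = 0.60,
+# F1 = 0.9/1.35 = 0.667 and accuracy = 70/100 = 0.70.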
+  
+# Build the item-item model once, then evaluate on every
+# cross-validation fold
+sim_matrix = build_model_cb(song_library, 30)
+p = np.array([])
+f = np.array([])
+r = np.array([])
+a = np.array([])
+for i in range(len(users_train)):
+    # Generate top-N recommendations for every user in this fold
+    topN = {}
+    for user, song_rating in users_train[i].iteritems():
+        topN[user] = top_n(sim_matrix, user, song_rating)
+    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])
+    p = np.append(p, pi)
+    r = np.append(r, ri)
+    f = np.append(f, fi)
+    a = np.append(a, ai)
+
+print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
+print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
+print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
+print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())