comparison Code/content_based.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents
children e4bcfe00abf4
comparison
equal deleted inserted replaced
24:68a62ca32441 25:fafc0b249a73
1 # -*- coding: utf-8 -*-
2 """
3 Created on Wed Aug 19 11:58:19 2015
4
5 @author: paulochiliguano
6 """
7
8 import cPickle as pickle
9 from math import sqrt
10 import numpy as np
11 import pandas as pd
12
# Item-vector dictionary: maps each song id to its vector of genre weights.
# NOTE(security): unpickling can execute arbitrary code; only load files
# produced by this project.
# The context manager closes the file even if unpickling fails.
with open(
    '/Users/paulochiliguano/Documents/msc-project/dataset/'
    'genre_classification/genre_prob.pkl',
    'rb'
) as f:
    song_library = pickle.load(f)
18
# Load training and test data (one entry per cross-validation fold).
# NOTE(security): unpickling can execute arbitrary code; only load files
# produced by this project.
# The context manager closes the file even if unpickling fails.
with open(
    '/Users/paulochiliguano/Documents/msc-project/dataset/'
    'cross_validation.pkl',
    'rb'
) as f:
    users_train, users_test = pickle.load(f)
24
# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Return the mean-centred cosine similarity of two weight vectors.

    Each vector is centred on its own mean before the cosine of the angle
    between them is computed.

    Args:
        vector_i: sequence of numeric weights.
        vector_j: sequence of numeric weights, same length as ``vector_i``.

    Returns:
        The similarity as a float. ``0.0`` is returned when the similarity
        is undefined, i.e. when the vectors are empty or when either vector
        has zero variance (all of its weights are equal).

    Raises:
        ValueError: if the two vectors differ in length.
    """
    size = len(vector_i)
    if size != len(vector_j):
        raise ValueError(
            "vectors must have the same length (%d != %d)"
            % (size, len(vector_j))
        )
    if size == 0:
        # No components to compare: similarity is undefined.
        return 0.0

    avrg_w_i = float(sum(vector_i)) / size
    avrg_w_j = float(sum(vector_j)) / size
    dev_i = [w_i - avrg_w_i for w_i in vector_i]
    dev_j = [w_j - avrg_w_j for w_j in vector_j]

    num = sum(d_i * d_j for d_i, d_j in zip(dev_i, dev_j))
    dem1 = sum(d_i ** 2 for d_i in dev_i)
    dem2 = sum(d_j ** 2 for d_j in dev_j)

    denominator = sqrt(dem1) * sqrt(dem2)
    if denominator == 0:
        # A constant vector has no direction once centred; avoid dividing
        # by zero and report "no similarity" instead.
        return 0.0
    return num / denominator
37
def computeNearestNeighbor(itemName, itemVector, items):
    """Rank every other item by its similarity to the given item.

    Args:
        itemName: key of the reference item; it is excluded from the result.
        itemVector: weight vector of the reference item.
        items: mapping of item key -> weight vector.

    Returns:
        A list of ``(similarity, item_key)`` tuples sorted in descending
        order, so the most similar item comes first.
    """
    ranked = [
        (adj_cos_sim(itemVector, items[candidate]), candidate)
        for candidate in items
        if candidate != itemName
    ]
    # Tuples compare by similarity first, then by item key.
    return sorted(ranked, reverse=True)
50
def nearest_neighbours(song, train_songs, N):
    """Rank the training songs by their similarity to ``song``.

    Vectors are looked up in the module-level ``song_library``.

    Args:
        song: key of the reference song in ``song_library``.
        train_songs: iterable of song keys to compare against.
        N: currently unused -- the result is not truncated.

    Returns:
        A list of ``(similarity, song_key)`` tuples in descending order,
        one per entry of ``train_songs``.
    """
    scored = (
        (adj_cos_sim(song_library[song], song_library[other]), other)
        for other in train_songs
    )
    return sorted(scored, reverse=True)
60 #return {t[1]: t[0] for t in similarities}
61
def build_model_cb(song_library, k=30):
    """Build the content-based model: each song's k most similar songs.

    Args:
        song_library: mapping of song key -> weight vector.
        k: number of neighbours kept per song.

    Returns:
        A dict mapping each song key to a list of at most ``k``
        ``(similarity, other_song_key)`` tuples, most similar first.
    """
    model = {}
    for song in song_library:
        ranked = sorted(
            (
                (adj_cos_sim(song_library[song], song_library[other]), other)
                for other in song_library
                if other != song
            ),
            reverse=True
        )
        model[song] = ranked[:k]
    return model
75
76
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Recommend up to N songs the user has not rated yet.

    Every song the user rated above ``rating_threshold`` acts as a seed.
    The similarities of a seed's neighbours are divided by the seed's best
    similarity, neighbours the user has already rated are dropped, and the
    scaled scores are summed per candidate song over all seeds.

    Args:
        sim_matrix: mapping of song key -> list of
            ``(similarity, other_song_key)`` tuples.
        user: unused; kept so existing call sites keep working.
        song_rating: mapping of song key -> rating given by the user.
        rating_threshold: only songs rated strictly above this are seeds.
        N: maximum number of recommendations returned.

    Returns:
        A list of song keys ordered by descending accumulated score (ties
        broken by song key). Empty when no seed yields a candidate.

    Raises:
        KeyError: if a seed song is missing from ``sim_matrix``.
    """
    scores = {}
    for song, rating in song_rating.items():
        if rating > rating_threshold:
            neighbours = sim_matrix[song]
            if not neighbours:
                continue
            # Computed once per seed rather than once per neighbour.
            best = max(value for value, _ in neighbours)
            if best <= 0:
                # Dividing by a zero maximum would fail, and dividing by a
                # negative one would invert the ranking, so a seed without
                # any positively similar neighbour contributes nothing.
                continue
            scaled = {
                other: float(value) / best
                for value, other in neighbours
                if other not in song_rating
            }
            for other, value in scaled.items():
                scores[other] = scores.get(other, 0.0) + value

    # Plain dict accumulation replaces the pandas groupby/sort, which relied
    # on DataFrame.append and Series.sort (both removed from pandas) and
    # failed when there were no candidates at all.
    ranked = sorted(scores, key=lambda key: (-scores[key], key))
    return ranked[:N]
101
def evaluate_cb(topN, test_data, rating_threshold=3):
    """Score recommendation lists against held-out ratings.

    A test song counts as relevant when its rating is strictly above
    ``rating_threshold`` and as recommended when it appears in the user's
    recommendation list.

    Args:
        topN: mapping of user -> iterable of recommended song keys.
        test_data: mapping of user -> mapping of song key -> rating.
        rating_threshold: ratings strictly above this are relevant.

    Returns:
        A tuple ``(precision, recall, F1, accuracy)``. Any metric whose
        denominator is zero is reported as ``0.0``.

    Raises:
        KeyError: if a user in ``test_data`` has no entry in ``topN``.
    """
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.items():
        # Set membership is O(1); the recommendation list is only probed.
        recommended = set(topN[user])
        for song, rating in song_rating.items():
            hit = song in recommended
            if rating > rating_threshold:
                if hit:
                    tp += 1
                else:
                    fn += 1
            elif rating <= rating_threshold:
                if hit:
                    fp += 1
                else:
                    tn += 1
    # Single pre-formatted argument: prints the same under Python 2 and 3.
    print("%s %s %s %s" % (tp, fp, fn, tn))

    # Guard each ratio: with no recommendations, no relevant songs or no
    # test data the denominators are zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0
    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy
128
# Cross-validated evaluation of the content-based recommender.
# The model is built once and reused for every fold.
sim_matrix = build_model_cb(song_library, 30)
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])
# Folds are addressed by index because users_train and users_test are
# parallel containers: fold i of one pairs with fold i of the other.
for i in range(len(users_train)):
    topN = {}
    for user, song_rating in users_train[i].items():
        topN[user] = top_n(sim_matrix, user, song_rating)
    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])

    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)


# Mean and standard deviation over the folds, three decimals each.
# "%.3f" replaces the original "%f3", which printed six decimals followed
# by a literal "3".
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))
150
151 # set_C = {t[0]: t[1] for t in candidate}
152 # for song in set_C:
153 # sim = sim_matrix[song]
154 # the_dict = {t[1]: t[0] for t in sim}
155 # for key in entries:
156 # if key in the_dict:
157 # the_dict[key]