p@25
|
1 # -*- coding: utf-8 -*-
|
p@25
|
2 """
|
p@25
|
3 Created on Wed Aug 19 11:58:19 2015
|
p@25
|
4
|
p@25
|
5 @author: paulochiliguano
|
p@25
|
6 """
|
p@25
|
7
|
p@25
|
8 import cPickle as pickle
|
p@25
|
9 from math import sqrt
|
p@25
|
10 import numpy as np
|
p@25
|
11 import pandas as pd
|
p@25
|
12
|
p@25
|
# Item-vector dictionary
# NOTE(security): unpickling runs arbitrary code embedded in the file, so
# these paths must only ever point at trusted, locally produced pickles.
# `with` guarantees the handle is closed even if unpickling raises.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data
# The pickle holds a pair: (training folds, test folds).
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)
|
p@25
|
24
|
p@25
|
# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Return the adjusted (mean-centred) cosine similarity of two vectors.

    Each vector is centred on its own arithmetic mean before the cosine of
    the angle between the centred vectors is computed.

    Args:
        vector_i: non-empty sequence of numeric weights.
        vector_j: sequence of numeric weights with the same length.

    Returns:
        The similarity as a float. When either vector has zero variance
        (all weights equal) the cosine is undefined and 0.0 is returned
        instead of dividing by zero.

    Raises:
        ValueError: if the vectors are empty or differ in length.
    """
    size = len(vector_i)
    if size == 0 or size != len(vector_j):
        raise ValueError(
            'adj_cos_sim needs two non-empty vectors of equal length'
        )

    mean_i = float(sum(vector_i)) / size
    mean_j = float(sum(vector_j)) / size
    centred_i = [w - mean_i for w in vector_i]
    centred_j = [w - mean_j for w in vector_j]

    num = sum(a * b for a, b in zip(centred_i, centred_j))
    dem1 = sum(a ** 2 for a in centred_i)
    dem2 = sum(b ** 2 for b in centred_j)

    denominator = sqrt(dem1) * sqrt(dem2)
    if denominator == 0:
        # A constant vector has no direction once centred.
        return 0.0
    return num / denominator
|
p@25
|
37
|
p@25
|
def computeNearestNeighbor(itemName, itemVector, items):
    """Rank every other item by its similarity to the given item.

    Args:
        itemName: key of the reference item; it is excluded from the result.
        itemVector: weight vector of the reference item.
        items: mapping of item key -> weight vector.

    Returns:
        A list of ``(similarity, item_key)`` tuples sorted in descending
        order, i.e. the most similar item comes first. Ties on similarity
        are ordered by the item key, because whole tuples are compared.
    """
    similarities = [
        (adj_cos_sim(itemVector, items[otherItem]), otherItem)
        for otherItem in items
        if otherItem != itemName
    ]
    # Values are similarities, not distances: larger means closer.
    similarities.sort(reverse=True)
    return similarities
|
p@25
|
50
|
p@25
|
def nearest_neighbours(song, train_songs, N):
    """Rank the training songs by their similarity to ``song``.

    Vectors are looked up in the module-level ``song_library``.

    Args:
        song: key of the reference song in ``song_library``.
        train_songs: iterable of song keys to compare against. ``song``
            itself is not filtered out if it appears here.
        N: accepted for interface compatibility but currently unused; the
            complete ranking is returned without truncation.

    Returns:
        A list of ``(similarity, song_key)`` tuples, most similar first.
    """
    similarities = [
        (adj_cos_sim(song_library[song], song_library[k]), k)
        for k in train_songs
    ]
    similarities.sort(reverse=True)
    return similarities
|
p@25
|
61
|
p@25
|
def build_model_cb(song_library, k=30):
    """Build the item-item similarity model for content-based filtering.

    Args:
        song_library: mapping of song key -> weight vector.
        k: number of nearest neighbours to keep for each song.

    Returns:
        A dict mapping every song key to a list of at most ``k``
        ``(similarity, other_song_key)`` tuples, most similar first. A song
        is never listed as its own neighbour.
    """
    similarity_matrix = {}
    for song in song_library:
        ranked = sorted(
            (
                (adj_cos_sim(song_library[song], song_library[other]), other)
                for other in song_library
                if other != song
            ),
            reverse=True
        )
        similarity_matrix[song] = ranked[0:k]
    return similarity_matrix
|
p@25
|
75
|
p@25
|
76
|
p@25
|
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Recommend up to ``N`` unrated songs for one user.

    Every song the user rated above ``rating_threshold`` acts as a seed.
    The similarities of a seed's neighbours are divided by the seed's
    largest similarity, neighbours the user has already rated are dropped,
    and the normalised values are summed per candidate across all seeds.

    Args:
        sim_matrix: mapping of song key -> list of
            ``(similarity, neighbour_key)`` tuples.
        user: user identifier; not used by the computation.
        song_rating: mapping of song key -> rating given by the user.
        rating_threshold: a song is a seed only if its rating is strictly
            greater than this value.
        N: maximum number of recommendations to return.

    Returns:
        A list of song keys ordered by summed score, highest first; ties
        are broken by the song key. The list is empty when no seed yields
        a candidate.

    Raises:
        KeyError: if a seed song is missing from ``sim_matrix``.
    """
    scores = {}
    for song, rating in song_rating.items():
        if not rating > rating_threshold:
            continue
        neighbours = sim_matrix[song]
        if not neighbours:
            # Nothing to normalise or recommend for this seed.
            continue
        peak = max(value for value, _ in neighbours)
        if peak <= 0:
            # Dividing by a zero peak fails and a negative peak would flip
            # the ranking, so a seed without positive neighbours is skipped.
            continue
        for value, neighbour in neighbours:
            if neighbour in song_rating:
                # Never recommend something the user has already rated.
                continue
            scores[neighbour] = scores.get(neighbour, 0.0) + float(value) / peak

    ranked = sorted(scores, key=lambda key: (-scores[key], key))
    return ranked[:N]
|
p@25
|
101
|
p@25
|
def evaluate_cb(topN, test_data, rating_threshold=3):
    """Score the recommendations against held-out ratings.

    Each rated song in ``test_data`` is classified as a true/false
    positive or negative, depending on whether it was recommended and
    whether its rating is strictly greater than ``rating_threshold``.
    The four counts are printed on one line before the metrics are
    computed.

    Args:
        topN: mapping of user -> collection of recommended song keys.
        test_data: mapping of user -> {song key: rating}.
        rating_threshold: ratings strictly above this count as relevant.

    Returns:
        A tuple ``(precision, recall, F1, accuracy)`` of floats. A metric
        whose denominator is zero is reported as 0.0 rather than raising
        ZeroDivisionError.

    Raises:
        KeyError: if a user in ``test_data`` has no entry in ``topN``.
    """
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.items():
        recommended = set(topN[user])
        for song, rating in song_rating.items():
            relevant = rating > rating_threshold
            if song in recommended:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            elif relevant:
                fn += 1
            else:
                tn += 1
    print("%s %s %s %s" % (tp, fp, fn, tn))

    total = tp + fp + tn + fn
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0
    accuracy = (tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy
|
p@25
|
128
|
p@25
|
# Build the item-item model once, then evaluate it on every
# cross-validation fold and report mean and standard deviation per metric.
sim_matrix = build_model_cb(song_library, 30)

precisions = []
recalls = []
f1_scores = []
accuracies = []
# Training and test folds are paired by position.
for fold_train, fold_test in zip(users_train, users_test):
    topN = {
        user: top_n(sim_matrix, user, song_rating)
        for user, song_rating in fold_train.items()
    }
    pi, ri, fi, ai = evaluate_cb(topN, fold_test)

    precisions.append(pi)
    recalls.append(ri)
    f1_scores.append(fi)
    accuracies.append(ai)

p = np.array(precisions)
r = np.array(recalls)
f = np.array(f1_scores)
a = np.array(accuracies)

# "%.3f" prints three decimals; the former "%f3" printed six decimals
# followed by a literal "3".
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))
|
p@25
|
150
|
p@25
|
151 # set_C = {t[0]: t[1] for t in candidate}
|
p@25
|
152 # for song in set_C:
|
p@25
|
153 # sim = sim_matrix[song]
|
p@25
|
154 # the_dict = {t[1]: t[0] for t in sim}
|
p@25
|
155 # for key in entries:
|
p@25
|
156 # if key in the_dict:
|
p@25
|
157 # the_dict[key]
|