p@25
|
1 # -*- coding: utf-8 -*-
|
p@25
|
2 """
|
p@25
|
3 Created on Wed Aug 19 11:58:19 2015
|
p@25
|
4
|
p@25
|
5 @author: paulochiliguano
|
p@25
|
6 """
|
p@25
|
7
|
p@25
|
8 import cPickle as pickle
|
p@25
|
9 from math import sqrt
|
p@25
|
10 import numpy as np
|
p@25
|
11 import pandas as pd
|
p@26
|
12 import time
|
p@25
|
13
|
# Item-vector dictionary: looked up by song ID when similarities are computed.
# NOTE(review): pickle can execute arbitrary code while loading; only point
# this at files you trust.
# The context manager closes the file even if unpickling raises.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)
p@25
|
19
|
p@26
|
20 # Normalisation
|
p@26
|
21 #test = []
|
p@26
|
22 #for k, v in song_library.iteritems():
|
p@26
|
23 # test.append(v)
|
p@26
|
24 #test = np.array(test)
|
p@26
|
25 #test_median = np.median(test, axis=0)
|
p@26
|
26 #test_abs = abs(test - test_median)
|
p@26
|
27 #test_asd = test_abs.sum(axis=0) / test.shape[0]
|
p@26
|
28 #for k, v in song_library.iteritems():
|
p@26
|
29 # modified_standard_score = (np.array(v) - test_median) / test_asd
|
p@26
|
30 # song_library[k] = modified_standard_score.tolist()
|
p@26
|
31
|
# Load training and test data (one entry per cross-validation fold).
# NOTE(review): pickle can execute arbitrary code while loading; only point
# this at files you trust.
# The context manager closes the file even if unpickling raises.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)
p@25
|
37
|
# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Return the mean-centred cosine similarity of two vectors.

    Each vector is centred on its own mean, then the cosine of the angle
    between the centred vectors is returned.

    Args:
        vector_i: sequence of numbers.
        vector_j: sequence of numbers, same length as ``vector_i``.

    Returns:
        The similarity as a float. ``0.0`` is returned when the similarity
        is undefined, i.e. when the vectors are empty or when either one is
        constant (zero variance), instead of raising ZeroDivisionError.

    Raises:
        ValueError: if the two vectors have different lengths.
    """
    size = len(vector_i)
    if size != len(vector_j):
        raise ValueError(
            'vectors must have the same length (%d != %d)'
            % (size, len(vector_j))
        )
    if size == 0:
        return 0.0

    avrg_w_i = float(sum(vector_i)) / size
    avrg_w_j = float(sum(vector_j)) / size

    # Centre each vector once and reuse the deviations below.
    dev_i = [w_i - avrg_w_i for w_i in vector_i]
    dev_j = [w_j - avrg_w_j for w_j in vector_j]

    num = sum(d_i * d_j for d_i, d_j in zip(dev_i, dev_j))
    dem1 = sum(d_i ** 2 for d_i in dev_i)
    dem2 = sum(d_j ** 2 for d_j in dev_j)

    denominator = sqrt(dem1) * sqrt(dem2)
    if denominator == 0:
        # At least one vector has no variance: no direction to compare.
        return 0.0
    return num / denominator
p@25
|
50
|
def build_model_cb(train_data, k=30):
    """Build the item-item neighbourhood model for one training fold.

    Every song ID that appears in ``train_data`` is compared against every
    other such song using ``adj_cos_sim`` on the vectors stored in the
    module-level ``song_library``.

    Args:
        train_data: dict mapping each user to a dict keyed by song ID.
        k: maximum number of neighbours kept per song.

    Returns:
        dict mapping each song ID to a list of ``(similarity, other_song)``
        tuples, sorted from most to least similar and cut to ``k`` entries.
    """
    rated = []
    for ratings in train_data.values():
        rated.extend(ratings)
    song_ids = list(set(rated))

    similarity_matrix = {}
    for song in song_ids:
        ranked = sorted(
            (
                (adj_cos_sim(song_library[song], song_library[other]), other)
                for other in song_ids
                if other != song
            ),
            reverse=True
        )
        similarity_matrix[song] = ranked[:k]

    return similarity_matrix
p@25
|
70
|
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Recommend up to ``N`` unrated songs for one user.

    For every song the user rated above ``rating_threshold``, the
    similarities of its neighbours are divided by the largest similarity in
    that neighbour list, and the normalised values are summed per candidate
    song. Songs the user has already rated are never recommended.

    Args:
        sim_matrix: dict mapping a song ID to a list of
            ``(similarity, other_song)`` tuples.
        user: unused; kept so existing callers keep working.
        song_rating: dict mapping the user's rated song IDs to ratings.
        rating_threshold: a rated song only contributes candidates when its
            rating is strictly greater than this value.
        N: maximum number of recommendations returned.

    Returns:
        List of song IDs ordered by decreasing accumulated similarity (ties
        broken by song ID). Empty when no rated song exceeds the threshold
        or no candidate remains.
    """
    scores = {}
    for song, rating in song_rating.items():
        if not rating > rating_threshold:
            continue
        neighbours = sim_matrix[song]
        if not neighbours:
            continue
        # Computed once per seed song instead of once per neighbour.
        peak = max(value for value, _ in neighbours)
        if peak <= 0:
            # Dividing by zero would fail and dividing by a negative peak
            # would invert the ranking, so such a seed contributes nothing.
            continue
        for value, other in neighbours:
            if other in song_rating:
                continue
            scores[other] = scores.get(other, 0.0) + float(value) / peak

    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    return [song for song, _ in ranked[:N]]
p@25
|
95
|
def evaluate_cb(topN, test_data, rating_threshold=2):
    """Score recommendation lists against held-out ratings.

    Each rating in ``test_data`` is counted as one outcome: a rating above
    ``rating_threshold`` is a true positive if the song was recommended to
    that user and a false negative otherwise; any other rating is a false
    positive if recommended and a true negative otherwise.

    Args:
        topN: dict mapping a user to the list of recommended song IDs. A
            user missing from it is treated as having no recommendations.
        test_data: dict mapping a user to a dict of song ID -> rating.
        rating_threshold: ratings strictly above this value count as liked.

    Returns:
        Tuple ``(precision, recall, F1, accuracy)`` of floats. Precision,
        recall and F1 are 0.0 when there are no true positives; accuracy is
        0.0 when ``test_data`` holds no ratings at all.
    """
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.items():
        # A set gives constant-time membership tests in the inner loop.
        recommended = set(topN.get(user, ()))
        for song, rating in song_rating.items():
            liked = rating > rating_threshold
            if song in recommended:
                if liked:
                    tp += 1
                else:
                    fp += 1
            elif liked:
                fn += 1
            else:
                tn += 1

    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0.0
        recall = 0.0
        F1 = 0.0

    total = tp + fp + tn + fn
    # Guard against an empty test set, which would divide by zero.
    accuracy = (tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy
p@26
|
128
|
# Per-fold metrics: precision, F1, recall and accuracy.
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

for i in range(len(users_train)):

    # The timed section covers both building the similarity model and
    # producing every user's recommendation list.
    start_time = time.time()
    sim_matrix = build_model_cb(users_train[i])

    topN = {}
    for user, song_rating in users_train[i].items():
        topN[user] = top_n(
            sim_matrix,
            user,
            song_rating,
            rating_threshold=2,
            N=20
        )
    elapsed_time = time.time() - start_time
    print('Training execution time: %.3f seconds' % elapsed_time)

    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])

    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

# Mean and standard deviation across folds, to three decimals.
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))
p@25
|
156
|
p@25
|
157 # set_C = {t[0]: t[1] for t in candidate}
|
p@25
|
158 # for song in set_C:
|
p@25
|
159 # sim = sim_matrix[song]
|
p@25
|
160 # the_dict = {t[1]: t[0] for t in sim}
|
p@25
|
161 # for key in entries:
|
p@25
|
162 # if key in the_dict:
|
p@25
|
163 # the_dict[key]
|