# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from math import log, sqrt
import numpy as np
import pandas as pd
import pickle  # stdlib pickle; the Python 2 cPickle module was folded into it
import time

# Item-vector dictionary: songID -> genre-probability vector from the genre
# classifier.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data (cross-validation folds).
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)
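
# Expected shapes (inferred from the code below, not checked here):
# song_library maps songID -> list of genre probabilities; users_train and
# users_test are same-length lists of folds, each fold mapping
# userID -> {songID: rating}.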

# Cosine Similarity
def cosine_similarity(vector1, vector2):
    """Cosine of the angle between two equal-length vectors."""
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    length_x = sqrt(sum(x ** 2 for x in vector1))
    length_y = sqrt(sum(y ** 2 for y in vector2))
    return dot_product / (length_x * length_y)
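
# Quick sanity check (illustrative vectors, not dataset values): parallel
# vectors score 1, orthogonal vectors score 0.
assert abs(cosine_similarity([1.0, 2.0], [2.0, 4.0]) - 1.0) < 1e-12
assert abs(cosine_similarity([1.0, 0.0], [0.0, 1.0])) < 1e-12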

# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Cosine similarity after centring each vector on its own mean, so a
    constant offset does not inflate the score."""
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum((w_i - avrg_w_i) * (w_j - avrg_w_j)
              for w_i, w_j in zip(vector_i, vector_j))
    den1 = sum((w_i - avrg_w_i) ** 2 for w_i in vector_i)
    den2 = sum((w_j - avrg_w_j) ** 2 for w_j in vector_j)
    return num / (sqrt(den1) * sqrt(den2))
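
# Sanity check (illustrative): after mean-centring, [1, 2, 3] and [2, 4, 6]
# deviate as (-1, 0, 1) and (-2, 0, 2), which are parallel vectors, so the
# adjusted cosine is exactly 1.
assert abs(adj_cos_sim([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) - 1.0) < 1e-12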

# Fitness function for EDA
def Fitness(profile_u, user_subset):
    """Sum of log(score * similarity) over the songs a user liked."""
    fitness_value = 0
    for songID, score in user_subset.items():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            # log() is undefined for non-positive arguments; clamp to
            # roughly log(sys.float_info.min) ~= -708.
            fitness_value += -708
        else:
            fitness_value += log(score * sim)
    return fitness_value
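
# In symbols, with p_u the candidate profile, v_s the genre vector of song s
# and r_us the rating:  Fitness(p_u) = sum_s log(r_us * cos(p_u, v_s)).
# Maximising it pulls the profile towards the genre vectors of highly rated
# songs, while the -708 clamp keeps non-positive similarities finite.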

def users_likes_subset(users, rating_threshold=2):
    """Keep, per user, only the songs rated strictly above the threshold."""
    users_subset = {}
    for userID, songs in users.items():
        users_subset[userID] = {
            songID: score
            for songID, score in songs.items()
            if score > rating_threshold
        }
    return users_subset
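
# Illustration with hypothetical ratings (not from the dataset): only scores
# above the default threshold of 2 survive.
assert users_likes_subset(
    {'u1': {'s1': 5, 's2': 1}, 'u2': {'s3': 3}}
) == {'u1': {'s1': 5}, 'u2': {'s3': 3}}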

def eda_train(users_subset, max_gen=250):
    """Train one profile per user with a Gaussian estimation-of-distribution
    algorithm (EDA)."""
    num_features = len(next(iter(song_library.values())))
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Generate M individuals uniformly in [0, 1)
    np.random.seed(12345)
    M = np.random.uniform(0, 1, (population_size, num_features))
    profile_u = {}
    for i, userID in enumerate(users_subset):
        profile_u[userID] = M[i].tolist()
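
    # Main EDA loop: evaluate fitness, select the best half, refit an
    # independent Gaussian per feature, then resample a full population.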
    fitnesses = []
    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            list(users_fitness.items()),
            columns=["userID", "fitness"]
        )
        fitnesses.append(users_fitness_df.fitness.values.tolist())

        # Selection of best individuals based on fitness values
        best_individuals = {}
        users_fitness_df = users_fitness_df.sort_values(by='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        for userID in M_sel['userID']:
            best_individuals[userID] = profile_u[userID]

        # Sample mean and standard deviation of the selected individuals
        D = np.array(list(best_individuals.values()), dtype=float)
        D_mu = np.mean(D, axis=0)
        D_sigma = np.std(D, axis=0, ddof=1)

        # Sample M new individuals from the fitted Gaussian
        M = np.random.normal(
            D_mu,
            D_sigma,
            (population_size, num_features)
        )
        profile_u = {}
        for i, userID in enumerate(users_subset):
            profile_u[userID] = M[i].tolist()
        generation += 1

    return profile_u, D, np.array(fitnesses)
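
# Usage sketch (assumes the folds loaded above):
#   profiles, D, history = eda_train(users_likes_subset(users_train[0]))
# 'history' has shape (max_gen, population_size), convenient for checking
# that the population fitness converges.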

# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    """Top-N songs from the test fold, ranked by adjusted cosine similarity
    to the given profile. profileID is kept for the caller's benefit but is
    not used."""
    # Every songID that appears anywhere in the test fold
    a = []
    for user, info in test_data.items():
        a.extend(info)
    songIDs = list(set(a))

    # Content-based: Similarity matrix
    similarity = []
    for songID in songIDs:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))

    # Top-N recommendation
    similarity.sort(reverse=True)
    if len(similarity) > N:
        similarity = similarity[0:N]

    return {t[1]: t[0] for t in similarity}
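
# Illustrative call (hypothetical IDs and scores):
#   cb_similarity('u1', profiles['u1'], users_test[0], N=3)
#   -> {'songA': 0.91, 'songB': 0.87, 'songC': 0.55}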

def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=2):
    """Top-N evaluation: a song counts as recommended when it lands in the
    user's top-N list, and as relevant when its rating exceeds the
    threshold."""
    sim_matrix = {}
    for userID, features in profiles.items():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-Based: Evaluation (confusion counts)
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.items():
        entries = sim_matrix[user]
        for song, rating in song_rating.items():
            if song in entries:
                if rating > rating_threshold:
                    tp += 1
                else:
                    fp += 1
            else:
                if rating > rating_threshold:
                    fn += 1
                else:
                    tn += 1

    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0
        recall = 0
        F1 = 0

    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy
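
# Worked example of the metrics (hypothetical counts): tp=6, fp=4, fn=2,
# tn=8 gives precision = 6/10 = 0.60, recall = 6/8 = 0.75,
# F1 = 2*0.60*0.75/1.35 = 0.67 and accuracy = 14/20 = 0.70.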

# Cross-validated metrics, one entry per fold
p = np.array([])  # precision
f = np.array([])  # F1
r = np.array([])  # recall
a = np.array([])  # accuracy

for i in range(len(users_train)):
    start_time = time.time()
    profile_u, D, fitness_history = eda_train(users_likes_subset(users_train[i]))
    elapsed_time = time.time() - start_time
    print('Training execution time: %.3f seconds' % elapsed_time)

    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i], N=20)
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))
|