# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

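# This script builds a content-based music recommender: each user profile is
# a genre-probability vector learned with an estimation of distribution
# algorithm (EDA) from the songs the user liked, then evaluated per
# cross-validation fold with precision, recall, F1 and accuracy.
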
from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle  # Python 2 pickle with C accelerator
#import random

# Item-vector dictionary: genre probabilities per song
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data (cross-validation folds)
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)

# Cosine similarity between two equal-length vectors (assumes neither vector
# is all-zero, otherwise the denominator is zero)
def cosine_similarity(vector1, vector2):
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)
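# Quick sanity check (hypothetical vectors, not from the dataset):
#   >>> cosine_similarity([1, 0], [1, 1])
#   0.7071067811865475        # = 1 / sqrt(2)
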
# Adjusted cosine similarity: both vectors are mean-centred before the cosine
# is taken, so the result lies in [-1, 1]
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))  # denominator, part 1
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))  # denominator, part 2
    return num / (sqrt(dem1) * sqrt(dem2))
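# Worked example (hypothetical vectors): [1, 2, 3] and [3, 2, 1] both have
# mean 2; the centred vectors are [-1, 0, 1] and [1, 0, -1], so
#   >>> adj_cos_sim([1, 2, 3], [3, 2, 1])
#   -1.0
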
# Fitness function for EDA: log-likelihood of a candidate profile given the
# songs the user liked
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            # log() is undefined for non-positive arguments; penalise with
            # -708, roughly math.log(sys.float_info.min)
            fitness_value += -708
        else:
            fitness_value += log(score * sim)
        # Alternatives tried:
        #fitness_value += log(score * manhattan(profile_u, song_library[songID]))
        #fitness_value += score * cosine_similarity(profile_u, song_library[songID])
    return fitness_value
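# For intuition (hypothetical numbers): a song rated score=5 whose vector has
# cosine similarity 0.8 to the profile contributes log(5 * 0.8) = log(4.0)
# ≈ 1.386 to the fitness; any song with sim <= 0 contributes the -708 floor.
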
def users_likes_subset(users, rating_threshold=3):
    '''Keep only the items each user rated above rating_threshold'''
    users_subset = {}
    for userID, songs in users.iteritems():
        scores_above_threshold = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold
        }
        users_subset[userID] = scores_above_threshold
    return users_subset
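# Example of the transformation (hypothetical IDs and ratings):
#   >>> users_likes_subset({'u1': {'s1': 5, 's2': 2}})
#   {'u1': {'s1': 5}}
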
def eda_train(users_subset, max_gen=200):
    '''Train one genre-probability profile per user with a continuous EDA'''
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Generate M individuals uniformly at random
    np.random.seed(12345)
    M = np.random.uniform(0, 1, population_size * num_features)
    M.shape = (-1, num_features)
    profile_u = {}
    for i, userID in enumerate(users_subset):
        profile_u[userID] = M[i].tolist()

    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"]
        )

        # Select the best individuals, ranked by fitness
        # (DataFrame.sort is the 2015-era pandas API; newer pandas uses
        # sort_values)
        best_individuals = {}
        users_fitness_df = users_fitness_df.sort(columns='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
        for userID in M_sel_dict:
            best_individuals[userID] = profile_u[userID]

        # Sample mean and standard deviation of the selected individuals
        D = np.array([])
        for userID, features in best_individuals.iteritems():
            D = np.append(D, features, axis=0)
        D.shape = (-1, num_features)
        D_mu = np.mean(D, axis=0)
        D_sigma = np.std(D, axis=0, ddof=1)

        # Sample a new population of M individuals from the fitted Gaussian
        M = np.random.normal(D_mu, D_sigma, (population_size, num_features))
        profile_u = {}
        for i, userID in enumerate(users_subset):
            profile_u[userID] = M[i].tolist()
        generation += 1

    return profile_u
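# The loop above is essentially a continuous UMDA: rank candidate profiles by
# Fitness(), keep the top half, fit an independent Gaussian per feature
# (D_mu, D_sigma), and resample the whole population from it. One profile per
# user is returned, e.g. (first fold):
#   profiles = eda_train(users_likes_subset(users_train[0]))
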
# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    '''Content-based: similarity between one profile and its test items'''
    similarity = []
    for songID in test_data[profileID]:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    # Top-N recommendation (currently disabled, so N is unused)
    #similarity.sort(reverse=True)
    #if len(similarity) > N:
    #    similarity = similarity[0:N]
    return {t[1]: t[0] for t in similarity}
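# The returned mapping is {songID: similarity}, one entry per test item of
# that user, e.g. (hypothetical IDs and values):
#   {'SOABC123': 0.42, 'SOXYZ789': -0.10}
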
def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=3,
        EDA_threshold=0.3):

    ''' Evaluation '''

    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-based evaluation: an item counts as recommended when its
    # similarity reaches EDA_threshold, and as relevant when its true score
    # exceeds rating_threshold
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for userID, songID_sim in sim_matrix.iteritems():
        for songID, sim_value in songID_sim.iteritems():
            score = test_data[userID][songID]
            if score > rating_threshold and sim_value >= EDA_threshold:
                tp += 1
            elif score <= rating_threshold and sim_value >= EDA_threshold:
                fp += 1
            elif score > rating_threshold and sim_value < EDA_threshold:
                fn += 1
            else:
                tn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy
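# Worked example of the metrics (hypothetical counts): with tp=6, fp=2, fn=3,
# tn=9 the function returns precision = 6/8 = 0.75, recall = 6/9 ≈ 0.667,
# F1 ≈ 0.706 and accuracy = 15/20 = 0.75.
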
# Earlier draft of the similarity-matrix construction, kept for reference:
#keys_a = set(users[userID].keys())
#keys_b = set(test_data.keys())
#intersection = keys_a & keys_b
#if len(intersection) != 0:
#    similarity = {}
#    #print {k: v for k, v in song_library_fold[0].iteritems() if k in songs}
#    for songID in intersection:
#        similarity[songID] = adj_cos_sim(
#            profile[userID],
#            test_data[songID]
#        )
#    max_sim = max(similarity, key=similarity.get)
#    if max_sim >= EDA_threshold:
#        sim_matrix[userID] = {max_sim: similarity[max_sim]}
#    sim_matrix[userID] = similarity

# Per-fold metric accumulators
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

# Train and evaluate on each cross-validation fold
for i in range(len(users_train)):
    profile_u = eda_train(users_likes_subset(users_train[i]))
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

# Mean ± standard deviation across folds
print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())

'''# Collaborative filtering: similarity matrix (unused draft)
sim_matrix_cf = {}
count = 0
for userID_1 in profile:
    similarities = {}
    for userID_2 in profile:
        if userID_1 != userID_2:
            similarities[userID_2] = adj_cos_sim(
                profile[userID_1],
                profile[userID_2]
            )
    #print similarities
    sim_matrix_cf[userID_1] = similarities'''

# Predicted rating
#for userID in users:
#    print np.array(users[userID].values()).mean()

'''for key, value in sorted(similarity.iteritems(), key=lambda (k, v): (v, k), reverse=True):
    print "%s: %s" % (key, value)
    break'''


# Recommend new item

'''
def computeNearestNeighbor(itemName, itemVector, items):
    """creates a sorted list of items based on their similarity to item"""
    distances = []
    for otherItem in items:
        if otherItem != itemName:
            distance = adj_cos_sim(itemVector, items[otherItem])
            distances.append((distance, otherItem))
    # sort based on similarity -- most similar first
    distances.sort(reverse=True)
    return distances

def classify(user, itemName, itemVector):
    """Classify the itemName based on user ratings
    Should really have items and users as parameters"""
    # first find the nearest neighbour
    nearest = computeNearestNeighbor(itemName, itemVector, song_library)[0][1]
    rating = users[user][nearest]
    return rating
'''
# Source: guidetodatamining.com
'''def computeSimilarity(band1, band2, userRatings):
    averages = {}
    for (key, ratings) in userRatings.items():
        averages[key] = (float(sum(ratings.values())) / len(ratings.values()))
    num = 0   # numerator
    dem1 = 0  # first half of denominator
    dem2 = 0  # second half of denominator
    for (user, ratings) in userRatings.items():
        if band1 in ratings and band2 in ratings:
            avg = averages[user]
            num += (ratings[band1] - avg) * (ratings[band2] - avg)
            dem1 += (ratings[band1] - avg)**2
            dem2 += (ratings[band2] - avg)**2
    return num / (sqrt(dem1) * sqrt(dem2))'''

'''
sum_xy = 0
sum_x2 = 0
sum_y2 = 0
n = 0
for key in rating1:
    if key in rating2:
        n += 1
        x = rating1[key]
        y = rating2[key]
        sum_xy += x * y
if n == 0:
    return 0

# now compute denominator
for key in rating1:
    x = rating1[key]
    sum_x2 += pow(x, 2)

for key in rating2:
    y = rating2[key]
    sum_y2 += pow(y, 2)

denominator = sqrt(sum_x2) * sqrt(sum_y2)
if denominator == 0:
    return 0
else:
    return sum_xy / denominator'''

'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute standard deviation
def get_asd(lst, median):
    total = 0
    for item in lst:
        total += abs(item - median)
    return total / len(lst)

# Rating normalisation with the modified standard score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''

'''
# Pearson correlation coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
        sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator
'''
'''
def buckets(filename, bucketName, separator, classColumn):
    """the original data is in the file named filename
    bucketName is the prefix for all the bucket names
    separator is the character that divides the columns
    (for ex., a tab or comma) and classColumn is the column
    that indicates the class"""
    # put the data in 10 buckets
    numberOfBuckets = 10
    data = {}
    # first read in the data and divide by category
    with open(filename) as f:
        lines = f.readlines()
    for line in lines:
        if separator != '\t':
            line = line.replace(separator, '\t')
        # first get the category
        category = line.split()[classColumn]
        data.setdefault(category, [])
        data[category].append(line)
    # initialize the buckets
    buckets = []
    for i in range(numberOfBuckets):
        buckets.append([])
    # now for each category put the data into the buckets
    for k in song_library.keys():
        # randomise the order of instances for each class
        random.shuffle(song_library[k])
        bNum = 0
        # divide into buckets
        for item in song_library[k]:
            buckets[bNum].append(item)
            bNum = (bNum + 1) % numberOfBuckets
    # write to file
    for bNum in range(numberOfBuckets):
        f = open("%s-%02i" % (bucketName, bNum + 1), 'w')
        for item in buckets[bNum]:
            f.write(item)
        f.close()
'''

'''# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Computes the Manhattan distance."""
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))'''