comparison Code/eda.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents 68a62ca32441
children e4bcfe00abf4
comparison
equal deleted inserted replaced
24:68a62ca32441 25:fafc0b249a73
@@ -4 +4 @@
 
 @author: paulochiliguano
 """
 
 
-from math import sqrt, log10
+from math import log, sqrt
 import numpy as np
 import pandas as pd
-from sklearn import mixture
-
-#Fine tuning
-
+import cPickle as pickle
+#import random
+
+# Item-vector dictionary
+f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+genre_classification/genre_prob.pkl', 'rb')
+song_library = pickle.load(f)
+f.close()
+
+# Load training and test data
+f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+cross_validation.pkl', 'rb')
+users_train, users_test = pickle.load(f)
+f.close()
27 "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0, 27 # Cosine Similarity
28 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0, 28 def cosine_similarity(vector1, vector2):
29 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0, 29 dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
30 "SOAJZEP12A8C14379B": 2.0}, 30 length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
31 "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0, 31 length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
32 "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0, 32 return dot_product / (length_x * length_y)
33 "SOAJZEP12A8C14379B": 1.0}, 33
34 "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0, 34 # Adjusted Cosine Similarity
35 "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0, 35 def adj_cos_sim(vector_i, vector_j):
36 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0, 36 avrg_w_i = (float(sum(vector_i)) / len(vector_i))
37 "SOAJZEP12A8C14379B": 4.0}, 37 avrg_w_j = (float(sum(vector_j)) / len(vector_j))
38 "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0, 38 num = sum(map(
39 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0, 39 lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
40 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0}, 40 vector_i,
41 "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0, 41 vector_j)
42 "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5, 42 )
43 "SOAKNZI12A58A79CAC": 3.0} 43 dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
44 dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
45 return num / (sqrt(dem1) * sqrt(dem2))
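Of the two inserted similarity helpers, `adj_cos_sim` mean-centres each vector before taking the cosine, so it measures how two profiles co-vary rather than how their raw magnitudes align. A quick check with made-up three-dimensional vectors, assuming the two functions above are in scope:

    v1, v2 = [3.0, 1.0, 2.0], [4.0, 2.0, 3.0]
    print cosine_similarity(v1, v2)  # ~0.9926: raw vectors point in slightly different directions
    print adj_cos_sim(v1, v2)        # 1.0: after mean-centring the two vectors are identical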
+
+# Fitness function for EDA
+def Fitness(profile_u, user_subset):
+    fitness_value = 0
+    for songID, score in user_subset.iteritems():
+        #print cosine_similarity(profile, song_library[songID])
+        sim = cosine_similarity(profile_u, song_library[songID])
+        if sim <= 0:
+            fitness_value += -708
+            #math.log(sys.float_info.min)
+        else:
+            fitness_value += log(score * sim)
+        #fitness_value += log(score * manhattan(profile, song_library[songID]))
+        #fitness_value += score * cosine_similarity(profile, song_library[songID])
+    return fitness_value
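The magic constant -708 is a floor for the log term: when the similarity is non-positive, log(score * sim) is undefined, so the fitness is penalised with roughly the most negative value log() can return for a positive double, as the inline comment hints:

    import sys
    from math import log
    print log(sys.float_info.min)  # -708.396..., log of the smallest positive float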
+
+def users_likes_subset(users, rating_threshold=3):
+    # Subset of most-liked items
+    users_subset = {}
+    for userID, songs in users.iteritems():
+        scores_above_threshold = {
+            songID: score for songID, score in songs.iteritems() if score > rating_threshold
+        }
+        users_subset[userID] = scores_above_threshold
+
+    #for songID, score in songs.iteritems():
+        #print score > 0
+        #if score > 0:
+            #print {userID: {songID: score}}
+
+    #{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
+
+    return users_subset
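With the default `rating_threshold` of 3, only strictly greater ratings survive the filter; a rating of exactly 3 is dropped. On hypothetical data:

    ratings = {'u1': {'songA': 4.5, 'songB': 2.0, 'songC': 3.0}}
    print users_likes_subset(ratings)  # {'u1': {'songA': 4.5}} -- 3.0 is not > 3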

-# Functions to compute similarity between items or between profiles
-# Source: http://www.guidetodatamining.com
-def manhattan(vector1, vector2):
-    """Computes the Manhattan distance."""
-    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))
-
-def nearestNeighbor(self, itemVector):
-    """return nearest neighbor to itemVector"""
-    return min([(
-        self.manhattan(itemVector, item[1]), item) for item in self.data
-    ])
-
-def classify(self, itemVector):
-    """Return class we think item Vector is in"""
-    return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])
+
+def eda_train(users_subset, max_gen=200):
+    # TRAINING
+    num_features = len(song_library.values()[0])
+    # Given parameters for EDA
+    population_size = len(users_subset)
+    fraction_of_population = int(round(0.5 * population_size))
+
+    # Generation of M individuals uniformly
+    np.random.seed(12345)
+    M = np.random.uniform(
+        0,
+        1,
+        population_size * num_features
+    )
+    M.shape = (-1, num_features)
+    profile_u = {}
+    i = 0
+    for userID in users_subset:
+        profile_u[userID] = M.tolist()[i]
+        i += 1
+
+    generation = 0
+    while generation < max_gen:
+        # Compute fitness values
+        users_fitness = {}
+        for userID in profile_u:
+            users_fitness[userID] = Fitness(
+                profile_u[userID],
+                users_subset[userID]
+            )
+        users_fitness_df = pd.DataFrame(
+            users_fitness.items(),
+            columns=["userID", "fitness"]
+        )
+
+        # Selection of best individuals based on fitness values
+        best_individuals = {}
+        users_fitness_df = users_fitness_df.sort(columns='fitness')
+        M_sel = users_fitness_df.tail(fraction_of_population)
+        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
+        for userID in M_sel_dict:
+            best_individuals[userID] = profile_u[userID]
+
+        # Calculate sample mean and standard deviation
+        D = np.array([])
+        for userID, features in best_individuals.iteritems():
+            D = np.append(D, features, axis=0)
+        D.shape = (-1, num_features)
+        D_mu = np.mean(D, axis=0)
+        D_sigma = np.std(D, axis=0, ddof=1)
+
+        # Sample M individuals
+        M = np.random.normal(
+            D_mu,
+            D_sigma,
+            (population_size, num_features)
+        )
+        #M = 1 / (D_sigma * np.sqrt(2 * np.pi)) * np.exp(- (M_range - D_mu) ** 2 / (2 * D_sigma ** 2))
+        #M.shape = (-1, len(items.values()[0]))
+        #M = D_sigma * np.random.normal(
+            #population_size,
+            #len(items.values()[0])
+        #) + D_mu
+        profile_u = {}
+        i = 0
+        for userID in users_subset:
+            profile_u[userID] = M.tolist()[i]
+            i += 1
+        generation += 1
+
+    return profile_u
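Each generation of `eda_train` follows the usual continuous-EDA recipe: score every candidate profile, keep the best half (ascending sort, then `tail`), fit an independent Gaussian per feature to the survivors, and resample the population from it. The same update condensed to array form (a sketch only; `fitness` stands for any per-individual scoring function):

    import numpy as np

    def eda_step(P, fitness, frac=0.5):
        # P is an (n_individuals, n_features) population array
        scores = np.array([fitness(ind) for ind in P])
        keep = P[np.argsort(scores)[-int(round(frac * len(P))):]]  # best half
        mu, sigma = keep.mean(axis=0), keep.std(axis=0, ddof=1)    # per-feature Gaussian
        return np.random.normal(mu, sigma, P.shape)                # resample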
+
+# Similarity matrix
+def cb_similarity(profileID, profile_data, test_data, N):
+    ''' Content-based: Similarity matrix '''
+    similarity = []
+    for songID in test_data[profileID]:
+        sim = adj_cos_sim(profile_data, song_library[songID])
+        similarity.append((sim, songID))
+    # Top-N recommendation
+    #similarity.sort(reverse=True)
+    #if len(similarity) > N:
+        #similarity = similarity[0:N]
+
+    #sim_matrix[userID] = {t[1]: t[0] for t in similarity}
+    return {t[1]: t[0] for t in similarity}
+
+def evaluate_eda(
+        profiles,
+        test_data,
+        N=10,
+        rating_threshold=3,
+        EDA_treshold=0.3):
+
+    ''' Evaluation '''
+
+    sim_matrix = {}
+    for userID, features in profiles.iteritems():
+        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)
+
+    # Content-Based: Evaluation
+    tp = 0.
+    fp = 0.
+    fn = 0.
+    tn = 0.
+    for userID, songID_sim in sim_matrix.iteritems():
+        for songID, sim_value in songID_sim.iteritems():
+            score = test_data[userID][songID]
+            if score > rating_threshold and sim_value >= EDA_treshold:
+                tp += 1
+            elif score <= rating_threshold and sim_value >= EDA_treshold:
+                fp += 1
+            elif score > rating_threshold and sim_value < EDA_treshold:
+                fn += 1
+            elif score <= rating_threshold and sim_value < EDA_treshold:
+                tn += 1
+
+    precision = tp / (tp + fp)
+    recall = tp / (tp + fn)
+    F1 = 2 * precision * recall / (precision + recall)
+    accuracy = (tp + tn) / (tp + fp + tn + fn)
+
+    return precision, recall, F1, accuracy
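The evaluation treats a held-out rating above `rating_threshold` as relevant and a profile-to-song similarity at or above `EDA_treshold` (sic) as retrieved, then reads precision, recall, F1 and accuracy off the confusion counts. Note that `tp + fp` (or `tp + fn`) can be zero on an unlucky fold, which would raise ZeroDivisionError; a guarded variant would be (an assumption, not in the original):

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    F1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0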
+
+#keys_a = set(users[userID].keys())
+#keys_b = set(test_data.keys())
+#intersection = keys_a & keys_b
+#if len(intersection) != 0:
+    #similarity = {}
+    #print {k: v for k, v in song_library_fold[0].iteritems() if k in songs}
+    #for songID in intersection:
+        #if songID == k:
+            #similarity[songID] = adj_cos_sim(
+                #profile[userID],
+                #test_data[songID]
+            #)
+    #max_sim = max(similarity, key=similarity.get)
+    #if max_sim >= EDA_treshold:
+        #sim_matrix[userID] = {max_sim: similarity[max_sim]}
+    #sim_matrix[userID] = similarity
+    #sim_matrix[userID] = {max_sim: similarity[max_sim]}
+
+#print len(sim_matrix)
+p = np.array([])
+f = np.array([])
+r = np.array([])
+a = np.array([])
+
+for i in range(len(users_train)):
+
+    profile_u = eda_train(users_likes_subset(users_train[i]))
+    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
+    p = np.append(p, pi)
+    r = np.append(r, ri)
+    f = np.append(f, fi)
+    a = np.append(a, ai)
+
+#precision = np.array(p)
+#rec = np.array(r)
+#F1 = np.array(f)
+#accuracy = np.array(a)
+
+print "Precision = %f3 ± %f3" % (p.mean(), p.std())
+print "Recall = %f3 ± %f3" % (r.mean(), r.std())
+print "F1 = %f3 ± %f3" % (f.mean(), f.std())
+print "Accuracy = %f3 ± %f3" % (a.mean(), a.std())
+
+'''# Collaborative-filtering: Similarity matrix
+sim_matrix_cf = {}
+count = 0
+for userID_1 in profile:
+    similarities = {}
+    for userID_2 in profile:
+        if userID_1 != userID_2:
+            similarities[userID_2] = adj_cos_sim(
+                profile[userID_1],
+                profile[userID_2]
+            )
+    #print similarities
+    sim_matrix_cf[userID_1] = similarities'''
+
+# Predicted rating
+#for userID in users:
+#    print np.array(users[userID].values()).mean()
+
+'''scores_above_threshold = {
+    songID: score for songID, score in songs.iteritems() if score > rating_threshold
+}'''
+
+'''for key, value in sorted(similarity.iteritems(), key=lambda (k,v): (v,k), reverse=True):
+    print "%s: %s" % (key, value)
+    break'''
+
+
+# Recommend new item
+
+'''
+def computeNearestNeighbor(itemName, itemVector, items):
+    """creates a sorted list of items based on their distance to item"""
+    distances = []
+    for otherItem in items:
+        if otherItem != itemName:
+            distance = adj_cos_sim(itemVector, items[otherItem])
+            distances.append((distance, otherItem))
+    # sort based on distance -- closest first
+    distances.sort(reverse=True)
+    return distances
+
+def classify(user, itemName, itemVector):
+    """Classify the itemName based on user ratings
+    Should really have items and users as parameters"""
+    # first find nearest neighbor
+    nearest = computeNearestNeighbor(itemName, itemVector, song_library)[0][1]
+    rating = users[user][nearest]
+    return rating
+'''
+# Source: guidetodatamining.com
+'''def computeSimilarity(band1, band2, userRatings):
+    averages = {}
+    for (key, ratings) in userRatings.items():
+        averages[key] = (float(sum(ratings.values())) / len(ratings.values()))
+    num = 0  # numerator
+    dem1 = 0  # first half of denominator
+    dem2 = 0
+    for (user, ratings) in userRatings.items():
+        if band1 in ratings and band2 in ratings:
+            avg = averages[user]
+            num += (ratings[band1] - avg) * (ratings[band2] - avg)
+            dem1 += (ratings[band1] - avg)**2
+            dem2 += (ratings[band2] - avg)**2
+    return num / (sqrt(dem1) * sqrt(dem2))'''
+
+'''
+sum_xy = 0
+sum_x2 = 0
+sum_y2 = 0
+n = 0
+for key in rating1:
+    if key in rating2:
+        n += 1
+        x = rating1[key]
+        y = rating2[key]
+        sum_xy += x * y
+if n == 0:
+    return 0
+
+# now compute denominator
+for key in rating1:
+    x = rating1[key]
+    sum_x2 += pow(x, 2)
+
+for key in rating2:
+    y = rating2[key]
+    sum_y2 += pow(y, 2)
+
+denominator = sqrt(sum_x2) * sqrt(sum_y2)
+if denominator == 0:
+    return 0
+else:
+    return sum_xy / denominator'''
+
@@ -70 +342 @@
 '''
 # Median
 # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
 def get_median(lst):
     return np.median(np.array(lst))
@@ -84 +356 @@
 def normalize_rating(ratings, median, asd):
     for i in range(len(ratings)):
         ratings[i] = (ratings[i] - median) / asd
     return ratings
 '''
-# Normalise user play count
-for userID in users:
-    song_play_count = pd.DataFrame(
-        users[userID].items(),
-        columns=["songID", "play_count"]
-    )
-    '''Coefficient of variation'''
-    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
-    #user_ratings = np.array(users[userID].values())
-    #cv = user_ratings.std()/user_ratings.mean()
-    #print userID, cv
-    if cv <= 0.5:
-        for songID, play_count in users[userID].items():
-            users[userID][songID] = 3
-    else:
-        song_play_count_q = pd.cut(
-            song_play_count["play_count"],
-            5,
-            labels=False
-        ) + 1
-        song_play_count.play_count = song_play_count_q
-        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
-        #print song_play_count
-    #median = get_median(user_ratings)
-    #asd = get_asd(user_ratings, median)
-    #for songID, play_count in users[userID].items():
-        #users[userID][songID] = (play_count - median) / asd
-
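The deleted normalisation mapped raw play counts onto a 1-5 pseudo-rating scale: users whose counts barely vary (coefficient of variation at or below 0.5) get a flat 3, everyone else is split into five equal-width bins. The `pd.cut` idiom on made-up counts:

    import pandas as pd
    counts = pd.Series([1, 2, 3, 8, 21])
    print pd.cut(counts, 5, labels=False) + 1  # equal-width bins -> 1, 1, 1, 2, 5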
-# Subset of most-liked items
-users_subset = {}
-for userID, songs in users.iteritems():
-    scores_above_threshold = {
-        songID: score for songID, score in songs.iteritems() if score > 2
-    }
-    users_subset[userID] = scores_above_threshold
-'''
-for songID, score in songs.iteritems():
-    print score > 0
-    if score > 0:
-        print {userID: {songID: score}}
-
-{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
-'''
-# Fitness function for EDA
-def Fitness(profile, user_subset):
-    fitness_value = 0
-    for songID, score in user_subset.iteritems():
-        fitness_value += log10(score * manhattan(profile, items[songID]))
-    return fitness_value
-
-# Given parameters for EDA
-population_size = len(users_subset)
-fraction_of_population = int(round(0.5 * population_size))
-
-# Generation of M individuals uniformly
-np.random.seed(len(users_subset))
-M = np.random.rand(population_size, len(items.values()[0]))
-#M.shape = (-1, len(items.values()[0]))
-profile = {}
-i = 0
-for userID in users_subset:
-    profile[userID] = M.tolist()[i]
-    i += 1
-
-# Compute fitness values
-users_fitness = {}
-for userID in profile:
-    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
-users_fitness_df = pd.DataFrame(
-    users_fitness.items(),
-    columns=["userID", "fitness"]
-)
-
-# Selection of best individuals based on fitness values
-best_individuals = {}
-users_fitness_df = users_fitness_df.sort(columns='fitness')
-M_sel = users_fitness_df.head(fraction_of_population)
-M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
-for userID in M_sel_dict:
-    best_individuals[userID] = profile[userID]
-
-# Calculate sample mean and standard deviation
-np.random.seed(1)
-g = mixture.GMM(n_components=10)
-# Generate random observations with two modes centered on 0
-# and 10 to use for training.
-obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
-g.fit(obs)
-np.round(g.weights_, 2)
-np.round(g.means_, 2)
-np.round(g.covars_, 2)
-g.predict([[0], [2], [9], [10]])
-np.round(g.score([[0], [2], [9], [10]]), 2)
-# Refit the model on new data (initial parameters remain the
-# same), this time with an even split between the two modes.
-g.fit(20 * [[0]] + 20 * [[10]])
-np.round(g.weights_, 2)
-
@@ -187 +361 @@
 
 '''
 # Pearson Correlation Coefficient
 def pearson(rating1, rating2):
     sum_xy = 0
@@ -211 +385 @@
         sqrt(sum_y2 - pow(sum_y, 2) / n)
     if denominator == 0:
         return 0
     else:
         return (sum_xy - (sum_x * sum_y) / n) / denominator
-
-# Cosine Similarity for test purposes
-def cosine_similarity(rating1, rating2):
-    sum_xy = 0
-    sum_x2 = 0
-    sum_y2 = 0
-    n = 0
-    for key in rating1:
-        if key in rating2:
-            n += 1
-            x = rating1[key]
-            y = rating2[key]
-            sum_xy += x * y
-    if n == 0:
-        return 0
-
-    # now compute denominator
-    for key in rating1:
-        x = rating1[key]
-        sum_x2 += pow(x, 2)
-
-    for key in rating2:
-        y = rating2[key]
-        sum_y2 += pow(y, 2)
-
-    denominator = sqrt(sum_x2) * sqrt(sum_y2)
-    if denominator == 0:
-        return 0
-    else:
-        return sum_xy / denominator
-
-
-def Fitness(profile, user_index):
-    sim = 0
-    sum_log = 0
-
-    features = profile.items()[user_index][1]
-    songs = users.items()[user_index][1]
-
-    for song, rating in songs.items():
-        sim = pearson(features, items[song])
-        print(sim)
-
-    for username, songs in users.items():
-        sim = pearson(profile, items[song])
-        #sum_log += log10(rating * sim)
-    return sim
-'''
+'''
+'''
+def buckets(filename, bucketName, separator, classColumn):
+    """the original data is in the file named filename
+    bucketName is the prefix for all the bucket names
+    separator is the character that divides the columns
+    (for ex., a tab or comma and classColumn is the column
+    that indicates the class"""
+    # put the data in 10 buckets
+    numberOfBuckets = 10
+    data = {}
+    # first read in the data and divide by category
+    with open(filename) as f:
+        lines = f.readlines()
+    for line in lines:
+        if separator != '\t':
+            line = line.replace(separator, '\t')
+        # first get the category
+        category = line.split()[classColumn]
+        data.setdefault(category, [])
+        data[category].append(line)
+    # initialize the buckets
+    buckets = []
+    for i in range(numberOfBuckets):
+        buckets.append([])
+    # now for each category put the data into the buckets
+    for k in song_library.keys():
+        #randomize order of instances for each class
+        print random.shuffle(song_library[k])
+        bNum = 0
+        # divide into buckets
+        for item in song_library[k]:
+            buckets[bNum].append(item)
+            bNum = (bNum + 1) % numberOfBuckets
+    # write to file
+    for bNum in range(numberOfBuckets):
+        f = open("%s-%02i" % (bucketName, bNum + 1), 'w')
+        for item in buckets[bNum]:
+            f.write(item)
+        f.close()
+'''
+
+'''# Functions to compute similarity between items or between profiles
+# Source: http://www.guidetodatamining.com
+def manhattan(vector1, vector2):
+    """Computes the Manhattan distance."""
+    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))'''