hybrid-music-recommender-using-content-based-and-social-information
comparison Code/eda.py @ 25:fafc0b249a73
Final code
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date | Sun, 23 Aug 2015 16:47:54 +0100
parents | 68a62ca32441
children | e4bcfe00abf4
24:68a62ca32441 | 25:fafc0b249a73
4 | 4 |
5 @author: paulochiliguano | 5 @author: paulochiliguano |
6 """ | 6 """ |
7 | 7 |
8 | 8 |
9 from math import sqrt, log10 | 9 from math import log, sqrt |
10 import numpy as np | 10 import numpy as np |
11 import pandas as pd | 11 import pandas as pd |
12 from sklearn import mixture | 12 import cPickle as pickle |
13 | 13 #import random |
14 #Fine tuning | 14 |
15 | 15 # Item-vector dictionary |
16 #User-item dictionary | 16 f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ |
17 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, | 17 genre_classification/genre_prob.pkl', 'rb') |
18 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, | 18 song_library = pickle.load(f) |
19 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5, | 19 f.close() |
20 "SOAJZEP12A8C14379B": 2.0}, | 20 |
21 "Bill":{"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5, | 21 # Load training and test data |
22 "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0, | 22 f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ |
23 "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0}, | 23 cross_validation.pkl', 'rb') |
24 "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0, | 24 users_train, users_test = pickle.load(f) |
25 "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0, | 25 f.close() |
26 "SOAGTJW12A6701F1F5": 5, "SOAKWCK12A8C139F81": 1.0}, | 26 |
27 "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0, | 27 # Cosine Similarity |
28 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0, | 28 def cosine_similarity(vector1, vector2): |
29 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0, | 29 dot_product = sum(map(lambda x, y: x * y, vector1, vector2)) |
30 "SOAJZEP12A8C14379B": 2.0}, | 30 length_x = sqrt(sum(map(lambda x: x ** 2, vector1))) |
31 "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0, | 31 length_y = sqrt(sum(map(lambda y: y ** 2, vector2))) |
32 "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0, | 32 return dot_product / (length_x * length_y) |
33 "SOAJZEP12A8C14379B": 1.0}, | 33 |
34 "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0, | 34 # Adjusted Cosine Similarity |
35 "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0, | 35 def adj_cos_sim(vector_i, vector_j): |
36 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0, | 36 avrg_w_i = (float(sum(vector_i)) / len(vector_i)) |
37 "SOAJZEP12A8C14379B": 4.0}, | 37 avrg_w_j = (float(sum(vector_j)) / len(vector_j)) |
38 "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0, | 38 num = sum(map( |
39 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0, | 39 lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j), |
40 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0}, | 40 vector_i, |
41 "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0, | 41 vector_j) |
42 "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5, | 42 ) |
43 "SOAKNZI12A58A79CAC": 3.0} | 43 dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i)) |
44 dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j)) | |
45 return num / (sqrt(dem1) * sqrt(dem2)) | |
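A quick sanity check of the two similarity functions above, on toy 3-dimensional genre vectors (values invented for illustration, not taken from song_library):

v1 = [0.9, 0.1, 0.0]
v2 = [0.8, 0.2, 0.0]
print cosine_similarity(v1, v2)  # ~0.99: near-parallel vectors score close to 1
print adj_cos_sim(v1, v2)        # mean-centred variant, bounded in [-1, 1]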
46 | |
47 # Fitness function for EDA | |
48 def Fitness(profile_u, user_subset): | |
49 fitness_value = 0 | |
50 for songID, score in user_subset.iteritems(): | |
51 #print cosine_similarity(profile, song_library[songID]) | |
52 sim = cosine_similarity(profile_u, song_library[songID]) | |
53 if sim <= 0: | |
54 fitness_value += -708 | |
55 #math.log(sys.float_info.min) | |
56 else: | |
57 fitness_value += log(score * sim) | |
58 #fitness_value += log(score * manhattan(profile, song_library[songID])) | |
59 #fitness_value += score * cosine_similarity(profile, song_library[songID]) | |
60 return fitness_value | |
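The -708 floor applied when sim <= 0 is, as the commented-out hint suggests, approximately math.log(sys.float_info.min), i.e. the most negative value log() returns for a normal positive double. A one-line check with the standard library:

import sys
from math import log
print log(sys.float_info.min)  # -708.396..., hence the -708 floor above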
61 | |
62 def users_likes_subset(users, rating_threshold=3): | |
63 # Subset of most-liked items | |
64 users_subset = {} | |
65 for userID, songs in users.iteritems(): | |
66 scores_above_threshold = { | |
67 songID: score for songID, score in songs.iteritems() if score > rating_threshold | |
44 } | 68 } |
45 | 69 users_subset[userID] = scores_above_threshold |
46 items = {"SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1], | 70 |
47 "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1], | 71 #for songID, score in songs.iteritems(): |
48 "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1], | 72 #print score >0 |
49 "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1], | 73 #if score > 0: |
50 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1], | 74 #print {userID: {songID: score}} |
51 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1], | 75 |
52 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], | 76 #{k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0} |
53 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} | 77 |
54 | 78 return users_subset |
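A minimal illustration of users_likes_subset on a made-up two-user ratings dict (song IDs invented); only scores strictly above the default threshold of 3 survive:

toy_users = {"u1": {"SOFAKE000000001": 4.0, "SOFAKE000000002": 2.0},
             "u2": {"SOFAKE000000003": 5.0}}
print users_likes_subset(toy_users)
# -> u1 keeps only the 4.0-rated song; u2 keeps the 5.0-rated song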
55 # Functions to compute similarity between items or between profiles | 79 |
56 # Source: http://www.guidetodatamining.com | 80 def eda_train(users_subset, max_gen=200): |
57 def manhattan(vector1, vector2): | 81 # TRAINING |
58 """Computes the Manhattan distance.""" | 82 num_features = len(song_library.values()[0]) |
59 return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) | 83 # Given parameters for EDA |
60 | 84 population_size = len(users_subset) |
61 def nearestNeighbor(self, itemVector): | 85 fraction_of_population = int(round(0.5 * population_size)) |
62 """return nearest neighbor to itemVector""" | 86 |
63 return min([( | 87 # Generation of M individuals uniformly |
64 self.manhattan(itemVector, item[1]), item) for item in self.data | 88 np.random.seed(12345) |
65 ]) | 89 M = np.random.uniform( |
66 | 90 0, |
67 def classify(self, itemVector): | 91 1, |
68 """Return class we think item Vector is in""" | 92 population_size * num_features |
69 return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) | 93 ) |
94 M.shape = (-1, num_features) | |
95 profile_u = {} | |
96 i = 0 | |
97 for userID in users_subset: | |
98 profile_u[userID] = M.tolist()[i] | |
99 i += 1 | |
100 | |
101 generation = 0 | |
102 while generation < max_gen: | |
103 # Compute fitness values | |
104 users_fitness = {} | |
105 for userID in profile_u: | |
106 users_fitness[userID] = Fitness( | |
107 profile_u[userID], | |
108 users_subset[userID] | |
109 ) | |
110 users_fitness_df = pd.DataFrame( | |
111 users_fitness.items(), | |
112 columns=["userID", "fitness"] | |
113 ) | |
114 | |
115 # Selection of best individuals based on fitness values | |
116 best_individuals = {} | |
117 users_fitness_df = users_fitness_df.sort(columns='fitness') | |
118 M_sel = users_fitness_df.tail(fraction_of_population) | |
119 M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict() | |
120 for userID in M_sel_dict: | |
121 best_individuals[userID] = profile_u[userID] | |
122 | |
123 # Calculate sample mean and standard deviation | |
124 D = np.array([]) | |
125 for userID, features in best_individuals.iteritems(): | |
126 D = np.append(D, features, axis=0) | |
127 D.shape = (-1, num_features) | |
128 D_mu = np.mean(D, axis=0) | |
129 D_sigma = np.std(D, axis=0, ddof=1) | |
130 | |
131 # Sample M individuals | |
132 M = np.random.normal( | |
133 D_mu, | |
134 D_sigma, | |
135 (population_size, num_features) | |
136 ) | |
137 #M = 1 / (D_sigma * np.sqrt(2 * np.pi)) * np.exp(- (M_range - D_mu) ** 2 / (2 * D_sigma ** 2)) | |
138 | |
139 #M.shape = (-1, len(items.values()[0])) | |
140 #M = D_sigma * np.random.normal( | |
141 #population_size, | |
142 #len(items.values()[0]) | |
143 #) + D_mu | |
144 profile_u = {} | |
145 i = 0 | |
146 for userID in users_subset: | |
147 profile_u[userID] = M.tolist()[i] | |
148 i += 1 | |
149 generation += 1 | |
150 | |
151 return profile_u | |
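To summarise the loop above: each generation scores every profile, keeps the better-scoring half (the tail of the fitness-sorted frame), fits a per-feature normal distribution to those survivors, and resamples the whole population from it. A self-contained toy version of one generation, with an invented fitness (distance to a fixed target) rather than the recommender's log-similarity fitness:

import numpy as np

np.random.seed(0)
target = np.array([0.3, 0.7])
M = np.random.uniform(0, 1, (8, 2))        # population of 8 individuals
fitness = -np.abs(M - target).sum(axis=1)  # toy fitness: closer is better
best = M[np.argsort(fitness)[-4:]]         # keep the fitter half
mu = best.mean(axis=0)
sigma = best.std(axis=0, ddof=1)           # sample std, as in eda_train
M = np.random.normal(mu, sigma, (8, 2))    # resample next generation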
152 | |
153 # Similarity matrix | |
154 def cb_similarity(profileID, profile_data, test_data, N): | |
155 ''' Content-based: Similarity matrix ''' | |
156 similarity = [] | |
157 for songID in test_data[profileID]: | |
158 sim = adj_cos_sim(profile_data, song_library[songID]) | |
159 similarity.append((sim, songID)) | |
160 # Top-N recommendation | |
161 #similarity.sort(reverse=True) | |
162 #if len(similarity) > N: | |
163 #similarity = similarity[0:N] | |
164 | |
165 #sim_matrix[userID] = {t[1]: t[0] for t in similarity} | |
166 return {t[1]: t[0] for t in similarity} | |
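The return line merely re-keys the accumulated (similarity, songID) tuples into a songID-to-similarity dict; a standalone illustration with invented values:

pairs = [(0.42, 'SOFAKE000000001'), (0.17, 'SOFAKE000000002')]
print {t[1]: t[0] for t in pairs}  # {'SOFAKE000000001': 0.42, 'SOFAKE000000002': 0.17}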
167 | |
168 def evaluate_eda( | |
169 profiles, | |
170 test_data, | |
171 N=10, | |
172 rating_threshold=3, | |
173 EDA_threshold=0.3): | |
174 | |
175 ''' Evaluation ''' | |
176 | |
177 sim_matrix = {} | |
178 for userID, features in profiles.iteritems(): | |
179 sim_matrix[userID] = cb_similarity(userID, features, test_data, N) | |
180 | |
181 # Content-Based: Evaluation | |
182 tp = 0. | |
183 fp = 0. | |
184 fn = 0. | |
185 tn = 0. | |
186 for userID, songID_sim in sim_matrix.iteritems(): | |
187 for songID, sim_value in songID_sim.iteritems(): | |
188 score = test_data[userID][songID] | |
189 if score > rating_threshold and sim_value >= EDA_threshold: | |
190 tp += 1 | |
191 elif score <= rating_threshold and sim_value >= EDA_threshold: | |
192 fp += 1 | |
193 elif score > rating_threshold and sim_value < EDA_threshold: | |
194 fn += 1 | |
195 elif score <= rating_threshold and sim_value < EDA_threshold: | |
196 tn += 1 | |
197 | |
198 precision = tp / (tp + fp) | |
199 recall = tp / (tp + fn) | |
200 F1 = 2 * precision * recall / (precision + recall) | |
201 accuracy = (tp + tn) / (tp + fp + tn + fn) | |
202 | |
203 return precision, recall, F1, accuracy | |
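A worked check of the four metrics with made-up confusion counts (tp=6, fp=2, fn=3, tn=9), mirroring the formulas above:

tp, fp, fn, tn = 6., 2., 3., 9.
precision = tp / (tp + fp)                          # 0.75
recall = tp / (tp + fn)                             # ~0.667
F1 = 2 * precision * recall / (precision + recall)  # ~0.706
accuracy = (tp + tn) / (tp + fp + tn + fn)          # 0.75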
204 | |
205 #keys_a = set(users[userID].keys()) | |
206 #keys_b = set(test_data.keys()) | |
207 #intersection = keys_a & keys_b | |
208 #if len(intersection) != 0: | |
209 #similarity = {} | |
210 #print {k: v for k,v in song_library_fold[0].iteritems() if k in songs} | |
211 #for songID in intersection: | |
212 #if songID == k: | |
213 #similarity[songID] = adj_cos_sim( | |
214 #profile[userID], | |
215 #test_data[songID] | |
216 #) | |
217 #max_sim = max(similarity, key=similarity.get) | |
218 #if max_sim >= EDA_treshold: | |
219 #sim_matrix[userID] = {max_sim: similarity[max_sim]} | |
220 #sim_matrix[userID] = similarity | |
221 #sim_matrix[userID] = {max_sim: similarity[max_sim]} | |
222 | |
223 #print len(sim_matrix) | |
224 p = np.array([]) | |
225 f = np.array([]) | |
226 r = np.array([]) | |
227 a = np.array([]) | |
228 | |
229 for i in range(len(users_train)): | |
230 | |
231 profile_u = eda_train(users_likes_subset(users_train[i])) | |
232 pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i]) | |
233 p = np.append(p, pi) | |
234 r = np.append(r, ri) | |
235 f = np.append(f, fi) | |
236 a = np.append(a, ai) | |
237 | |
238 #precision = np.array(p) | |
239 #rec = np.array(r) | |
240 #F1 = np.array(f) | |
241 #accuracy = np.array(a) | |
242 | |
243 print "Precision = %f3 ± %f3" % (p.mean(), p.std()) | |
244 print "Recall = %f3 ± %f3" % (r.mean(), r.std()) | |
245 print "F1 = %f3 ± %f3" % (f.mean(), f.std()) | |
246 print "Accuracy = %f3 ± %f3" % (a.mean(), a.std()) | |
247 | |
248 '''# Collaborative-filtering: Similarity matrix | |
249 sim_matrix_cf = {} | |
250 count = 0 | |
251 for userID_1 in profile: | |
252 similarities = {} | |
253 for userID_2 in profile: | |
254 if userID_1 != userID_2: | |
255 similarities[userID_2] = adj_cos_sim( | |
256 profile[userID_1], | |
257 profile[userID_2] | |
258 ) | |
259 #print similarities | |
260 sim_matrix_cf[userID_1] = similarities''' | |
261 | |
262 # Predicted rating | |
263 #for userID in users: | |
264 # print np.array(users[userID].values()).mean() | |
265 | |
266 '''scores_above_threshold = { | |
267 songID: score for songID, score in songs.iteritems() if score > rating_threshold | |
268 }''' | |
269 | |
270 '''for key, value in sorted(similarity.iteritems(), key=lambda (k,v): (v,k), reverse=True): | |
271 print "%s: %s" % (key, value) | |
272 break''' | |
273 | |
274 | |
275 # Recommend new item | |
276 | |
277 ''' | |
278 def computeNearestNeighbor(itemName, itemVector, items): | |
279 """creates a sorted list of items based on their distance to item""" | |
280 distances = [] | |
281 for otherItem in items: | |
282 if otherItem != itemName: | |
283 distance = adj_cos_sim(itemVector, items[otherItem]) | |
284 distances.append((distance, otherItem)) | |
285 # sort based on distance -- closest first | |
286 distances.sort(reverse=True) | |
287 return distances | |
288 | |
289 def classify(user, itemName, itemVector): | |
290 """Classify the itemName based on user ratings | |
291 Should really have items and users as parameters""" | |
292 # first find nearest neighbor | |
293 nearest = computeNearestNeighbor(itemName, itemVector, song_library)[0][1] | |
294 rating = users[user][nearest] | |
295 return rating | |
296 ''' | |
297 # Source: guidetodatamining.com | |
298 '''def computeSimilarity(band1, band2, userRatings): | |
299 averages = {} | |
300 for (key, ratings) in userRatings.items(): | |
301 averages[key] = (float(sum(ratings.values())) / len(ratings.values())) | |
302 num = 0 # numerator | |
303 dem1 = 0 # first half of denominator | |
304 dem2 = 0 | |
305 for (user, ratings) in userRatings.items(): | |
306 if band1 in ratings and band2 in ratings: | |
307 avg = averages[user] | |
308 num += (ratings[band1] - avg) * (ratings[band2] - avg) | |
309 dem1 += (ratings[band1] - avg)**2 | |
310 dem2 += (ratings[band2] - avg)**2 | |
311 return num / (sqrt(dem1) * sqrt(dem2))''' | |
312 | |
313 ''' | |
314 sum_xy = 0 | |
315 sum_x2 = 0 | |
316 sum_y2 = 0 | |
317 n = 0 | |
318 for key in rating1: | |
319 if key in rating2: | |
320 n += 1 | |
321 x = rating1[key] | |
322 y = rating2[key] | |
323 sum_xy += x * y | |
324 if n == 0: | |
325 return 0 | |
326 | |
327 # now compute denominator | |
328 for key in rating1: | |
329 x = rating1[key] | |
330 sum_x2 += pow(x, 2) | |
331 | |
332 for key in rating2: | |
333 y = rating2[key] | |
334 sum_y2 += pow(y, 2) | |
335 | |
336 denominator = sqrt(sum_x2) * sqrt(sum_y2) | |
337 if denominator == 0: | |
338 return 0 | |
339 else: | |
340 return sum_xy / denominator''' | |
341 | |
70 ''' | 342 ''' |
71 # Median | 343 # Median |
72 # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python | 344 # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python |
73 def get_median(lst): | 345 def get_median(lst): |
74 return np.median(np.array(lst)) | 346 return np.median(np.array(lst)) |
84 def normalize_rating(ratings, median, asd): | 356 def normalize_rating(ratings, median, asd): |
85 for i in range(len(ratings)): | 357 for i in range(len(ratings)): |
86 ratings[i] = (ratings[i] - median) / asd | 358 ratings[i] = (ratings[i] - median) / asd |
87 return ratings | 359 return ratings |
88 ''' | 360 ''' |
89 # Normalise user play count | |
90 for userID in users: | |
91 song_play_count = pd.DataFrame( | |
92 users[userID].items(), | |
93 columns=["songID", "play_count"] | |
94 ) | |
95 '''Coefficient of variation''' | |
96 cv = song_play_count.play_count.std() / song_play_count.play_count.mean() | |
97 #user_ratings = np.array(users[userID].values()) | |
98 #cv = user_ratings.std()/user_ratings.mean() | |
99 #print userID, cv | |
100 if cv <= 0.5: | |
101 for songID, play_count in users[userID].items(): | |
102 users[userID][songID] = 3 | |
103 else: | |
104 song_play_count_q = pd.cut( | |
105 song_play_count["play_count"], | |
106 5, | |
107 labels=False | |
108 ) + 1 | |
109 song_play_count.play_count = song_play_count_q | |
110 users[userID] = song_play_count.set_index('songID')['play_count'].to_dict() | |
111 #print song_play_count | |
112 #median = get_median(user_ratings) | |
113 #asd = get_asd(user_ratings, median) | |
114 #for songID, play_count in users[userID].items(): | |
115 #users[userID][songID] = (play_count - median) / asd | |
116 | |
117 # Subset of most-liked items | |
118 users_subset = {} | |
119 for userID, songs in users.iteritems(): | |
120 scores_above_threshold = { | |
121 songID: score for songID, score in songs.iteritems() if score > 2 | |
122 } | |
123 users_subset[userID]= scores_above_threshold | |
124 ''' | |
125 for songID, score in songs.iteritems(): | |
126 print score >0 | |
127 if score > 0: | |
128 print {userID: {songID: score}} | |
129 | |
130 {k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0} | |
131 ''' | |
132 # Fitness function for EDA | |
133 def Fitness(profile, user_subset): | |
134 fitness_value = 0 | |
135 for songID, score in user_subset.iteritems(): | |
136 fitness_value += log10(score * manhattan(profile, items[songID])) | |
137 return fitness_value | |
138 | |
139 # Given parameters for EDA | |
140 population_size = len(users_subset) | |
141 fraction_of_population = int(round(0.5 * population_size)) | |
142 | |
143 # Generation of M individuals uniformly | |
144 np.random.seed(len(users_subset)) | |
145 M = np.random.rand(population_size, len(items.values()[0])) | |
146 #M.shape = (-1, len(items.values()[0])) | |
147 profile = {} | |
148 i = 0 | |
149 for userID in users_subset: | |
150 profile[userID] = M.tolist()[i] | |
151 i += 1 | |
152 | |
153 # Compute fitness values | |
154 users_fitness = {} | |
155 for userID in profile: | |
156 users_fitness[userID] = Fitness(profile[userID], users_subset[userID]) | |
157 users_fitness_df = pd.DataFrame( | |
158 users_fitness.items(), | |
159 columns=["userID", "fitness"] | |
160 ) | |
161 | |
162 # Selection of best individuals based on fitness values | |
163 best_individuals = {} | |
164 users_fitness_df = users_fitness_df.sort(columns='fitness') | |
165 M_sel = users_fitness_df.head(fraction_of_population) | |
166 M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict() | |
167 for userID in M_sel_dict: | |
168 best_individuals[userID] = profile[userID] | |
169 | |
170 # Calculate sample mean and standard deviation | |
171 np.random.seed(1) | |
172 g = mixture.GMM(n_components=10) | |
173 # Generate random observations with two modes centered on 0 | |
174 # and 10 to use for training. | |
175 obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) | |
176 g.fit(obs) | |
177 np.round(g.weights_, 2) | |
178 np.round(g.means_, 2) | |
179 np.round(g.covars_, 2) | |
180 g.predict([[0], [2], [9], [10]]) | |
181 np.round(g.score([[0], [2], [9], [10]]), 2) | |
182 # Refit the model on new data (initial parameters remain the | |
183 # same), this time with an even split between the two modes. | |
184 g.fit(20 * [[0]] + 20 * [[10]]) | |
185 np.round(g.weights_, 2) | |
186 | |
187 | 361 |
188 ''' | 362 ''' |
189 # Pearson Correlation Coefficient | 363 # Pearson Correlation Coefficient |
190 def pearson(rating1, rating2): | 364 def pearson(rating1, rating2): |
191 sum_xy = 0 | 365 sum_xy = 0 |
211 sqrt(sum_y2 - pow(sum_y, 2) / n) | 385 sqrt(sum_y2 - pow(sum_y, 2) / n) |
212 if denominator == 0: | 386 if denominator == 0: |
213 return 0 | 387 return 0 |
214 else: | 388 else: |
215 return (sum_xy - (sum_x * sum_y) / n) / denominator | 389 return (sum_xy - (sum_x * sum_y) / n) / denominator |
216 | 390 ''' |
217 # Cosine Similarity for test purposes | 391 ''' |
218 def cosine_similarity(rating1, rating2): | 392 def buckets(filename, bucketName, separator, classColumn): |
219 sum_xy = 0 | 393 """the original data is in the file named filename |
220 sum_x2 = 0 | 394 bucketName is the prefix for all the bucket names |
221 sum_y2 = 0 | 395 separator is the character that divides the columns |
222 n = 0 | 396 (for ex., a tab or comma and classColumn is the column |
223 for key in rating1: | 397 that indicates the class""" |
224 if key in rating2: | 398 # put the data in 10 buckets |
225 n += 1 | 399 numberOfBuckets = 10 |
226 x = rating1[key] | 400 data = {} |
227 y = rating2[key] | 401 # first read in the data and divide by category |
228 sum_xy += x * y | 402 with open(filename) as f: |
229 if n == 0: | 403 lines = f.readlines() |
230 return 0 | 404 for line in lines: |
231 | 405 if separator != '\t': |
232 # now compute denominator | 406 line = line.replace(separator, '\t') |
233 for key in rating1: | 407 # first get the category |
234 x = rating1[key] | 408 category = line.split()[classColumn] |
235 sum_x2 += pow(x, 2) | 409 data.setdefault(category, []) |
236 | 410 data[category].append(line) |
237 for key in rating2: | 411 # initialize the buckets |
238 y = rating2[key] | 412 buckets = [] |
239 sum_y2 += pow(y, 2) | 413 for i in range(numberOfBuckets): |
240 | 414 buckets.append([]) |
241 denominator = sqrt(sum_x2) * sqrt(sum_y2) | 415 # now for each category put the data into the buckets |
242 if denominator == 0: | 416 for k in song_library.keys(): |
243 return 0 | 417 #randomize order of instances for each class |
244 else: | 418 print random.shuffle(song_library[k]) |
245 return sum_xy / denominator | 419 bNum = 0 |
246 | 420 # divide into buckets |
247 | 421 for item in song_library[k]: |
248 | 422 buckets[bNum].append(item) |
249 def Fitness(profile, user_index): | 423 bNum = (bNum + 1) % numberOfBuckets |
250 sim = 0 | 424 # write to file |
251 sum_log = 0 | 425 for bNum in range(numberOfBuckets): |
252 | 426 f = open("%s-%02i" % (bucketName, bNum + 1), 'w') |
253 features = profile.items()[user_index][1] | 427 for item in buckets[bNum]: |
254 songs = users.items()[user_index][1] | 428 f.write(item) |
255 | 429 f.close() |
256 for song, rating in songs.items(): | 430 ''' |
257 sim = pearson(features, items[song]) | 431 |
258 print(sim) | 432 '''# Functions to compute similarity between items or between profiles |
259 | 433 # Source: http://www.guidetodatamining.com |
260 for username, songs in users.items(): | 434 def manhattan(vector1, vector2): |
261 for song, rating in songs.items(): | 435 """Computes the Manhattan distance.""" |
262 sim = pearson(profile, items[song]) | 436 return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))''' |
263 #sum_log += log10(rating * sim) | |
264 return sim | |
265 ''' | |
266 | |
267 | |
268 | |
269 | |
270 | |
271 | |
272 | |
273 | |
274 |