Repository: hybrid-music-recommender-using-content-based-and-social-information
Comparison: Code/eda_discrete.py @ 25:fafc0b249a73 (24:68a62ca32441 → 25:fafc0b249a73)
Description: Final code

author    Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date      Sun, 23 Aug 2015 16:47:54 +0100
parents
children  e4bcfe00abf4
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""


from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle  # Python 2: C implementation of pickle

# Item-vector dictionary: songID -> genre-probability vector
f = open('/Users/paulochiliguano/Documents/msc-project/dataset/\
genre_classification/genre_prob.pkl', 'rb')
song_library = pickle.load(f)
f.close()
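# Assumption, inferred from the usage below: each value in song_library is a
# fixed-length vector of genre probabilities, one entry per genre; its length
# determines num_features in eda_train().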

# Load training and test data (cross-validation folds)
f = open('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'rb')
users_train, users_test = pickle.load(f)
f.close()
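# Assumption, inferred from the usage below: users_train and users_test are
# parallel lists of folds, each fold a dict of the form
# {userID: {songID: rating}}.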

# Cosine similarity
def cosine_similarity(vector1, vector2):
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)
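# Note: cos(x, y) = x.y / (|x||y|) is undefined for an all-zero vector, so
# the genre vectors are assumed to be non-zero.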

# Adjusted cosine similarity
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))

    return num / (sqrt(dem1) * sqrt(dem2))
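# Note: because each vector is centred by its own mean, this is the Pearson
# correlation of the two vectors across the genre dimensions.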

# Fitness function for EDA
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            # log(sys.float_info.min) ~= -708: floor the term instead of
            # raising a math domain error on log of a non-positive number
            fitness_value += -708
        else:
            fitness_value += log(score * sim)
        # Alternatives left from development: log(score * manhattan(...))
        # and the raw score-weighted similarity without the log
    return fitness_value
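# The fitness of a candidate profile is the sum of log(rating * similarity)
# over the songs the user liked, so profiles that point towards highly rated
# songs score higher; the -708 floor keeps hopeless candidates comparable.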

def users_likes_subset(users, rating_threshold=3):
    # Keep, per user, only the items rated above the threshold
    users_subset = {}
    for userID, songs in users.iteritems():
        scores_above_threshold = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold
        }
        users_subset[userID] = scores_above_threshold
    return users_subset
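# Assumption: ratings look like a 1-5 scale, so rating_threshold=3 splits
# items into "liked" (4-5) and "not liked" (<=3).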

def eda_train(users_subset, max_gen=1000):
    # TRAINING
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Ku set: the discrete search space of (genre index, weight) pairs
    weights = list(np.linspace(0.1, 0.9))
    tags = [
        'blues',
        'classical',
        'country',
        'disco',
        'hiphop',
        'jazz',
        'metal',
        'pop',
        'reggae',
        'rock'
    ]
    # Represent each genre by its index in the list above
    tags = range(len(tags))
    list_a = np.tile(weights, num_features)
    list_b = np.repeat(tags, len(weights))
    Ku = zip(list_b, list_a)
    Ku_np = np.array(Ku, dtype=('int, float'))
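    # With np.linspace's default of 50 samples, Ku holds 10 genres x 50
    # weights = 500 candidate pairs; Ku_np is the same alphabet as a
    # structured array so np.random.choice can sample whole pairs.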

    # Generate initial population: one candidate profile per user
    np.random.seed(12345)
    profile_u = {}
    profile_aux = {}
    for userID in users_subset:
        a = np.random.choice(Ku_np, num_features).tolist()
        b = {t[0]: t[1] for t in a}
        feature_v = list(np.zeros(num_features))
        for k, v in b.iteritems():
            feature_v[k] = v
        profile_u[userID] = feature_v
        profile_aux[userID] = [(k, v) for k, v in b.iteritems()]
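    # Note: b collapses duplicate genre draws, so a profile carries at most
    # num_features distinct genre weights and zeros for genres never drawn.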

    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"]
        )

        # Selection of best individuals based on fitness values
        # (DataFrame.sort is the pandas API of 2015; later versions use
        # sort_values)
        users_fitness_df = users_fitness_df.sort(columns='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()

        Xs = []
        for userID in M_sel_dict:
            Xs.extend(profile_aux[userID])

        # Update probability model: frequency of each (genre, weight) pair
        # among the selected individuals
        p = [float(Xs.count(i)) / fraction_of_population for i in Ku]
        # Normalise: np.random.choice requires the probabilities to sum to 1,
        # whereas these raw frequencies sum to roughly num_features
        p = list(np.asarray(p) / sum(p))

        # Sample new population from the updated model
        profile_u = {}
        profile_aux = {}
        for userID in users_subset:
            # p must be passed by keyword: the third positional argument of
            # np.random.choice is `replace`, not the probability vector
            a = np.random.choice(Ku_np, num_features, p=p).tolist()
            b = {t[0]: t[1] for t in a}
            feature_v = list(np.zeros(num_features))
            for k, v in b.iteritems():
                feature_v[k] = v
            profile_u[userID] = feature_v
            profile_aux[userID] = [(k, v) for k, v in b.iteritems()]

        generation += 1

    return profile_u, p
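# eda_train returns the final per-user profiles and the last probability
# vector; the caller below keeps the latter as `prob` but never uses it.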

# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    '''Content-based: similarity of one user profile to each test item'''
    similarity = []
    for songID in test_data[profileID]:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    # Top-N recommendation (disabled):
    #similarity.sort(reverse=True)
    #if len(similarity) > N:
    #    similarity = similarity[0:N]
    return {t[1]: t[0] for t in similarity}
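# Note: with the Top-N cut disabled, N is unused and every test item of the
# user gets a similarity score.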

def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=3,
        EDA_threshold=0.5):

    '''Evaluation against the held-out fold'''

    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-based evaluation: confusion-matrix counts, treating
    # similarity >= EDA_threshold as "recommended" and rating >
    # rating_threshold as "relevant"
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for userID, songID_sim in sim_matrix.iteritems():
        for songID, sim_value in songID_sim.iteritems():
            score = test_data[userID][songID]
            if score > rating_threshold and sim_value >= EDA_threshold:
                tp += 1
            elif score <= rating_threshold and sim_value >= EDA_threshold:
                fp += 1
            elif score > rating_threshold and sim_value < EDA_threshold:
                fn += 1
            elif score <= rating_threshold and sim_value < EDA_threshold:
                tn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy
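# Caveat: precision (and hence F1) raises ZeroDivisionError on a fold where
# no similarity reaches EDA_threshold; the folds here are assumed to yield at
# least one recommendation.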

# Cross-validated evaluation over all folds
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

for i in range(len(users_train)):

    profile_u, prob = eda_train(users_likes_subset(users_train[i]))
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())
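# Environment note (assumption): the script targets Python 2 (print
# statements, iteritems, cPickle) with numpy and an early pandas release
# whose DataFrame still provides .sort(columns=...).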