comparison Code/eda_discrete.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents
children e4bcfe00abf4
comparison
equal deleted inserted replaced
24:68a62ca32441 25:fafc0b249a73
1 # -*- coding: utf-8 -*-
2 """
3 Created on Wed Jul 22 17:42:09 2015
4
5 @author: paulochiliguano
6 """
7
8
9 from math import log, sqrt
10 import numpy as np
11 import pandas as pd
12 import cPickle as pickle
13 #import random
14
# Item-vector dictionary: maps songID -> feature vector (presumably genre
# probabilities, going by the file name -- confirm against the producer).
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles generated by this project.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data. Both objects are indexed by the same integer
# positions further down (users_train[i] / users_test[i]).
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)
26
# Cosine Similarity
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two numeric vectors.

    Both arguments are iterated element-wise and are expected to have the
    same length.

    Returns 0.0 when either vector has zero Euclidean norm: the cosine is
    undefined there, and the unguarded division used to raise
    ZeroDivisionError.
    """
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    length_x = sqrt(sum(x ** 2 for x in vector1))
    length_y = sqrt(sum(y ** 2 for y in vector2))
    norm_product = length_x * length_y
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product
33
# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Return the mean-centred cosine similarity of two vectors.

    Each vector is centred on its own arithmetic mean before the cosine is
    taken, so the result lies in [-1, 1] up to rounding.

    Returns 0.0 when the similarity is undefined, i.e. when either vector is
    empty or has zero variance (all components equal). Those cases used to
    fail on a division by zero.
    """
    # len() rather than truthiness so that numpy arrays are accepted too.
    if len(vector_i) == 0 or len(vector_j) == 0:
        return 0.0

    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    dev_i = [w_i - avrg_w_i for w_i in vector_i]
    dev_j = [w_j - avrg_w_j for w_j in vector_j]

    num = sum(d_i * d_j for d_i, d_j in zip(dev_i, dev_j))
    dem1 = sum(d_i ** 2 for d_i in dev_i)
    dem2 = sum(d_j ** 2 for d_j in dev_j)

    denominator = sqrt(dem1) * sqrt(dem2)
    if denominator == 0:
        return 0.0
    return num / denominator
47
# Fitness function for EDA
def Fitness(profile_u, user_subset):
    """Return the log-likelihood-style fitness of a candidate user profile.

    For every (songID, score) pair in ``user_subset`` the term
    ``log(score * cosine_similarity(profile_u, song_library[songID]))`` is
    accumulated. Terms whose logarithm is undefined (similarity <= 0 or
    score <= 0) contribute a fixed penalty instead, so a non-positive score
    no longer raises ``ValueError`` from ``log``.

    profile_u   -- feature vector of the candidate profile.
    user_subset -- dict mapping songID -> score; every songID must be a key
                   of the module-level ``song_library``.
    """
    # Approximately log(sys.float_info.min): stands in for "log of zero".
    log_floor = -708

    fitness_value = 0
    for songID, score in user_subset.items():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0 or score <= 0:
            fitness_value += log_floor
        else:
            fitness_value += log(score * sim)
    return fitness_value
62
def users_likes_subset(users, rating_threshold=3):
    """Keep, for every user, only the songs rated above a threshold.

    users            -- dict mapping userID -> {songID: score}.
    rating_threshold -- a song is kept only if its score is strictly
                        greater than this value.

    Returns a new dict userID -> {songID: score}. Users with no song above
    the threshold are kept with an empty dict. The input is not modified.
    """
    users_subset = {}
    # .items() instead of .iteritems() so this runs on Python 2 and 3 alike.
    for userID, songs in users.items():
        users_subset[userID] = {
            songID: score
            for songID, score in songs.items()
            if score > rating_threshold
        }
    return users_subset
80
def _sample_population(user_ids, weights, num_features, p=None):
    """Draw one candidate profile per user from the (tag, weight) set Ku.

    Ku is indexed implicitly: candidate ``idx`` is the pair
    ``(idx // len(weights), weights[idx % len(weights)])``, i.e. all weights
    of tag 0 first, then all weights of tag 1, and so on.

    Returns ``(profiles, chosen)`` where ``profiles[user]`` is the dense
    feature vector and ``chosen[user]`` lists the Ku indices that ended up in
    that vector.
    """
    n_weights = len(weights)
    n_candidates = num_features * n_weights
    profiles = {}
    chosen = {}
    for user_id in user_ids:
        draws = np.random.choice(n_candidates, num_features, p=p).tolist()
        # A tag can be drawn several times; the last draw for a tag wins.
        by_tag = {}
        for idx in draws:
            by_tag[idx // n_weights] = idx
        feature_v = [0.0] * num_features
        for tag, idx in by_tag.items():
            feature_v[tag] = weights[idx % n_weights]
        profiles[user_id] = feature_v
        chosen[user_id] = list(by_tag.values())
    return profiles, chosen


def eda_train(users_subset, max_gen=1000):
    """Learn one feature-vector profile per user with a discrete EDA.

    Each generation scores the current profiles with ``Fitness``, keeps the
    best half, re-estimates the probability of every (tag, weight) pair from
    the kept profiles and samples a fresh population from that model.

    users_subset -- dict userID -> {songID: score}, as produced by
                    ``users_likes_subset``.
    max_gen      -- number of generations to run.

    Returns ``(profile_u, p)``: ``profile_u`` maps userID -> feature vector
    of the final population, and ``p`` is the list of probabilities (one per
    (tag, weight) pair, summing to 1) that generated it. For an empty
    ``users_subset`` the result is ``({}, [])``.

    Fixes relative to the previous revision:
    * the model was handed to ``np.random.choice`` positionally, where it
      landed in the ``replace`` parameter, so sampling stayed uniform; it is
      now passed as ``p=`` and normalised to sum to 1 as that parameter
      requires;
    * selection uses ``sorted`` instead of the removed ``DataFrame.sort``;
    * the number of tags follows the feature-vector length instead of a
      hard-coded list of ten genres.
    """
    # TRAINING
    population_size = len(users_subset)
    if population_size == 0:
        return {}, []

    num_features = len(next(iter(song_library.values())))
    # Half of the population survives selection (at least one individual).
    fraction_of_population = max(1, int(round(0.5 * population_size)))

    # Ku set: every tag index combined with every candidate weight.
    # np.linspace defaults to 50 evenly spaced values.
    weights = np.linspace(0.1, 0.9).tolist()
    n_candidates = num_features * len(weights)

    # Generate initial population (uniform over Ku); fixed seed keeps runs
    # reproducible.
    np.random.seed(12345)
    profile_u, profile_aux = _sample_population(
        users_subset, weights, num_features
    )
    p = [1.0 / n_candidates] * n_candidates

    for _ in range(max_gen):
        # Compute fitness values
        users_fitness = {
            userID: Fitness(profile_u[userID], users_subset[userID])
            for userID in profile_u
        }

        # Selection of best individuals based on fitness values
        ranked = sorted(users_fitness, key=users_fitness.get)
        best_users = ranked[-fraction_of_population:]

        # Update probability model: relative frequency of each Ku entry
        # among the selected profiles.
        counts = np.zeros(n_candidates)
        for userID in best_users:
            for idx in profile_aux[userID]:
                counts[idx] += 1
        p = (counts / counts.sum()).tolist()

        # Sample new population
        profile_u, profile_aux = _sample_population(
            users_subset, weights, num_features, p=p
        )

    return profile_u, p
177
# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    """Content-based scores of one user's test songs against a profile.

    profileID    -- key into ``test_data`` selecting the user.
    profile_data -- that user's profile feature vector.
    test_data    -- dict userID -> collection of songIDs (iterating it must
                    yield songIDs present in ``song_library``).
    N            -- accepted for interface compatibility; not used here.

    Returns a dict songID -> adjusted cosine similarity between
    ``profile_data`` and the song's vector in ``song_library``.
    """
    scores = {}
    for songID in test_data[profileID]:
        scores[songID] = adj_cos_sim(profile_data, song_library[songID])
    return scores
193
def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=3,
        EDA_treshold=0.5):
    """Evaluate learned profiles as a binary like / dislike classifier.

    profiles         -- dict userID -> profile feature vector.
    test_data        -- dict userID -> {songID: score} of held-out ratings.
    N                -- forwarded to ``cb_similarity``.
    rating_threshold -- a song counts as liked when score > this value.
    EDA_treshold     -- a song counts as recommended when its similarity to
                        the profile is >= this value.

    Returns ``(precision, recall, F1, accuracy)``. A metric whose denominator
    is zero (e.g. nothing was recommended, or there is no test data) is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    sim_matrix = {}
    for userID, features in profiles.items():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-Based: Evaluation (confusion-matrix counts kept as floats)
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for userID, songID_sim in sim_matrix.items():
        for songID, sim_value in songID_sim.items():
            score = test_data[userID][songID]
            if score > rating_threshold and sim_value >= EDA_treshold:
                tp += 1
            elif score <= rating_threshold and sim_value >= EDA_treshold:
                fp += 1
            elif score > rating_threshold and sim_value < EDA_treshold:
                fn += 1
            elif score <= rating_threshold and sim_value < EDA_treshold:
                tn += 1

    total = tp + fp + tn + fn
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0
    accuracy = (tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy
230
# Run training + evaluation once per entry of users_train / users_test and
# collect the four metrics of every run.
precisions = []
recalls = []
f1_scores = []
accuracies = []

for i in range(len(users_train)):

    profile_u, prob = eda_train(users_likes_subset(users_train[i]))
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
    precisions.append(pi)
    recalls.append(ri)
    f1_scores.append(fi)
    accuracies.append(ai)

p = np.array(precisions)
r = np.array(recalls)
f = np.array(f1_scores)
a = np.array(accuracies)

# Report mean and standard deviation over the runs with three decimals.
# The previous "%f3" printed six decimals followed by a literal "3".
# print(...) with a single argument is valid on Python 2 and 3.
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))