annotate Code/eda_discrete.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents ae650489d3a8
children
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""


# NB: this script targets Python 2 (cPickle, dict.iteritems, print
# statements).
from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle
import time

# Item-vector dictionary
f = open('/Users/paulochiliguano/Documents/msc-project/dataset/\
genre_classification/genre_prob.pkl', 'rb')
song_library = pickle.load(f)
f.close()

# Load training and test data
f = open('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'rb')
users_train, users_test = pickle.load(f)
f.close()

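# As used below, song_library maps songID -> genre-probability vector (one
# value per tag), and users_train / users_test are parallel lists of
# cross-validation folds, each a dict of userID -> {songID: rating}. (This
# description is inferred from how the pickles are consumed in this script.)
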
# Cosine Similarity
def cosine_similarity(vector1, vector2):
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)

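# Sanity check (hypothetical values): parallel vectors score 1.0 and
# orthogonal vectors 0.0, e.g. cosine_similarity([1, 0], [1, 0]) == 1.0
# and cosine_similarity([1, 0], [0, 1]) == 0.0.
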
# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))

    return num / (sqrt(dem1) * sqrt(dem2))

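# Note: as implemented, each vector is centred by its own mean, so this is
# the Pearson correlation between the two vectors rather than the item-based
# "adjusted cosine" that centres ratings by user means.
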
# Fitness function for EDA
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            # Floor at log(sys.float_info.min) ~= -708 instead of taking
            # the log of zero or of a negative number
            fitness_value += -708
        else:
            fitness_value += log(score * sim)
    return fitness_value

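# Example (hypothetical values): a song rated 4 with similarity 0.5 to the
# profile contributes log(4 * 0.5) = log(2) ~= 0.693 to the fitness.
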
def users_likes_subset(users, rating_threshold=2):
    # Subset of most-liked items
    users_subset = {}
    for userID, songs in users.iteritems():
        scores_above_threshold = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold
        }
        users_subset[userID] = scores_above_threshold

    return users_subset

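# Example (hypothetical values): with the default rating_threshold=2 on a
# 1-5 scale, {'s1': 1, 's2': 3, 's3': 5} is reduced to {'s2': 3, 's3': 5}.
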
def eda_train(users_subset, max_gen=250):
    # TRAINING
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Ku set: candidate (tag index, weight) pairs
    weights = list(np.linspace(0.1, 0.9))  # 50 weight levels by default
    tags = [
        'blues',
        'classical',
        'country',
        'disco',
        'hiphop',
        'jazz',
        'metal',
        'pop',
        'reggae',
        'rock'
    ]
    # Replace tag names with their integer indices 0..9
    for i, j in enumerate(tags):
        tags[i] = i
    list_a = np.tile(weights, num_features)
    list_b = np.repeat(tags, len(weights))
    Ku = zip(list_b, list_a)
    Ku_np = np.array(Ku, dtype=('int, float'))
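    # The search space is therefore discrete: 10 tags x 50 weight levels
    # = 500 candidate (tag, weight) genes, and each profile is built from
    # num_features draws over these genes.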

    # Generate initial population
    np.random.seed(12345)
    profile_u = {}
    profile_aux = {}
    for userID in users_subset:
        a = np.random.choice(
            Ku_np,
            num_features,
        ).tolist()
        # Duplicate tag indices collapse in the dict, so a profile may end
        # up with fewer than num_features non-zero entries
        b = {t[0]: t[1] for t in a}
        feature_v = list(np.zeros(num_features))
        for k, v in b.iteritems():
            feature_v[k] = v
        profile_u[userID] = feature_v
        profile_aux[userID] = [(k, v) for k, v in b.iteritems()]

    fitnesses = []

    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"]
        )
        fitnesses.append(users_fitness_df.fitness.values.tolist())

        # Selection of best individuals based on fitness values
        # (DataFrame.sort was removed in pandas 0.20; sort_values is the
        # equivalent, available since 0.17)
        users_fitness_df = users_fitness_df.sort_values(by='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()

        Xs = []
        for userID in M_sel_dict:
            Xs.extend(profile_aux[userID])

        # Update probability model: relative frequency of each (tag, weight)
        # gene among the selected individuals. Normalising by the total
        # number of sampled genes (rather than by fraction_of_population)
        # makes p a proper distribution, which np.random.choice requires.
        p = []
        for i in Ku:
            p.append(float(Xs.count(i)) / len(Xs))

        # Sample new population from the updated model
        profile_u = {}
        profile_aux = {}
        for userID in users_subset:
            a = np.random.choice(
                Ku_np,
                num_features,
                # the third positional argument is `replace`, so the
                # probabilities must be passed by keyword
                p=p
            ).tolist()
            b = {t[0]: t[1] for t in a}
            feature_v = list(np.zeros(num_features))
            for k, v in b.iteritems():
                feature_v[k] = v
            profile_u[userID] = feature_v
            profile_aux[userID] = [(k, v) for k, v in b.iteritems()]

        generation += 1

    return profile_u, p, np.array(fitnesses)

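# eda_train returns the final profiles (userID -> feature vector), the last
# probability vector over Ku, and a max_gen x population fitness history;
# only the profiles are used by the evaluation below.
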
# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    # Collect every song that appears in the test fold
    a = []
    for user, info in test_data.iteritems():
        a.extend([i for i in info])
    songIDs = list(set(a))

    # Content-based: similarity of the profile to each candidate song
    similarity = []
    for songID in songIDs:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    # Top-N recommendation
    similarity.sort(reverse=True)
    if len(similarity) > N:
        similarity = similarity[0:N]

    return {t[1]: t[0] for t in similarity}

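# The result maps at most N songIDs to their adjusted-cosine similarity with
# the user's learned profile; songs outside this top-N list are treated as
# not recommended during evaluation.
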
def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=2,
        EDA_threshold=0.5):  # only used by the commented-out variant below
    ''' Evaluation '''

    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-Based: Evaluation
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.

    for user, song_rating in test_data.iteritems():
        entries = sim_matrix[user]
        for song, rating in song_rating.iteritems():
            if song in entries:
                if rating > rating_threshold:
                    tp += 1
                else:
                    fp += 1
            else:
                if rating > rating_threshold:
                    fn += 1
                else:
                    tn += 1

    # Alternative evaluation, kept for reference: count a song as
    # recommended only if its similarity clears EDA_threshold.
    # for userID, songID_sim in sim_matrix.iteritems():
    #     for songID, sim_value in songID_sim.iteritems():
    #         score = test_data[userID][songID]
    #         if score > rating_threshold and sim_value >= EDA_threshold:
    #             tp += 1
    #         elif score <= rating_threshold and sim_value >= EDA_threshold:
    #             fp += 1
    #         elif score > rating_threshold and sim_value < EDA_threshold:
    #             fn += 1
    #         elif score <= rating_threshold and sim_value < EDA_threshold:
    #             tn += 1
    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0
        recall = 0
        F1 = 0

    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy

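# A test song counts as recommended iff it appears in the user's top-N list
# and as relevant iff its held-out rating exceeds rating_threshold; the
# metrics are the usual precision, recall, F1 (harmonic mean of the first
# two) and accuracy over the resulting confusion matrix.
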
# Per-fold metric accumulators: precision, F1, recall, accuracy
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

for i in range(len(users_train)):
    start_time = time.time()
    profile_u, prob, fitnesses = eda_train(users_likes_subset(users_train[i]))
    elapsed_time = time.time() - start_time
    print 'Training execution time: %.3f seconds' % elapsed_time

    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i], N=20)
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())