annotate Code/eda_discrete.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents
children e4bcfe00abf4
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""


from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle

# Item-vector dictionary: maps each songID to its genre-probability vector
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data (cross-validation folds)
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)

# Cosine similarity
def cosine_similarity(vector1, vector2):
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)

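# Quick sanity checks (illustrative vectors, not taken from the dataset):
#   cosine_similarity([1, 0], [1, 0]) -> 1.0 (identical direction)
#   cosine_similarity([1, 0], [0, 1]) -> 0.0 (orthogonal vectors)
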
# Adjusted cosine similarity
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    den1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    den2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))

    return num / (sqrt(den1) * sqrt(den2))

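# Note: because each vector is centred on its own mean, adj_cos_sim is
# the Pearson correlation coefficient between the two vectors:
#   sim(i, j) = sum_k (w_ik - mean_i) * (w_jk - mean_j)
#             / (sqrt(sum_k (w_ik - mean_i)^2) * sqrt(sum_k (w_jk - mean_j)^2))
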
# Fitness function for EDA
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            # log() is undefined for non-positive arguments, so use
            # ~log(sys.float_info.min) as a floor
            fitness_value += -708
        else:
            fitness_value += log(score * sim)
    return fitness_value

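# The fitness of a candidate profile u over a user's liked songs is
#   F(u) = sum_s log(score_s * cos(u, v_s))
# where v_s is the genre vector of song s in song_library; -708 is
# roughly log(sys.float_info.min), the smallest finite log value.
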
def users_likes_subset(users, rating_threshold=3):
    # Subset of most-liked items: keep only ratings above the threshold
    users_subset = {}
    for userID, songs in users.iteritems():
        scores_above_threshold = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold
        }
        users_subset[userID] = scores_above_threshold

    return users_subset

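# Worked example with made-up ratings:
#   users_likes_subset({'u1': {'s1': 5, 's2': 2}})
#   -> {'u1': {'s1': 5}} with the default rating_threshold of 3
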
def eda_train(users_subset, max_gen=1000):
    # TRAINING
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Ku set: the candidate (genre index, weight) genes
    weights = list(np.linspace(0.1, 0.9))  # 50 weights by default
    tags = [
        'blues',
        'classical',
        'country',
        'disco',
        'hiphop',
        'jazz',
        'metal',
        'pop',
        'reggae',
        'rock'
    ]
    tags = range(len(tags))  # genes carry the genre index, not the name
    list_a = np.tile(weights, num_features)
    list_b = np.repeat(tags, len(weights))
    Ku = zip(list_b, list_a)
    Ku_np = np.array(Ku, dtype=('int, float'))
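    # Ku enumerates the candidate genes: one (genre index, weight) pair
    # for every tag above and every weight, i.e. pairs from (0, 0.1) up
    # to (9, 0.9)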

    # Generate initial population: sample num_features genes uniformly
    # at random from Ku for every user
    np.random.seed(12345)
    profile_u = {}
    profile_aux = {}
    for userID in users_subset:
        a = np.random.choice(
            Ku_np,
            num_features,
        ).tolist()
        # duplicate genre indices collapse in the dict, so some
        # features may remain zero
        b = {t[0]: t[1] for t in a}
        feature_v = list(np.zeros(num_features))
        for k, v in b.iteritems():
            feature_v[k] = v
        profile_u[userID] = feature_v
        profile_aux[userID] = [(k, v) for k, v in b.iteritems()]

    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"]
        )

        # Selection of the best individuals based on fitness values
        users_fitness_df = users_fitness_df.sort_values(by='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()

        # Pool the genes of the selected individuals
        Xs = []
        for userID in M_sel_dict:
            Xs.extend(profile_aux[userID])

        # Update probability model: relative frequency of each
        # candidate gene among the selected individuals
        p = []
        for i in Ku:
            p.append(float(Xs.count(i)) / fraction_of_population)
        # normalise so that p is a valid distribution for np.random.choice
        p = list(np.asarray(p) / np.sum(p))

        # Sample new population from the updated model (p must be passed
        # by keyword; the third positional argument is `replace`)
        profile_u = {}
        profile_aux = {}
        for userID in users_subset:
            a = np.random.choice(
                Ku_np,
                num_features,
                p=p
            ).tolist()
            b = {t[0]: t[1] for t in a}
            feature_v = list(np.zeros(num_features))
            for k, v in b.iteritems():
                feature_v[k] = v
            profile_u[userID] = feature_v
            profile_aux[userID] = [(k, v) for k, v in b.iteritems()]

        generation += 1

    return profile_u, p

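# eda_train follows the standard discrete EDA loop: evaluate fitness,
# keep the top half of the population (truncation selection),
# re-estimate the gene probabilities from the survivors, and resample.
# It returns the final profiles together with the learned distribution.
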
# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    ''' Content-based: Similarity matrix '''
    similarity = []
    for songID in test_data[profileID]:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    # Top-N recommendation (N is kept for this variant but unused here):
    #similarity.sort(reverse=True)
    #if len(similarity) > N:
    #    similarity = similarity[0:N]
    return {t[1]: t[0] for t in similarity}

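# For one user, cb_similarity returns {songID: adjusted-cosine
# similarity between the learned profile and that song's genre vector},
# computed over the user's test items.
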
def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=3,
        EDA_threshold=0.5):
    ''' Evaluation '''
    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-based: evaluation against the confusion matrix
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for userID, songID_sim in sim_matrix.iteritems():
        for songID, sim_value in songID_sim.iteritems():
            score = test_data[userID][songID]
            if score > rating_threshold and sim_value >= EDA_threshold:
                tp += 1
            elif score <= rating_threshold and sim_value >= EDA_threshold:
                fp += 1
            elif score > rating_threshold and sim_value < EDA_threshold:
                fn += 1
            else:
                tn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy

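# Each (user, song) pair in the test fold is binned into a 2x2
# confusion matrix: "relevant" means rating > rating_threshold and
# "recommended" means similarity >= EDA_threshold. The metrics then
# follow the usual definitions, e.g.
#   precision = tp / (tp + fp), recall = tp / (tp + fn),
#   F1 = 2 * precision * recall / (precision + recall)
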
# Run the EDA over every cross-validation fold and aggregate the metrics
p = np.array([])
r = np.array([])
f = np.array([])
a = np.array([])

for i in range(len(users_train)):
    profile_u, prob = eda_train(users_likes_subset(users_train[i]))
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())