# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from math import log, sqrt
import numpy as np
import pandas as pd
try:
    import cPickle as pickle  # Python 2: C-accelerated pickle
except ImportError:
    import pickle  # Python 3: cPickle was folded into pickle
import time

# Item-vector dictionary: songID -> genre-probability feature vector.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Cross-validation folds: (users_train, users_test), each a list of
# per-fold {userID: {songID: rating}} dicts.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)

# math.log(sys.float_info.min) ~= -708.4: floor added to the fitness when
# log() would be undefined for a non-positive similarity.
_LOG_FLOOR = -708


def cosine_similarity(vector1, vector2):
    """Cosine similarity between two equal-length numeric vectors.

    Returns 0.0 when either vector has zero magnitude, where the cosine
    is undefined (the original raised ZeroDivisionError there).
    """
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    length_x = sqrt(sum(x ** 2 for x in vector1))
    length_y = sqrt(sum(y ** 2 for y in vector2))
    if length_x == 0 or length_y == 0:
        return 0.0
    return dot_product / (length_x * length_y)


def adj_cos_sim(vector_i, vector_j):
    """Adjusted (mean-centred) cosine similarity between two vectors.

    Each vector is centred on its own mean before the cosine is taken, so
    a uniform offset does not inflate the similarity.  Returns 0.0 when
    either centred vector has zero variance (undefined cosine).
    """
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum((w_i - avrg_w_i) * (w_j - avrg_w_j)
              for w_i, w_j in zip(vector_i, vector_j))
    dem1 = sum((w_i - avrg_w_i) ** 2 for w_i in vector_i)
    dem2 = sum((w_j - avrg_w_j) ** 2 for w_j in vector_j)
    if dem1 == 0 or dem2 == 0:
        return 0.0
    return num / (sqrt(dem1) * sqrt(dem2))


def Fitness(profile_u, user_subset):
    """Fitness of a candidate profile against one user's liked songs.

    Sums log(score * cosine_similarity(profile, song_vector)) over every
    (songID, score) in user_subset; when the similarity is non-positive the
    log is undefined and the floor _LOG_FLOOR is added instead.
    """
    fitness_value = 0
    for songID, score in user_subset.items():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            fitness_value += _LOG_FLOOR
        else:
            fitness_value += log(score * sim)
    return fitness_value


def users_likes_subset(users, rating_threshold=2):
    """Keep, per user, only the songs rated strictly above the threshold.

    users: {userID: {songID: score}}; returns the same structure filtered.
    """
    users_subset = {}
    for userID, songs in users.items():
        users_subset[userID] = {
            songID: score
            for songID, score in songs.items()
            if score > rating_threshold
        }
    return users_subset


def eda_train(users_subset, max_gen=250):
    """Train per-user profiles with an Estimation of Distribution Algorithm.

    Each generation: score every profile with Fitness(), keep the best half
    (truncation selection), fit a per-feature Gaussian to the survivors and
    sample the next population from it.

    Returns (profile_u, D, fitnesses):
      profile_u  -- {userID: feature list} final population
      D          -- array of the last generation's selected individuals
      fitnesses  -- (max_gen, population) array of fitness values
    """
    num_features = len(next(iter(song_library.values())))
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Initial population: one uniform-random profile per user.
    # Fixed seed keeps runs reproducible.
    np.random.seed(12345)
    M = np.random.uniform(0, 1, (population_size, num_features))
    profile_u = dict(zip(users_subset, M.tolist()))

    fitnesses = []
    for generation in range(max_gen):
        # Fitness of every individual in the current population.
        users_fitness = {
            userID: Fitness(profile_u[userID], users_subset[userID])
            for userID in profile_u
        }
        users_fitness_df = pd.DataFrame(
            list(users_fitness.items()),
            columns=["userID", "fitness"]
        )
        fitnesses.append(users_fitness_df.fitness.values.tolist())

        # Truncation selection: keep the best half.
        # DataFrame.sort(columns=...) was removed in pandas 0.20;
        # sort_values is the supported spelling in old and new pandas.
        users_fitness_df = users_fitness_df.sort_values(by='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        best_individuals = {
            userID: profile_u[userID]
            for userID in M_sel.set_index('userID')['fitness'].to_dict()
        }

        # Per-feature sample mean and (unbiased) standard deviation of the
        # selected individuals.
        D = np.array(list(best_individuals.values()))
        D_mu = np.mean(D, axis=0)
        D_sigma = np.std(D, axis=0, ddof=1)

        # Next generation sampled from the fitted Gaussian.
        M = np.random.normal(D_mu, D_sigma, (population_size, num_features))
        profile_u = dict(zip(users_subset, M.tolist()))

    return profile_u, D, np.array(fitnesses)


def cb_similarity(profileID, profile_data, test_data, N):
    """Top-N content-based recommendations for one user profile.

    profileID is kept for interface compatibility; the ranking depends only
    on profile_data.  Candidates are every song that appears anywhere in
    test_data.  Returns {songID: adjusted-cosine similarity} for the N
    highest-ranked songs.
    """
    # All distinct songs appearing in the test fold.
    songIDs = set()
    for info in test_data.values():
        songIDs.update(info)

    # Similarity of the profile to each candidate song.
    similarity = [
        (adj_cos_sim(profile_data, song_library[songID]), songID)
        for songID in songIDs
    ]

    # Top-N recommendation (ties broken deterministically by songID).
    similarity.sort(reverse=True)
    similarity = similarity[:N]

    return {songID: sim for sim, songID in similarity}


def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=2,
        EDA_treshold=0.5):
    """Precision, recall, F1 and accuracy of top-N recommendation.

    A test item is "relevant" when its rating exceeds rating_threshold and
    "retrieved" when it appears in that user's top-N list.  EDA_treshold is
    unused but kept for interface compatibility.
    Returns (precision, recall, F1, accuracy).
    """
    # Top-N recommendation list for every user profile.
    sim_matrix = {}
    for userID, features in profiles.items():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Confusion-matrix counts over every (user, song, rating) in the fold.
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.items():
        entries = sim_matrix[user]
        for song, rating in song_rating.items():
            relevant = rating > rating_threshold
            if song in entries:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            else:
                if relevant:
                    fn += 1
                else:
                    tn += 1

    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0
        recall = 0
        F1 = 0

    # Guard against an empty test fold (original divided unconditionally).
    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0

    return precision, recall, F1, accuracy
# Cross-validated evaluation: train EDA profiles on each training fold and
# score the top-N recommendations against the matching test fold.
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

for i in range(len(users_train)):
    start_time = time.time()
    profile_u, prob, fffitness = eda_train(users_likes_subset(users_train[i]))
    elapsed_time = time.time() - start_time
    print('Training execution time: %.3f seconds' % elapsed_time)

    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i], N=20)
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

# Report mean ± standard deviation across folds.  The original format
# string '%f3' printed the full-precision float followed by a literal '3';
# '%.3f' gives the intended three-decimal output.
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))