Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
diff Code/eda_discrete.py @ 25:fafc0b249a73
Final code
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
---|---|
date | Sun, 23 Aug 2015 16:47:54 +0100 |
parents | |
children | e4bcfe00abf4 |
line wrap: on
line diff
# -*- coding: utf-8 -*-
"""Hybrid music recommender: EDA-learned user profiles over genre vectors.

Learns one profile vector per user with an Estimation of Distribution
Algorithm (EDA), then evaluates content-based recommendation against the
held-out test fold of each cross-validation split.

Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from collections import Counter
from math import log, sqrt
import pickle

import numpy as np
import pandas as pd

# Item-vector dictionary: songID -> per-genre probability vector.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Cross-validation folds: parallel lists of {userID: {songID: rating}} dicts.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)


def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors."""
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    length_x = sqrt(sum(x * x for x in vector1))
    length_y = sqrt(sum(y * y for y in vector2))
    return dot_product / (length_x * length_y)


def adj_cos_sim(vector_i, vector_j):
    """Return the adjusted (mean-centred) cosine similarity of two vectors."""
    mean_i = float(sum(vector_i)) / len(vector_i)
    mean_j = float(sum(vector_j)) / len(vector_j)
    num = sum((w_i - mean_i) * (w_j - mean_j)
              for w_i, w_j in zip(vector_i, vector_j))
    dem1 = sum((w_i - mean_i) ** 2 for w_i in vector_i)
    dem2 = sum((w_j - mean_j) ** 2 for w_j in vector_j)
    return num / (sqrt(dem1) * sqrt(dem2))


def Fitness(profile_u, user_subset):
    """Log-likelihood-style fitness of candidate profile ``profile_u``.

    Sums log(score * sim) over the user's liked songs.  A non-positive
    similarity contributes -708 (~ log(sys.float_info.min)) instead of
    raising on log(0), so one orthogonal song cannot blow up the sum.
    """
    fitness_value = 0
    for songID, score in user_subset.items():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            fitness_value += -708  # ~ math.log(sys.float_info.min)
        else:
            fitness_value += log(score * sim)
    return fitness_value


def users_likes_subset(users, rating_threshold=3):
    """Return {userID: {songID: score}} keeping only ratings > threshold."""
    return {
        userID: {songID: score
                 for songID, score in songs.items()
                 if score > rating_threshold}
        for userID, songs in users.items()
    }


def eda_train(users_subset, max_gen=1000):
    """Learn one profile vector per user with a discrete EDA.

    Each individual is a user profile assembled from ``num_features`` genes
    drawn from the pool Ku of (genre index, weight) pairs.  Every generation
    the best half of the population (by ``Fitness``) re-estimates the gene
    distribution, which is then sampled to produce the next population.

    Returns ``(profile_u, p)``: the final {userID: feature vector} population
    and the final sampling distribution over Ku.
    """
    num_features = len(next(iter(song_library.values())))
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Gene pool Ku: every (genre index, weight) combination.  Genre indices
    # correspond to blues, classical, country, disco, hiphop, jazz, metal,
    # pop, reggae, rock — assumed to match the profile vector length.
    weights = list(np.linspace(0.1, 0.9))  # 50 evenly spaced weights
    tags = list(range(num_features))
    list_a = np.tile(weights, num_features)
    list_b = np.repeat(tags, len(weights))
    Ku = list(zip(list_b, list_a))
    Ku_np = np.array(Ku, dtype='int, float')

    np.random.seed(12345)  # reproducible runs

    def _sample_population(p=None):
        """Sample num_features genes per user (uniform when p is None)."""
        profile_u = {}
        profile_aux = {}
        for userID in users_subset:
            # BUG FIX: the original called np.random.choice(Ku_np,
            # num_features, p), which passes p positionally as the *replace*
            # argument — the learned distribution was silently ignored.
            genes = np.random.choice(Ku_np, num_features, p=p).tolist()
            by_tag = {t[0]: t[1] for t in genes}  # duplicate genres collapse
            feature_v = [0.0] * num_features
            for tag, weight in by_tag.items():
                feature_v[tag] = weight
            profile_u[userID] = feature_v
            profile_aux[userID] = list(by_tag.items())
        return profile_u, profile_aux

    # Generate initial population uniformly over Ku.
    profile_u, profile_aux = _sample_population()
    p = None

    for _generation in range(max_gen):
        # Fitness of every individual in the current population.
        users_fitness = {
            userID: Fitness(profile_u[userID], users_subset[userID])
            for userID in profile_u
        }
        users_fitness_df = pd.DataFrame(
            list(users_fitness.items()),
            columns=["userID", "fitness"],
        )

        # Select the best half.  DataFrame.sort was removed from pandas;
        # sort_values is the supported replacement.
        users_fitness_df = users_fitness_df.sort_values(by='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        selected_ids = M_sel.set_index('userID')['fitness'].to_dict()

        # Pool the genes of the selected individuals.
        Xs = []
        for userID in selected_ids:
            Xs.extend(profile_aux[userID])

        # Update the probability model.  Counter is O(n) where the original
        # per-gene list.count loop was O(|Ku|*|Xs|); the counts are also
        # normalised to a proper distribution — the original divided by
        # fraction_of_population, which does not sum to 1 and would be
        # rejected by np.random.choice(p=...).
        gene_counts = Counter(Xs)
        counts = np.array([gene_counts[gene] for gene in Ku], dtype=float)
        p = counts / counts.sum()

        # Sample the next population from the updated model.
        profile_u, profile_aux = _sample_population(p)

    return profile_u, p


def cb_similarity(profileID, profile_data, test_data, N):
    """Content-based similarity row: {songID: adjusted cosine similarity}.

    Scores every song the user holds in the test fold against the learned
    profile.  ``N`` is kept for interface compatibility; top-N truncation
    is disabled, as in the original.
    """
    return {
        songID: adj_cos_sim(profile_data, song_library[songID])
        for songID in test_data[profileID]
    }


def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=3,
        EDA_treshold=0.5):
    """Evaluate learned profiles on a test fold.

    A song counts as *relevant* when its held-out rating exceeds
    ``rating_threshold`` and as *recommended* when its profile similarity
    reaches ``EDA_treshold`` (parameter name kept, typo and all, for
    caller compatibility).

    Returns ``(precision, recall, F1, accuracy)``.
    """
    sim_matrix = {
        userID: cb_similarity(userID, features, test_data, N)
        for userID, features in profiles.items()
    }

    tp = fp = fn = tn = 0.0
    for userID, songID_sim in sim_matrix.items():
        for songID, sim_value in songID_sim.items():
            relevant = test_data[userID][songID] > rating_threshold
            recommended = sim_value >= EDA_treshold
            if relevant and recommended:
                tp += 1
            elif recommended:
                fp += 1
            elif relevant:
                fn += 1
            else:
                tn += 1

    # Guard the ratios: an empty confusion cell scores 0 instead of
    # raising ZeroDivisionError as the original did.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    F1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy


if __name__ == "__main__":
    precisions, recalls, f1_scores, accuracies = [], [], [], []

    for fold in range(len(users_train)):
        fold_profiles, _prob = eda_train(users_likes_subset(users_train[fold]))
        p_i, r_i, f1_i, a_i = evaluate_eda(fold_profiles, users_test[fold])
        precisions.append(p_i)
        recalls.append(r_i)
        f1_scores.append(f1_i)
        accuracies.append(a_i)

    # The original "%f3" format printed a literal '3'; %.3f was intended.
    print("Precision = %.3f ± %.3f" % (np.mean(precisions), np.std(precisions)))
    print("Recall = %.3f ± %.3f" % (np.mean(recalls), np.std(recalls)))
    print("F1 = %.3f ± %.3f" % (np.mean(f1_scores), np.std(f1_scores)))
    print("Accuracy = %.3f ± %.3f" % (np.mean(accuracies), np.std(accuracies)))