Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
diff Code/content_based.py @ 25:fafc0b249a73
Final code
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
---|---|
date | Sun, 23 Aug 2015 16:47:54 +0100 |
parents | |
children | e4bcfe00abf4 |
line wrap: on
line diff
# -*- coding: utf-8 -*-
"""Content-based music recommender evaluation.

Builds an item-item similarity model from per-song genre-probability
vectors, generates top-N recommendations for every user in each
cross-validation fold, and prints precision / recall / F1 / accuracy
(mean ± std across folds).

Created on Wed Aug 19 11:58:19 2015

@author: paulochiliguano
(Ported from Python 2: cPickle/file()/print statements/iteritems removed.)
"""

import pickle
from math import sqrt

import numpy as np

# Pickled inputs (hardcoded experiment paths from the original script).
ITEM_VECTOR_PATH = ('/Users/paulochiliguano/Documents/msc-project/dataset/'
                    'genre_classification/genre_prob.pkl')
CROSS_VALIDATION_PATH = ('/Users/paulochiliguano/Documents/msc-project/'
                         'dataset/cross_validation.pkl')

# song id -> genre-probability vector; populated by main().  Kept as a
# module global because nearest_neighbours() reads it directly.
song_library = {}


def adj_cos_sim(vector_i, vector_j):
    """Adjusted cosine similarity between two equal-length vectors.

    Each vector is centred on its own mean before taking the cosine, so
    the result lies in [-1, 1].

    Fix: returns 0.0 when either vector is constant (zero variance);
    the original raised ZeroDivisionError.
    """
    mean_i = sum(vector_i) / float(len(vector_i))
    mean_j = sum(vector_j) / float(len(vector_j))
    num = sum((w_i - mean_i) * (w_j - mean_j)
              for w_i, w_j in zip(vector_i, vector_j))
    den = (sqrt(sum((w - mean_i) ** 2 for w in vector_i)) *
           sqrt(sum((w - mean_j) ** 2 for w in vector_j)))
    if den == 0:
        return 0.0
    return num / den


def computeNearestNeighbor(itemName, itemVector, items):
    """Create a sorted list of items based on their similarity to item.

    Returns (similarity, name) tuples for every item other than
    *itemName*, most similar first.
    """
    similarities = [(adj_cos_sim(itemVector, vector), other)
                    for other, vector in items.items()
                    if other != itemName]
    # Sort by similarity — closest first (ties broken by name, as in the
    # original tuple sort).
    similarities.sort(reverse=True)
    return similarities


def nearest_neighbours(song, train_songs, N):
    """Rank *train_songs* by similarity to *song*.

    Looks vectors up in the module-level ``song_library``.  *N* is
    accepted for interface compatibility but not applied — the
    truncation was commented out in the original.
    """
    similarities = [(adj_cos_sim(song_library[song], song_library[k]), k)
                    for k in train_songs]
    similarities.sort(reverse=True)
    return similarities


def build_model_cb(song_library, k=30):
    """Build the item-item model.

    For each song, compute adjusted-cosine similarity to every other
    song and keep the *k* most similar as (similarity, song) tuples,
    best first.  Returns {song: [(sim, other), ...]}.
    """
    similarity_matrix = {}
    for song, vector in song_library.items():
        similarities = [(adj_cos_sim(vector, other_vector), other)
                        for other, other_vector in song_library.items()
                        if other != song]
        similarities.sort(reverse=True)
        similarity_matrix[song] = similarities[:k]
    return similarity_matrix


def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Top-N recommendations for one user.

    For every song rated above *rating_threshold*, take its precomputed
    neighbours, normalise their similarities by the neighbourhood
    maximum, drop songs the user already rated, and sum the normalised
    scores per candidate.  Returns up to N song ids, best score first.
    *user* is unused but kept for interface compatibility.

    Fixes: the original crashed (pandas KeyError on an empty frame)
    when no rating exceeded the threshold — now returns [].  The
    removed ``DataFrame.append``/``Series.sort`` APIs are replaced by a
    plain score accumulator.
    """
    already_rated = set(song_rating)
    scores = {}
    for song, rating in song_rating.items():
        if rating <= rating_threshold:
            continue
        neighbours = sim_matrix[song]
        # Per-song normalisation by the best similarity in the list,
        # matching the original float(i) / max(raw).
        best = max(sim for sim, _ in neighbours)
        for sim, candidate in neighbours:
            if candidate in already_rated:
                continue
            scores[candidate] = scores.get(candidate, 0.0) + sim / best
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [song for song, _ in ranked[:N]]


def evaluate_cb(topN, test_data, rating_threshold=3):
    """Confusion-matrix evaluation against held-out ratings.

    A test song is "relevant" when rated above *rating_threshold* and
    "retrieved" when it appears in the user's recommendation list.
    Prints the raw tp/fp/fn/tn counts and returns
    (precision, recall, F1, accuracy).

    Fix: degenerate denominators (e.g. nothing retrieved) yield 0.0
    instead of the original ZeroDivisionError.
    """
    tp = fp = fn = tn = 0.0
    for user, song_rating in test_data.items():
        recommended = topN[user]
        for song, rating in song_rating.items():
            relevant = rating > rating_threshold
            if song in recommended:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            else:
                if relevant:
                    fn += 1
                else:
                    tn += 1
    print(tp, fp, fn, tn)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0
    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    return precision, recall, F1, accuracy


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def main():
    """Run the cross-validated evaluation and print summary statistics."""
    global song_library
    # Item-vector dictionary.
    song_library = _load_pickle(ITEM_VECTOR_PATH)
    # Training and test folds (parallel lists of per-user rating dicts).
    users_train, users_test = _load_pickle(CROSS_VALIDATION_PATH)

    sim_matrix = build_model_cb(song_library, 30)
    p, r, f1, a = [], [], [], []
    for train_fold, test_fold in zip(users_train, users_test):
        topN = {user: top_n(sim_matrix, user, ratings)
                for user, ratings in train_fold.items()}
        pi, ri, fi, ai = evaluate_cb(topN, test_fold)
        p.append(pi)
        r.append(ri)
        f1.append(fi)
        a.append(ai)

    p, r, f1, a = (np.array(x) for x in (p, r, f1, a))
    # Fix: the original "%f3" was a broken format string (printed the
    # full float followed by a literal '3'); "%.3f" was clearly meant.
    print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
    print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
    print("F1 = %.3f ± %.3f" % (f1.mean(), f1.std()))
    print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))


if __name__ == "__main__":
    main()