# -*- coding: utf-8 -*-
"""
Created on Wed Aug 19 11:58:19 2015

@author: paulochiliguano
"""

import cPickle as pickle
from math import sqrt
import numpy as np
import pandas as pd
import time

# Item-vector dictionary: maps each song ID to its genre-probability vector
f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
genre_classification/genre_prob.pkl', 'rb')
song_library = pickle.load(f)
f.close()

# Normalisation (modified standard score), currently disabled
#test = []
#for k, v in song_library.iteritems():
#    test.append(v)
#test = np.array(test)
#test_median = np.median(test, axis=0)
#test_abs = abs(test - test_median)
#test_asd = test_abs.sum(axis=0) / test.shape[0]
#for k, v in song_library.iteritems():
#    modified_standard_score = (np.array(v) - test_median) / test_asd
#    song_library[k] = modified_standard_score.tolist()

# Load training and test data (cross-validation folds)
f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'rb')
users_train, users_test = pickle.load(f)
f.close()


# Adjusted cosine similarity: cosine of the two mean-centred item vectors
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
    return num / (sqrt(dem1) * sqrt(dem2))


# Build the item-item similarity model: for every song seen in the training
# fold, keep the k most similar songs by adjusted cosine similarity
def build_model_cb(train_data, k=30):
    a = []
    for user, info in train_data.iteritems():
        a.extend([i for i in info])
    songIDs = list(set(a))
    #other_songs = song_library.keys()

    similarity_matrix = {}
    for song in songIDs:
        similarities = []
        for other in songIDs:
            if other != song:
                sim = adj_cos_sim(song_library[song], song_library[other])
                similarities.append((sim, other))
        similarities.sort(reverse=True)
        similarity_matrix[song] = similarities[0:k]

    return similarity_matrix
    #similarity_rows[song] = {t[1]: t[0] for t in similarities}


# Recommend the N songs most similar to the user's positively rated songs,
# excluding songs the user has already rated
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    candidate = pd.DataFrame()
    entries = song_rating.keys()
    for song, rating in song_rating.iteritems():
        if rating > rating_threshold:
            sim = sim_matrix[song]
            list_a = [k for v, k in sim]
            raw = [v for v, k in sim]
            sim_norm = [float(i) / max(raw) for i in raw]
            the_dict = dict(zip(list_a, sim_norm))
            # Discard candidates the user has already rated
            for key in entries:
                if key in the_dict:
                    del the_dict[key]
            candidate_aux = pd.DataFrame(
                the_dict.items(),
                columns=['song', 'similarity']
            )
            candidate = candidate.append(candidate_aux, ignore_index=True)
            #tuples = [(k,v) for k,v in the_dict.iteritems()]
            #candidate.extend(tuples)
    # Guard against users with no positively rated seed songs
    if candidate.empty:
        return []
    # Aggregate normalised similarities per candidate song and rank them
    topN = candidate.groupby('song')['similarity'].sum()
    topN.sort(ascending=False)

    return list(topN.head(N).keys())


# Compute precision, recall, F1 and accuracy of the top-N lists against the
# held-out test ratings
def evaluate_cb(topN, test_data, rating_threshold=2):
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.iteritems():
        entries = topN[user]
        for song, rating in song_rating.iteritems():
            if song in entries:
                if rating > rating_threshold:
                    tp += 1
                elif rating <= rating_threshold:
                    fp += 1
            else:
                if rating > rating_threshold:
                    fn += 1
                elif rating <= rating_threshold:
                    tn += 1
    #print tp, fp, fn, tn
    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0
        recall = 0
        F1 = 0

    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy


# Per-fold evaluation metrics
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

for i in range(len(users_train)):

    start_time = time.time()
    sim_matrix = build_model_cb(users_train[i])

    # Generate a top-20 recommendation list for every user in this fold
    topN = {}
    for user, song_rating in users_train[i].iteritems():
        topN[user] = top_n(
            sim_matrix, user, song_rating, rating_threshold=2, N=20)
    elapsed_time = time.time() - start_time
    print 'Training execution time: %.3f seconds' % elapsed_time

    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])

    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())

# set_C = {t[0]: t[1] for t in candidate}
# for song in set_C:
#     sim = sim_matrix[song]
#     the_dict = {t[1]: t[0] for t in sim}
#     for key in entries:
#         if key in the_dict:
#             the_dict[key]
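
# Minimal sanity check for adj_cos_sim, sketched on two hypothetical
# genre-probability vectors; the values below are made up for illustration
# and are not taken from the project's dataset. It only exercises two
# properties that follow from the formula: self-similarity of a non-constant
# vector is 1, and any similarity lies in [-1, 1] (Cauchy-Schwarz).
_toy_i = [0.6, 0.2, 0.1, 0.1]
_toy_j = [0.5, 0.3, 0.1, 0.1]
assert abs(adj_cos_sim(_toy_i, _toy_i) - 1.0) < 1e-9
assert -1.0 <= adj_cos_sim(_toy_i, _toy_j) <= 1.0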