Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
view Code/content_based.py @ 47:b0186d4a4496 tip
Move 7Digital dataset to Downloads
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Sat, 09 Jul 2022 00:50:43 -0500 |
parents | ae650489d3a8 |
children |
line wrap: on
line source
# -*- coding: utf-8 -*- """ Created on Wed Aug 19 11:58:19 2015 @author: paulochiliguano """ import cPickle as pickle from math import sqrt import numpy as np import pandas as pd import time # Item-vector dictionary f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ genre_classification/genre_prob.pkl', 'rb') song_library = pickle.load(f) f.close() # Normalisation #test = [] #for k, v in song_library.iteritems(): # test.append(v) #test = np.array(test) #test_median = np.median(test, axis=0) #test_abs = abs(test - test_median) #test_asd = test_abs.sum(axis=0) / test.shape[0] #for k, v in song_library.iteritems(): # modified_standard_score = (np.array(v) - test_median) / test_asd # song_library[k] = modified_standard_score.tolist() # Load training and test data f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ cross_validation.pkl', 'rb') users_train, users_test = pickle.load(f) f.close() # Adjusted Cosine Similarity def adj_cos_sim(vector_i, vector_j): avrg_w_i = (float(sum(vector_i)) / len(vector_i)) avrg_w_j = (float(sum(vector_j)) / len(vector_j)) num = sum(map( lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j), vector_i, vector_j) ) dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i)) dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j)) return num / (sqrt(dem1) * sqrt(dem2)) def build_model_cb(train_data, k=30): a = [] for user, info in train_data.iteritems(): a.extend([i for i in info]) songIDs = list(set(a)) #other_songs = song_library.keys() similarity_matrix = {} for song in songIDs: similarities = [] for other in songIDs: if other != song: sim = adj_cos_sim(song_library[song], song_library[other]) similarities.append((sim, other)) similarities.sort(reverse=True) similarity_matrix[song] = similarities[0:k] return similarity_matrix #similarity_rows[song] = {t[1]: t[0] for t in similarities} def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10): candidate = pd.DataFrame() entries = song_rating.keys() for song, rating in song_rating.iteritems(): if rating > rating_threshold: sim = sim_matrix[song] list_a = [k for v, k in sim] raw = [v for v, k in sim] sim_norm = [float(i)/max(raw) for i in raw] the_dict = dict(zip(list_a, sim_norm)) for key in entries: if key in the_dict: del the_dict[key] candidate_aux = pd.DataFrame( the_dict.items(), columns=['song', 'similarity'] ) candidate = candidate.append(candidate_aux, ignore_index=True) #tuples = [(k,v) for k,v in the_dict.iteritems()] #candidate.extend(tuples) topN = candidate.groupby('song')['similarity'].sum() topN.sort(1, ascending=False) return list(topN.head(N).keys()) def evaluate_cb(topN, test_data, rating_threshold=2): tp = 0. fp = 0. fn = 0. tn = 0. for user, song_rating in test_data.iteritems(): entries = topN[user] for song, rating in song_rating.iteritems(): if song in entries: if rating > rating_threshold: tp += 1 elif rating <= rating_threshold: fp += 1 else: if rating > rating_threshold: fn += 1 elif rating <= rating_threshold: tn += 1 #print tp, fp, fn, tn if tp != 0: precision = tp / (tp + fp) recall = tp / (tp + fn) F1 = 2 * precision * recall / (precision + recall) else: precision = 0 recall = 0 F1 = 0 accuracy = (tp + tn) / (tp + fp + tn + fn) return precision, recall, F1, accuracy p = np.array([]) f = np.array([]) r = np.array([]) a = np.array([]) for i in range(len(users_train)): start_time = time.time() sim_matrix = build_model_cb(users_train[i]) topN = {} for user, song_rating in users_train[i].iteritems(): topN[user] = top_n(sim_matrix, user, song_rating, rating_threshold=2, N=20) elapsed_time = time.time() - start_time print 'Training execution time: %.3f seconds' % elapsed_time pi, ri, fi, ai = evaluate_cb(topN, users_test[i]) p = np.append(p, pi) r = np.append(r, ri) f = np.append(f, fi) a = np.append(a, ai) print "Precision = %f3 ± %f3" % (p.mean(), p.std()) print "Recall = %f3 ± %f3" % (r.mean(), r.std()) print "F1 = %f3 ± %f3" % (f.mean(), f.std()) print "Accuracy = %f3 ± %f3" % (a.mean(), a.std()) # set_C = {t[0]: t[1] for t in candidate} # for song in set_C: # sim = sim_matrix[song] # the_dict = {t[1]: t[0] for t in sim} # for key in entries: # if key in the_dict: # the_dict[key]