annotate Code/content_based.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents ae650489d3a8
children
rev   line source
p@25 1 # -*- coding: utf-8 -*-
p@25 2 """
p@25 3 Created on Wed Aug 19 11:58:19 2015
p@25 4
p@25 5 @author: paulochiliguano
p@25 6 """
p@25 7
p@25 8 import cPickle as pickle
p@25 9 from math import sqrt
p@25 10 import numpy as np
p@25 11 import pandas as pd
p@26 12 import time
p@25 13
p@25 14 # Item-vector dictionary
# Load the item-vector dictionary: song_library maps a song ID to the
# vector that adj_cos_sim compares (see build_model_cb).
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point this at a pickle you produced yourself.
# The with-block closes the handle even if unpickling raises.
with open(
    '/Users/paulochiliguano/Documents/msc-project/dataset/'
    'genre_classification/genre_prob.pkl',
    'rb'
) as f:
    song_library = pickle.load(f)
p@25 19
p@26 20 # Normalisation
p@26 21 #test = []
p@26 22 #for k, v in song_library.iteritems():
p@26 23 # test.append(v)
p@26 24 #test = np.array(test)
p@26 25 #test_median = np.median(test, axis=0)
p@26 26 #test_abs = abs(test - test_median)
p@26 27 #test_asd = test_abs.sum(axis=0) / test.shape[0]
p@26 28 #for k, v in song_library.iteritems():
p@26 29 # modified_standard_score = (np.array(v) - test_median) / test_asd
p@26 30 # song_library[k] = modified_standard_score.tolist()
p@26 31
p@25 32 # Load training and test data
# Load the cross-validation split. The pickle holds a pair that is unpacked
# into users_train and users_test; both are indexed by the same fold number
# in the evaluation loop at the bottom of the file.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point this at a pickle you produced yourself.
# The with-block closes the handle even if unpickling raises.
with open(
    '/Users/paulochiliguano/Documents/msc-project/dataset/'
    'cross_validation.pkl',
    'rb'
) as f:
    users_train, users_test = pickle.load(f)
p@25 37
p@25 38 # Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Return the mean-centred cosine similarity of two weight vectors.

    Each vector is centred on its own arithmetic mean before the cosine of
    the angle between them is computed (numerically this is the Pearson
    correlation of the two vectors).

    Args:
        vector_i: Sequence of numeric weights for the first item.
        vector_j: Sequence of numeric weights for the second item; must
            have the same length as ``vector_i``.

    Returns:
        The similarity as a float. ``0.0`` is returned when the similarity
        is undefined, i.e. when the vectors are empty or when either of
        them is constant (zero variance), instead of dividing by zero.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(vector_i) != len(vector_j):
        raise ValueError(
            'vectors must have the same length (%d != %d)'
            % (len(vector_i), len(vector_j))
        )
    if len(vector_i) == 0:
        return 0.0

    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)

    # Centre each vector once and reuse it for numerator and denominator.
    centred_i = [w_i - avrg_w_i for w_i in vector_i]
    centred_j = [w_j - avrg_w_j for w_j in vector_j]

    num = sum(c_i * c_j for c_i, c_j in zip(centred_i, centred_j))
    dem1 = sum(c_i ** 2 for c_i in centred_i)
    dem2 = sum(c_j ** 2 for c_j in centred_j)

    denominator = sqrt(dem1) * sqrt(dem2)
    if denominator == 0:
        # At least one vector has no variance, so no direction to compare.
        return 0.0
    return num / denominator
p@25 50
def build_model_cb(train_data, k=30):
    """Build the item-item neighbourhood model for one training fold.

    Args:
        train_data: Mapping whose values, when iterated, yield song IDs
            (the keys of the mapping are not used). Every song ID must be
            a key of the module-level ``song_library``.
        k: Number of most similar songs kept for each song.

    Returns:
        A dict mapping each song ID found in ``train_data`` to a list of at
        most ``k`` ``(similarity, other_song_id)`` tuples, sorted from the
        most to the least similar. A song is never listed as its own
        neighbour.
    """
    # Every distinct song that appears anywhere in the training data.
    catalogue = list({
        song_id
        for profile in train_data.values()
        for song_id in profile
    })

    model = {}
    for target in catalogue:
        scored = [
            (adj_cos_sim(song_library[target], song_library[candidate]),
             candidate)
            for candidate in catalogue
            if candidate != target
        ]
        # Tuples sort by similarity first, then by song ID.
        model[target] = sorted(scored, reverse=True)[:k]

    return model
p@25 69 #similarity_rows[song] = {t[1]: t[0] for t in similarities}
p@25 70
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Recommend up to ``N`` unseen songs for one user.

    For every song the user rated strictly above ``rating_threshold``, the
    similarities of its neighbours are divided by the largest similarity in
    that neighbour list. The normalised values are then summed per
    candidate song across all such liked songs, and the candidates with the
    highest totals are returned. Songs the user has already rated are never
    recommended.

    Args:
        sim_matrix: Dict mapping a song ID to a list of
            ``(similarity, other_song_id)`` tuples, as produced by
            ``build_model_cb``.
        user: Unused; kept so existing callers keep working.
        song_rating: Dict mapping the user's rated song IDs to ratings.
        rating_threshold: A song counts as liked when its rating is
            strictly greater than this value.
        N: Maximum number of recommendations to return.

    Returns:
        A list of at most ``N`` song IDs ordered by decreasing accumulated
        similarity; ties are broken by song ID so the result is
        deterministic. The list is empty when the user has no liked songs
        or no usable neighbours.

    Raises:
        KeyError: If a liked song has no entry in ``sim_matrix``.
    """
    scores = {}
    for song, rating in song_rating.items():
        if rating <= rating_threshold:
            continue

        neighbours = sim_matrix[song]
        if not neighbours:
            continue

        # Computed once per liked song rather than once per neighbour.
        peak = max(value for value, _ in neighbours)
        if peak <= 0:
            # Dividing by zero would crash and dividing by a negative peak
            # would reverse the ranking; a song without any positively
            # similar neighbour contributes no candidates.
            continue

        for value, other in neighbours:
            if other in song_rating:
                continue
            scores[other] = scores.get(other, 0.0) + float(value) / peak

    ranked = sorted(scores.items(), key=lambda pair: (-pair[1], pair[0]))
    return [song for song, _ in ranked[:N]]
p@25 95
def evaluate_cb(topN, test_data, rating_threshold=2):
    """Score recommendation lists against held-out ratings.

    Every (user, song) rating in ``test_data`` is classified as:
      * true positive  - recommended and rated above the threshold,
      * false positive - recommended and rated at or below the threshold,
      * false negative - not recommended but rated above the threshold,
      * true negative  - not recommended and rated at or below it.

    Args:
        topN: Dict mapping a user to the song IDs recommended to that user.
            A user missing from this dict is treated as having received no
            recommendations.
        test_data: Dict mapping a user to a dict of song ID -> rating.
        rating_threshold: A rating strictly greater than this value marks
            the song as relevant.

    Returns:
        A tuple ``(precision, recall, F1, accuracy)`` of floats. Precision,
        recall and F1 are ``0.0`` when there are no true positives;
        accuracy is ``0.0`` when ``test_data`` contains no ratings.
    """
    tp = fp = fn = tn = 0
    for user, song_rating in test_data.items():
        # Set membership keeps the inner loop O(1) per rated song.
        recommended = set(topN.get(user, ()))
        for song, rating in song_rating.items():
            relevant = rating > rating_threshold
            if song in recommended:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            elif relevant:
                fn += 1
            else:
                tn += 1

    if tp:
        # float() keeps true division under Python 2 as well.
        precision = float(tp) / (tp + fp)
        recall = float(tp) / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0.0
        recall = 0.0
        F1 = 0.0

    total = tp + fp + tn + fn
    accuracy = float(tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy
p@26 128
# Cross-validated evaluation of the content-based recommender.
# One score per fold is collected, then mean and standard deviation are
# reported across folds.
precisions = []
recalls = []
f1_scores = []
accuracies = []

# users_train and users_test are indexed in parallel: entry i of each is
# used together as one fold.
for i in range(len(users_train)):

    # "Training" time covers building the similarity model and producing
    # every user's top-N list for this fold.
    start_time = time.time()
    sim_matrix = build_model_cb(users_train[i])

    topN = {}
    for user, song_rating in users_train[i].items():
        topN[user] = top_n(
            sim_matrix,
            user,
            song_rating,
            rating_threshold=2,
            N=20
        )
    elapsed_time = time.time() - start_time
    print('Training execution time: %.3f seconds' % elapsed_time)

    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])

    precisions.append(pi)
    recalls.append(ri)
    f1_scores.append(fi)
    accuracies.append(ai)

p = np.array(precisions)
r = np.array(recalls)
f = np.array(f1_scores)
a = np.array(accuracies)

# "%.3f" prints three decimals; the previous "%f3" printed six decimals
# followed by a stray literal "3".
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))
p@25 156
p@25 157 # set_C = {t[0]: t[1] for t in candidate}
p@25 158 # for song in set_C:
p@25 159 # sim = sim_matrix[song]
p@25 160 # the_dict = {t[1]: t[0] for t in sim}
p@25 161 # for key in entries:
p@25 162 # if key in the_dict:
p@25 163 # the_dict[key]