hybrid-music-recommender-using-content-based-and-social-information: Code/content

annotate Code/content_based.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads

author	Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date	Sat, 09 Jul 2022 00:50:43 -0500
parents	ae650489d3a8
children

rev	line source
p@25	1 # -*- coding: utf-8 -*-
p@25	2 """
p@25	3 Created on Wed Aug 19 11:58:19 2015
p@25	4
p@25	5 @author: paulochiliguano
p@25	6 """
p@25	7
p@25	8 import cPickle as pickle
p@25	9 from math import sqrt
p@25	10 import numpy as np
p@25	11 import pandas as pd
p@26	12 import time
p@25	13
# Item-vector dictionary: maps each song id to the vector that
# adj_cos_sim() compares (presumably genre probabilities, judging by the
# file name -- confirm against the script that writes genre_prob.pkl).
# NOTE(review): unpickling can execute arbitrary code; only load a file
# produced by a trusted step of this project.
# `with` guarantees the handle is closed even if pickle.load() raises.
with open(
    '/Users/paulochiliguano/Documents/msc-project/dataset/'
    'genre_classification/genre_prob.pkl',
    'rb'
) as f:
    song_library = pickle.load(f)
p@25	19
p@26	20 # Normalisation
p@26	21 #test = []
p@26	22 #for k, v in song_library.iteritems():
p@26	23 # test.append(v)
p@26	24 #test = np.array(test)
p@26	25 #test_median = np.median(test, axis=0)
p@26	26 #test_abs = abs(test - test_median)
p@26	27 #test_asd = test_abs.sum(axis=0) / test.shape[0]
p@26	28 #for k, v in song_library.iteritems():
p@26	29 # modified_standard_score = (np.array(v) - test_median) / test_asd
p@26	30 # song_library[k] = modified_standard_score.tolist()
p@26	31
# Load training and test data: the pickle holds a (train, test) pair that
# is indexed by fold number further down in this script.
# NOTE(review): unpickling can execute arbitrary code; only load a file
# produced by a trusted step of this project.
# `with` guarantees the handle is closed even if pickle.load() raises.
with open(
    '/Users/paulochiliguano/Documents/msc-project/dataset/'
    'cross_validation.pkl',
    'rb'
) as f:
    users_train, users_test = pickle.load(f)
p@25	37
p@25	38 # Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Return the mean-centred cosine similarity of two numeric vectors.

    Each vector is centred on its own arithmetic mean before the cosine is
    taken, so the result lies in [-1, 1] (up to rounding).

    Args:
        vector_i: Sequence of numbers.
        vector_j: Sequence of numbers with the same length as ``vector_i``.

    Returns:
        The similarity as a float.  ``0.0`` is returned when the similarity
        is undefined: both vectors are empty, or at least one of them has
        zero variance (all components equal).

    Raises:
        ValueError: If the two vectors differ in length.
    """
    size = len(vector_i)
    if size != len(vector_j):
        raise ValueError(
            'vectors must have the same length (%d != %d)'
            % (size, len(vector_j))
        )
    if size == 0:
        return 0.0

    avrg_w_i = float(sum(vector_i)) / size
    avrg_w_j = float(sum(vector_j)) / size

    num = sum(
        (w_i - avrg_w_i) * (w_j - avrg_w_j)
        for w_i, w_j in zip(vector_i, vector_j)
    )
    dem1 = sum((w_i - avrg_w_i) ** 2 for w_i in vector_i)
    dem2 = sum((w_j - avrg_w_j) ** 2 for w_j in vector_j)

    denominator = sqrt(dem1) * sqrt(dem2)
    if denominator == 0:
        # A constant vector has no direction once centred; dividing here
        # would raise ZeroDivisionError.
        return 0.0
    return num / denominator
p@25	50
def build_model_cb(train_data, k=30):
    """Build the item-item neighbourhood model for one training fold.

    Args:
        train_data: Mapping of user -> iterable of song ids (when the value
            is a dict, its keys are used).  Only the song ids are read.
        k: Maximum number of neighbours kept per song.

    Returns:
        Dict mapping every song id found in ``train_data`` to a list of
        ``(similarity, other_song_id)`` tuples, ordered from most to least
        similar and truncated to ``k`` entries.

    Song vectors are looked up in the module-level ``song_library``.
    """
    # Gather the distinct songs that appear anywhere in the fold.
    seen = set()
    for listened in train_data.values():
        seen.update(listened)
    songIDs = list(seen)

    similarity_matrix = {}
    for song in songIDs:
        # Score the song against every other song, best match first.
        ranked = sorted(
            (
                (adj_cos_sim(song_library[song], song_library[other]), other)
                for other in songIDs
                if other != song
            ),
            reverse=True
        )
        similarity_matrix[song] = ranked[:k]

    return similarity_matrix
p@25	69 #similarity_rows[song] = {t[1]: t[0] for t in similarities}
p@25	70
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Recommend up to ``N`` songs the user has not rated yet.

    For every song the user rated above ``rating_threshold``, the song's
    neighbours are taken from ``sim_matrix``, their similarities are scaled
    by the largest similarity in that neighbour list, and the scaled values
    are summed per candidate song.  Songs the user already rated are never
    recommended.

    Args:
        sim_matrix: Mapping song id -> list of ``(similarity, song_id)``
            tuples, as produced by ``build_model_cb``.
        user: Unused; kept so existing callers keep working.
        song_rating: Mapping song id -> rating for one user.
        rating_threshold: Only songs rated strictly above this value
            contribute neighbours.
        N: Maximum number of recommendations.

    Returns:
        List of song ids ordered by descending accumulated score; empty
        when no candidate exists.

    Raises:
        KeyError: If a song rated above the threshold is missing from
            ``sim_matrix``.
    """
    scores = {}
    for song, rating in song_rating.items():
        if not rating > rating_threshold:
            continue

        neighbours = sim_matrix[song]
        if not neighbours:
            continue

        peak = max(sim for sim, _ in neighbours)
        if peak <= 0:
            # Scaling by a zero peak divides by zero, and a negative peak
            # would flip the ranking; such a list has no positively
            # similar neighbour to recommend anyway.
            continue

        for sim, other in neighbours:
            if other in song_rating:
                continue  # already rated by this user
            scores[other] = scores.get(other, 0.0) + float(sim) / peak

    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:N]
p@25	95
def evaluate_cb(topN, test_data, rating_threshold=2):
    """Score recommendation lists against held-out ratings.

    Every (user, song) pair in ``test_data`` is classified as:

    * true positive  - recommended and rated above the threshold
    * false positive - recommended and rated at or below the threshold
    * false negative - not recommended and rated above the threshold
    * true negative  - not recommended and rated at or below the threshold

    Args:
        topN: Mapping user -> iterable of recommended song ids.
        test_data: Mapping user -> mapping song id -> rating.
        rating_threshold: Ratings strictly above this value count as
            relevant.

    Returns:
        Tuple ``(precision, recall, F1, accuracy)`` of floats.  Precision,
        recall and F1 are 0.0 when there is no true positive; accuracy is
        0.0 when ``test_data`` holds no rating at all.

    Raises:
        KeyError: If a user of ``test_data`` has no entry in ``topN``.
    """
    tp = fp = fn = tn = 0.0
    for user, song_rating in test_data.items():
        # A set makes the membership test O(1) per song.
        recommended = set(topN[user])
        for song, rating in song_rating.items():
            relevant = rating > rating_threshold
            if song in recommended:
                if relevant:
                    tp += 1
                else:
                    fp += 1
            elif relevant:
                fn += 1
            else:
                tn += 1

    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = recall = F1 = 0.0

    total = tp + fp + tn + fn
    # Guard against an empty test set, which would divide by zero.
    accuracy = (tp + tn) / total if total else 0.0

    return precision, recall, F1, accuracy
p@26	128
# Cross-validation driver: for every fold, build the similarity model from
# the training users, produce a top-20 list per user, and score the lists
# against the matching test fold.  One value per fold is collected in each
# of the arrays below (precision, F1, recall, accuracy).
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])

# Folds are addressed by index because both containers are indexed by fold
# number elsewhere in this script.
for i in range(len(users_train)):

    start_time = time.time()
    sim_matrix = build_model_cb(users_train[i])

    topN = {}
    for user, song_rating in users_train[i].items():
        topN[user] = top_n(
            sim_matrix, user, song_rating, rating_threshold=2, N=20
        )
    elapsed_time = time.time() - start_time
    print('Training execution time: %.3f seconds' % elapsed_time)

    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])

    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

# Mean and standard deviation over the folds.  The format is "%.3f" (three
# decimals); the former "%f3" printed six decimals followed by a stray "3".
print("Precision = %.3f ± %.3f" % (p.mean(), p.std()))
print("Recall = %.3f ± %.3f" % (r.mean(), r.std()))
print("F1 = %.3f ± %.3f" % (f.mean(), f.std()))
print("Accuracy = %.3f ± %.3f" % (a.mean(), a.std()))
p@25	156
p@25	157 # set_C = {t[0]: t[1] for t in candidate}
p@25	158 # for song in set_C:
p@25	159 # sim = sim_matrix[song]
p@25	160 # the_dict = {t[1]: t[0] for t in sim}
p@25	161 # for key in entries:
p@25	162 # if key in the_dict:
p@25	163 # the_dict[key]

Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information

annotate Code/content_based.py @ 47:b0186d4a4496 tip