comparison Code/content_based.py @ 25:fafc0b249a73

Final code

author    Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date      Sun, 23 Aug 2015 16:47:54 +0100
parents
children  e4bcfe00abf4
comparing 24:68a62ca32441 with 25:fafc0b249a73

# -*- coding: utf-8 -*-
"""
Content-based top-N music recommender: item-item similarity over genre
probability vectors, evaluated with cross-validation.

Created on Wed Aug 19 11:58:19 2015

@author: paulochiliguano
"""

import cPickle as pickle
from math import sqrt
import numpy as np
import pandas as pd
# Item-vector dictionary
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)

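# Expected pickle contents, inferred from how the objects are used
# below (an assumption; the formats are not documented in this file):
#   song_library: {song_id: genre-probability vector}, e.g.
#       {'some_song_id': [0.7, 0.1, 0.1, 0.1], ...}
#   users_train, users_test: parallel lists of cross-validation folds,
#   each fold a {user_id: {song_id: rating}} dict.
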
# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    """Mean-centred cosine similarity of two item vectors: each vector
    is centred on its own mean, which makes this the Pearson
    correlation of the two vectors. Undefined (zero denominator) if
    either vector is constant."""
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    den1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    den2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
    return num / (sqrt(den1) * sqrt(den2))

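# A minimal sanity check (hypothetical 4-genre vectors, illustrative
# values only): a vector compared with itself scores 1.0, and the
# measure is symmetric in its arguments.
_vec_a = [0.7, 0.1, 0.1, 0.1]
_vec_b = [0.6, 0.2, 0.1, 0.1]
assert abs(adj_cos_sim(_vec_a, _vec_a) - 1.0) < 1e-9
assert abs(adj_cos_sim(_vec_a, _vec_b) - adj_cos_sim(_vec_b, _vec_a)) < 1e-9
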
def computeNearestNeighbor(itemName, itemVector, items):
    """Creates a sorted list of items based on their distance to item.
    (Retained from an earlier revision; not called below.)"""
    similarities = []
    for otherItem in items:
        if otherItem != itemName:
            sim = adj_cos_sim(itemVector, items[otherItem])
            similarities.append((sim, otherItem))
    # sort based on distance -- closest first
    similarities.sort(reverse=True)
    #if len(similarities) > N:
    #    similarities = similarities[0:N]
    return similarities

def nearest_neighbours(song, train_songs, N):
    """Ranks every song in train_songs by similarity to song.
    (Retained from an earlier revision; not called below, and N is
    unused while the truncation stays commented out.)"""
    similarities = []
    for k in train_songs:
        sim = adj_cos_sim(song_library[song], song_library[k])
        similarities.append((sim, k))
    similarities.sort(reverse=True)
    #if len(similarities) > N:
    #    similarities = similarities[0:N]
    return similarities
    #return {t[1]: t[0] for t in similarities}

def build_model_cb(song_library, k=30):
    """Builds the item-item model: maps every song to its k most
    similar other songs as (similarity, song) tuples, best first."""
    other_songs = song_library.keys()
    similarity_matrix = {}
    for song in song_library:
        similarities = []
        for other in other_songs:
            if other != song:
                sim = adj_cos_sim(song_library[song], song_library[other])
                similarities.append((sim, other))
        similarities.sort(reverse=True)
        similarity_matrix[song] = similarities[0:k]
    return similarity_matrix
    #similarity_rows[song] = {t[1]: t[0] for t in similarities}

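# Sketch: the model on a tiny hypothetical library (illustrative data,
# not from the dataset). With k=2, every song keeps its two most
# similar neighbours as (similarity, song) tuples, best first.
_toy_library = {
    'song_a': [0.7, 0.1, 0.1, 0.1],
    'song_b': [0.6, 0.2, 0.1, 0.1],
    'song_c': [0.1, 0.1, 0.7, 0.1],
}
_toy_matrix = build_model_cb(_toy_library, k=2)
assert [s for _, s in _toy_matrix['song_a']] == ['song_b', 'song_c']
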
def top_n(sim_matrix, user, song_rating, rating_threshold=2, N=10):
    """Top-N recommendation for one user: accumulates the normalised
    similarity scores of the neighbours of every song the user rated
    above rating_threshold, excluding songs already in the profile."""
    candidate = pd.DataFrame()
    entries = song_rating.keys()
    for song, rating in song_rating.iteritems():
        if rating > rating_threshold:
            sim = sim_matrix[song]
            list_a = [k for v, k in sim]
            raw = [v for v, k in sim]
            # Normalise each row by its largest similarity score
            sim_norm = [float(i) / max(raw) for i in raw]
            the_dict = dict(zip(list_a, sim_norm))
            # Discard candidates the user has already rated
            for key in entries:
                if key in the_dict:
                    del the_dict[key]
            candidate_aux = pd.DataFrame(
                the_dict.items(),
                columns=['song', 'similarity']
            )
            candidate = candidate.append(candidate_aux, ignore_index=True)
            #tuples = [(k,v) for k,v in the_dict.iteritems()]
            #candidate.extend(tuples)
    topN = candidate.groupby('song')['similarity'].sum()
    # In-place sort by aggregated score, highest first (pre-0.17
    # pandas Series.sort)
    topN.sort(ascending=False)

    return list(topN.head(N).keys())

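# Sketch: a recommendation for one hypothetical profile, reusing the
# toy model built above. A rating of 5 for 'song_a' pulls in song_a's
# neighbours, ranked by their summed normalised scores.
_toy_recs = top_n(_toy_matrix, 'toy_user', {'song_a': 5}, N=2)
assert _toy_recs == ['song_b', 'song_c']
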
def evaluate_cb(topN, test_data, rating_threshold=3):
    """Confusion-matrix evaluation on held-out ratings: a recommended
    song is a true positive when its test rating exceeds
    rating_threshold."""
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.iteritems():
        entries = topN[user]
        for song, rating in song_rating.iteritems():
            if song in entries:
                if rating > rating_threshold:
                    tp += 1
                elif rating <= rating_threshold:
                    fp += 1
            else:
                if rating > rating_threshold:
                    fn += 1
                elif rating <= rating_threshold:
                    tn += 1
    print tp, fp, fn, tn
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy

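# Sketch of the evaluation on one hypothetical user (illustrative
# ratings only): 'song_a' is a hit (tp), 'song_b' a false positive,
# 'song_c' a miss (fn) and 'song_d' a true negative, so every metric
# works out to 0.5 with the default threshold of 3.
_toy_topN = {'u1': ['song_a', 'song_b']}
_toy_test = {'u1': {'song_a': 5, 'song_b': 2, 'song_c': 4, 'song_d': 1}}
assert evaluate_cb(_toy_topN, _toy_test) == (0.5, 0.5, 0.5, 0.5)
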
sim_matrix = build_model_cb(song_library, 30)
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])
# Evaluate on every cross-validation fold
for i in range(len(users_train)):
    topN = {}
    for user, song_rating in users_train[i].iteritems():
        topN[user] = top_n(sim_matrix, user, song_rating)
    pi, ri, fi, ai = evaluate_cb(topN, users_test[i])

    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)


# Report mean ± standard deviation across folds
print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())

# set_C = {t[0]: t[1] for t in candidate}
# for song in set_C:
#     sim = sim_matrix[song]
#     the_dict = {t[1]: t[0] for t in sim}
#     for key in entries:
#         if key in the_dict:
#             the_dict[key]