hybrid-music-recommender-using-content-based-and-social-information
view Code/eda.py @ 47:b0186d4a4496 tip
Move 7Digital dataset to Downloads
author    Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date      Sat, 09 Jul 2022 00:50:43 -0500
parents   ae650489d3a8
children  (none)
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle
import time

# Item-vector dictionary
f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
genre_classification/genre_prob.pkl', 'rb')
song_library = pickle.load(f)
f.close()

# Load training and test data
f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'rb')
users_train, users_test = pickle.load(f)
f.close()


# Cosine Similarity
def cosine_similarity(vector1, vector2):
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)


# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = (float(sum(vector_i)) / len(vector_i))
    avrg_w_j = (float(sum(vector_j)) / len(vector_j))
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
    return num / (sqrt(dem1) * sqrt(dem2))


# Fitness function for EDA
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        #print cosine_similarity(profile, song_library[songID])
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            fitness_value += -708  # math.log(sys.float_info.min)
        else:
            fitness_value += log(score * sim)
        #fitness_value += log(score * manhattan(profile, song_library[songID]))
        #fitness_value += score * cosine_similarity(profile, song_library[songID])
    return fitness_value


def users_likes_subset(users, rating_threshold=2):
    # Subset of most-liked items
    users_subset = {}
    for userID, songs in users.iteritems():
        scores_above_threshold = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold
        }
        users_subset[userID] = scores_above_threshold
        #for songID, score in songs.iteritems():
            #print score > 0
            #if score > 0:
                #print {userID: {songID: score}}
    #{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
    return users_subset


def eda_train(users_subset, max_gen=250):
    # TRAINING
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))
    # Generation of M individuals uniformly
    np.random.seed(12345)
    M = np.random.uniform(
        0,
        1,
        population_size * num_features
    )
    M.shape = (-1, num_features)
    profile_u = {}
    i = 0
    for userID in users_subset:
        profile_u[userID] = M.tolist()[i]
        i += 1
    fitnesses = []
    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"]
        )
        fitnesses.append(users_fitness_df.fitness.values.tolist())
        # Selection of best individuals based on fitness values
        best_individuals = {}
        users_fitness_df = users_fitness_df.sort(columns='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
        for userID in M_sel_dict:
            best_individuals[userID] = profile_u[userID]
        # Calculate sample mean and standard deviation
        D = np.array([])
        for userID, features in best_individuals.iteritems():
            D = np.append(D, features, axis=0)
        D.shape = (-1, num_features)
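        # Estimation-of-distribution step: the selected (best-fitness) profiles
        # define a per-feature Gaussian (sample mean and standard deviation),
        # from which the next generation of candidate user profiles is drawn.
        # Features are treated as independent, i.e. a univariate (UMDA-style)
        # Gaussian model rather than a full covariance estimate.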
        D_mu = np.mean(D, axis=0)
        D_sigma = np.std(D, axis=0, ddof=1)
        # Sample M individuals
        M = np.random.normal(
            D_mu,
            D_sigma,
            (population_size, num_features)
        )
        #M = 1 / (D_sigma * np.sqrt(2 * np.pi)) * np.exp(- (M_range - D_mu) ** 2 / (2 * D_sigma ** 2))
        #M.shape = (-1, len(items.values()[0]))
        #M = D_sigma * np.random.normal(
            #population_size,
            #len(items.values()[0])
        #) + D_mu
        profile_u = {}
        i = 0
        for userID in users_subset:
            profile_u[userID] = M.tolist()[i]
            i += 1
        generation += 1
    return profile_u, D, np.array(fitnesses)


# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    a = []
    for user, info in test_data.iteritems():
        a.extend([i for i in info])
    songIDs = list(set(a))
    '''Content-based: Similarity matrix'''
    similarity = []
    for songID in songIDs:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    '''Top-N recommendation'''
    similarity.sort(reverse=True)
    if len(similarity) > N:
        similarity = similarity[0:N]
    #sim_matrix[userID] = {t[1]: t[0] for t in similarity}
    return {t[1]: t[0] for t in similarity}


def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=2,
        EDA_threshold=0.5):
    '''Evaluation'''
    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)
    # Content-Based: Evaluation
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for user, song_rating in test_data.iteritems():
        entries = sim_matrix[user]
        for song, rating in song_rating.iteritems():
            if song in entries:
                if rating > rating_threshold:
                    tp += 1
                elif rating <= rating_threshold:
                    fp += 1
            else:
                if rating > rating_threshold:
                    fn += 1
                elif rating <= rating_threshold:
                    tn += 1
    #for userID, songID_sim in sim_matrix.iteritems():
        #for songID, sim_value in songID_sim.iteritems():
            #score = test_data[userID][songID]
            #if score > rating_threshold and sim_value >= EDA_threshold:
                #tp += 1
            #elif score <= rating_threshold and sim_value >= EDA_threshold:
                #fp += 1
            #elif score > rating_threshold and sim_value < EDA_threshold:
                #fn += 1
            #elif score <= rating_threshold and sim_value < EDA_threshold:
                #tn += 1
    #print tp, fp, fn, tn
    if tp != 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * precision * recall / (precision + recall)
    else:
        precision = 0
        recall = 0
        F1 = 0
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    return precision, recall, F1, accuracy


#keys_a = set(users[userID].keys())
#keys_b = set(test_data.keys())
#intersection = keys_a & keys_b
#if len(intersection) != 0:
    #similarity = {}
    #print {k: v for k, v in song_library_fold[0].iteritems() if k in songs}
    #for songID in intersection:
        #if songID == k:
            #similarity[songID] = adj_cos_sim(
                #profile[userID],
                #test_data[songID]
            #)
    #max_sim = max(similarity, key=similarity.get)
    #if max_sim >= EDA_threshold:
        #sim_matrix[userID] = {max_sim: similarity[max_sim]}
    #sim_matrix[userID] = similarity
#sim_matrix[userID] = {max_sim: similarity[max_sim]}
#print len(sim_matrix)

p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])
for i in range(len(users_train)):
    start_time = time.time()
    profile_u, prob, fffitness = eda_train(users_likes_subset(users_train[i]))
    elapsed_time = time.time() - start_time
    print 'Training execution time: %.3f seconds' % elapsed_time
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i], N=20)
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)

#precision = np.array(p)
#rec = np.array(r)
#F1 = np.array(f)
#accuracy = np.array(a)
print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())
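
# Assumed structure of the pickled inputs (inferred from the usage above,
# not documented in this file):
#   song_library:            {songID: genre-probability vector (list of floats)}
#   users_train, users_test: lists of cross-validation folds, where each fold
#                            is a dict {userID: {songID: rating}}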