hybrid-music-recommender-using-content-based-and-social-information
Code/eda.py @ 25:fafc0b249a73
Final code
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
---|---
date | Sun, 23 Aug 2015 16:47:54 +0100
parents | 68a62ca32441
children | e4bcfe00abf4
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle
#import random

# Item-vector dictionary
f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
genre_classification/genre_prob.pkl', 'rb')
song_library = pickle.load(f)
f.close()

# Load training and test data
f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'rb')
users_train, users_test = pickle.load(f)
f.close()


# Cosine Similarity
def cosine_similarity(vector1, vector2):
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)


# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j))
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
    return num / (sqrt(dem1) * sqrt(dem2))


# Fitness function for EDA
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        #print cosine_similarity(profile, song_library[songID])
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            fitness_value += -708  # ~ math.log(sys.float_info.min)
        else:
            fitness_value += log(score * sim)
        #fitness_value += log(score * manhattan(profile, song_library[songID]))
        #fitness_value += score * cosine_similarity(profile, song_library[songID])
    return fitness_value


def users_likes_subset(users, rating_threshold=3):
    # Subset of most-liked items
    users_subset = {}
    for userID, songs in users.iteritems():
        scores_above_threshold = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold}
        users_subset[userID] = scores_above_threshold
        #for songID, score in songs.iteritems():
        #    print score > 0
        #    if score > 0:
        #        print {userID: {songID: score}}
    #{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
    return users_subset


def eda_train(users_subset, max_gen=200):
    # TRAINING
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))
    # Generation of M individuals uniformly
    np.random.seed(12345)
    M = np.random.uniform(0, 1, population_size * num_features)
    M.shape = (-1, num_features)
    profile_u = {}
    i = 0
    for userID in users_subset:
        profile_u[userID] = M.tolist()[i]
        i += 1
    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID])
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"])
        # Selection of best individuals based on fitness values
        best_individuals = {}
        # pandas < 0.17 API; later versions use sort_values()
        users_fitness_df = users_fitness_df.sort(columns='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
        for userID in M_sel_dict:
            best_individuals[userID] = profile_u[userID]
        # Calculate sample mean and standard deviation
        D = np.array([])
        for userID, features in best_individuals.iteritems():
            D = np.append(D, features, axis=0)
        D.shape = (-1, num_features)
        D_mu = np.mean(D, axis=0)
        D_sigma = np.std(D, axis=0, ddof=1)
        # Sample M individuals
        M = np.random.normal(D_mu, D_sigma, (population_size, num_features))
        #M = 1 / (D_sigma * np.sqrt(2 * np.pi)) * \
        #    np.exp(- (M_range - D_mu) ** 2 / (2 * D_sigma ** 2))
        #M.shape = (-1, len(items.values()[0]))
        #M = D_sigma * np.random.normal(
        #    population_size,
        #    len(items.values()[0])
        #) + D_mu
        profile_u = {}
        i = 0
        for userID in users_subset:
            profile_u[userID] = M.tolist()[i]
            i += 1
        generation += 1
    return profile_u
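
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the core of one
# eda_train() generation -- truncation selection followed by Gaussian
# re-estimation -- applied to an anonymous population matrix. The helper
# name `eda_step` and the toy fitness below are hypothetical; numpy is
# assumed to be imported as np, as above.
def eda_step(M, fitness_fn, keep_fraction=0.5):
    """One EDA generation: keep the fittest rows, refit a normal, resample."""
    scores = np.apply_along_axis(fitness_fn, 1, M)
    n_keep = int(round(keep_fraction * len(M)))
    best = M[np.argsort(scores)[-n_keep:]]  # truncation selection
    mu = best.mean(axis=0)
    sigma = best.std(axis=0, ddof=1)        # per-feature sample std, as above
    return np.random.normal(mu, sigma, M.shape)

# Usage (commented out so this module's behaviour is unchanged):
#toy = np.random.uniform(0, 1, (10, 4))
#for _ in range(5):
#    toy = eda_step(toy, lambda v: -((v - 0.5) ** 2).sum())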
# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    '''Content-based: Similarity matrix'''
    similarity = []
    for songID in test_data[profileID]:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    # Top-N recommendation
    #similarity.sort(reverse=True)
    #if len(similarity) > N:
    #    similarity = similarity[0:N]
    #sim_matrix[userID] = {t[1]: t[0] for t in similarity}
    return {t[1]: t[0] for t in similarity}


def evaluate_eda(profiles, test_data, N=10, rating_threshold=3,
                 EDA_treshold=0.3):
    '''Evaluation'''
    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)
    # Content-Based: Evaluation
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for userID, songID_sim in sim_matrix.iteritems():
        for songID, sim_value in songID_sim.iteritems():
            score = test_data[userID][songID]
            if score > rating_threshold and sim_value >= EDA_treshold:
                tp += 1
            elif score <= rating_threshold and sim_value >= EDA_treshold:
                fp += 1
            elif score > rating_threshold and sim_value < EDA_treshold:
                fn += 1
            elif score <= rating_threshold and sim_value < EDA_treshold:
                tn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    return precision, recall, F1, accuracy

#keys_a = set(users[userID].keys())
#keys_b = set(test_data.keys())
#intersection = keys_a & keys_b
#if len(intersection) != 0:
#    similarity = {}
#    #print {k: v for k, v in song_library_fold[0].iteritems() if k in songs}
#    for songID in intersection:
#        if songID == k:
#            similarity[songID] = adj_cos_sim(
#                profile[userID],
#                test_data[songID])
#    max_sim = max(similarity, key=similarity.get)
#    if max_sim >= EDA_treshold:
#        sim_matrix[userID] = {max_sim: similarity[max_sim]}
#    sim_matrix[userID] = similarity
#    sim_matrix[userID] = {max_sim: similarity[max_sim]}
#print len(sim_matrix)

# Cross-validation over the training/test folds
p = np.array([])
f = np.array([])
r = np.array([])
a = np.array([])
for i in range(len(users_train)):
    profile_u = eda_train(users_likes_subset(users_train[i]))
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)
#precision = np.array(p)
#rec = np.array(r)
#F1 = np.array(f)
#accuracy = np.array(a)

print "Precision = %.3f ± %.3f" % (p.mean(), p.std())
print "Recall = %.3f ± %.3f" % (r.mean(), r.std())
print "F1 = %.3f ± %.3f" % (f.mean(), f.std())
print "Accuracy = %.3f ± %.3f" % (a.mean(), a.std())

'''# Collaborative-filtering: Similarity matrix
sim_matrix_cf = {}
count = 0
for userID_1 in profile:
    similarities = {}
    for userID_2 in profile:
        if userID_1 != userID_2:
            similarities[userID_2] = adj_cos_sim(
                profile[userID_1],
                profile[userID_2])
    #print similarities
    sim_matrix_cf[userID_1] = similarities'''

# Predicted rating
#for userID in users:
#    print np.array(users[userID].values()).mean()

'''scores_above_threshold = {
    songID: score
    for songID, score in songs.iteritems()
    if score > rating_threshold}'''

'''for key, value in sorted(
        similarity.iteritems(), key=lambda (k, v): (v, k), reverse=True):
    print "%s: %s" % (key, value)
    break'''
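
# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not called above): the same
# confusion-matrix arithmetic evaluate_eda() performs, isolated so the
# metric definitions can be checked by hand on toy counts.
def confusion_metrics(tp, fp, fn, tn):
    """Precision, recall, F1 and accuracy from raw confusion counts."""
    tp, fp, fn, tn = float(tp), float(fp), float(fn), float(tn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    return precision, recall, f1, accuracy

# Example: confusion_metrics(6, 2, 3, 9)
# -> precision 0.750, recall 0.667, F1 0.706, accuracy 0.750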
# Recommend new item
'''
def computeNearestNeighbor(itemName, itemVector, items):
    """creates a sorted list of items based on their distance to item"""
    distances = []
    for otherItem in items:
        if otherItem != itemName:
            distance = adj_cos_sim(itemVector, items[otherItem])
            distances.append((distance, otherItem))
    # sort based on distance -- closest first
    distances.sort(reverse=True)
    return distances


def classify(user, itemName, itemVector):
    """Classify the itemName based on user ratings
    Should really have items and users as parameters"""
    # first find nearest neighbor
    nearest = computeNearestNeighbor(itemName, itemVector, song_library)[0][1]
    rating = users[user][nearest]
    return rating
'''

# Source: guidetodatamining.com
'''def computeSimilarity(band1, band2, userRatings):
    averages = {}
    for (key, ratings) in userRatings.items():
        averages[key] = float(sum(ratings.values())) / len(ratings.values())
    num = 0   # numerator
    dem1 = 0  # first half of denominator
    dem2 = 0
    for (user, ratings) in userRatings.items():
        if band1 in ratings and band2 in ratings:
            avg = averages[user]
            num += (ratings[band1] - avg) * (ratings[band2] - avg)
            dem1 += (ratings[band1] - avg) ** 2
            dem2 += (ratings[band2] - avg) ** 2
    return num / (sqrt(dem1) * sqrt(dem2))'''

'''
sum_xy = 0
sum_x2 = 0
sum_y2 = 0
n = 0
for key in rating1:
    if key in rating2:
        n += 1
        x = rating1[key]
        y = rating2[key]
        sum_xy += x * y
if n == 0:
    return 0
# now compute denominator
for key in rating1:
    x = rating1[key]
    sum_x2 += pow(x, 2)
for key in rating2:
    y = rating2[key]
    sum_y2 += pow(y, 2)
denominator = sqrt(sum_x2) * sqrt(sum_y2)
if denominator == 0:
    return 0
else:
    return sum_xy / denominator'''

'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))


# Absolute Standard Deviation
def get_asd(lst, median):
    sum = 0
    for item in lst:
        sum += abs(item - median)
    return sum / len(lst)


# Normalisation rating with Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''

'''
# Pearson Correlation Coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
        sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator
'''
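
# ---------------------------------------------------------------------------
# Illustrative sketch: what the commented-out pearson() above computes on
# toy ratings (hypothetical data; uncomment pearson() before running).
#ratings_a = {'song1': 5, 'song2': 3, 'song3': 4}
#ratings_b = {'song1': 4, 'song2': 2, 'song3': 5}
#print pearson(ratings_a, ratings_b)  # ~0.65: moderate positive agreement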
'''
def buckets(filename, bucketName, separator, classColumn):
    """the original data is in the file named filename
    bucketName is the prefix for all the bucket names
    separator is the character that divides the columns
    (for ex., a tab or comma) and classColumn is the column
    that indicates the class"""
    # put the data in 10 buckets
    numberOfBuckets = 10
    data = {}
    # first read in the data and divide by category
    with open(filename) as f:
        lines = f.readlines()
    for line in lines:
        if separator != '\t':
            line = line.replace(separator, '\t')
        # first get the category
        category = line.split()[classColumn]
        data.setdefault(category, [])
        data[category].append(line)
    # initialize the buckets
    buckets = []
    for i in range(numberOfBuckets):
        buckets.append([])
    # now for each category put the data into the buckets
    for k in song_library.keys():
        # randomize order of instances for each class
        print random.shuffle(song_library[k])
        bNum = 0
        # divide into buckets
        for item in song_library[k]:
            buckets[bNum].append(item)
            bNum = (bNum + 1) % numberOfBuckets
    # write to file
    for bNum in range(numberOfBuckets):
        f = open("%s-%02i" % (bucketName, bNum + 1), 'w')
        for item in buckets[bNum]:
            f.write(item)
        f.close()
'''

'''# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Computes the Manhattan distance."""
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))'''
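
# ---------------------------------------------------------------------------
# Illustrative sketch: turning one trained profile into a Top-N list with
# cb_similarity(). `fold`, `some_user` and `top_n` are hypothetical names;
# commented out because eda_train() above is expensive to run casually.
#fold = 0
#profiles = eda_train(users_likes_subset(users_train[fold]))
#some_user = profiles.keys()[0]
#sims = cb_similarity(some_user, profiles[some_user], users_test[fold], N=10)
#top_n = sorted(sims.items(), key=lambda (s, v): v, reverse=True)[:10]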