comparison Code/eda.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents 68a62ca32441
children e4bcfe00abf4
comparison
equal deleted inserted replaced
24:68a62ca32441 25:fafc0b249a73
@@ -4 +4 @@
 
 @author: paulochiliguano
 """
 
 
-from math import sqrt, log10
+from math import log, sqrt
 import numpy as np
 import pandas as pd
-from sklearn import mixture
-
-#Fine tuning
-
+import cPickle as pickle
+#import random
+
+# Item-vector dictionary
+f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+genre_classification/genre_prob.pkl', 'rb')
+song_library = pickle.load(f)
+f.close()
+
+# Load training and test data
+f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\
+cross_validation.pkl', 'rb')
+users_train, users_test = pickle.load(f)
+f.close()
27 "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0, 27 # Cosine Similarity
28 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0, 28 def cosine_similarity(vector1, vector2):
29 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0, 29 dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
30 "SOAJZEP12A8C14379B": 2.0}, 30 length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
31 "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0, 31 length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
32 "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0, 32 return dot_product / (length_x * length_y)
33 "SOAJZEP12A8C14379B": 1.0}, 33
34 "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0, 34 # Adjusted Cosine Similarity
35 "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0, 35 def adj_cos_sim(vector_i, vector_j):
36 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0, 36 avrg_w_i = (float(sum(vector_i)) / len(vector_i))
37 "SOAJZEP12A8C14379B": 4.0}, 37 avrg_w_j = (float(sum(vector_j)) / len(vector_j))
38 "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0, 38 num = sum(map(
39 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0, 39 lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
40 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0}, 40 vector_i,
41 "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0, 41 vector_j)
42 "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5, 42 )
43 "SOAKNZI12A58A79CAC": 3.0} 43 dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
44 dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
45 return num / (sqrt(dem1) * sqrt(dem2))
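Of the two inserted similarity helpers, `adj_cos_sim` mean-centres each vector before taking the cosine, so it measures how two profiles co-vary rather than how their raw magnitudes align. A quick check with made-up three-dimensional vectors, assuming the two functions above are in scope:

    v1, v2 = [3.0, 1.0, 2.0], [4.0, 2.0, 3.0]
    print cosine_similarity(v1, v2)  # ~0.9926: raw vectors point in slightly different directions
    print adj_cos_sim(v1, v2)        # 1.0: after mean-centring the two vectors are identical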
+
+# Fitness function for EDA
+def Fitness(profile_u, user_subset):
+    fitness_value = 0
+    for songID, score in user_subset.iteritems():
+        #print cosine_similarity(profile, song_library[songID])
+        sim = cosine_similarity(profile_u, song_library[songID])
+        if sim <= 0:
+            fitness_value += -708
+            #math.log(sys.float_info.min)
+        else:
+            fitness_value += log(score * sim)
+        #fitness_value += log(score * manhattan(profile, song_library[songID]))
+        #fitness_value += score * cosine_similarity(profile, song_library[songID])
+    return fitness_value
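The magic constant -708 is a floor for the log term: when the similarity is non-positive, log(score * sim) is undefined, so the fitness is penalised with roughly the most negative value log() can return for a positive double, as the inline comment hints:

    import sys
    from math import log
    print log(sys.float_info.min)  # -708.396..., log of the smallest positive float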
+
+def users_likes_subset(users, rating_threshold=3):
+    # Subset of most-liked items
+    users_subset = {}
+    for userID, songs in users.iteritems():
+        scores_above_threshold = {
+            songID: score for songID, score in songs.iteritems() if score > rating_threshold
+        }
+        users_subset[userID] = scores_above_threshold
+
+    #for songID, score in songs.iteritems():
+        #print score > 0
+        #if score > 0:
+            #print {userID: {songID: score}}
+
+    #{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
+
+    return users_subset
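With the default `rating_threshold` of 3, only strictly greater ratings survive the filter; a rating of exactly 3 is dropped. On hypothetical data:

    ratings = {'u1': {'songA': 4.5, 'songB': 2.0, 'songC': 3.0}}
    print users_likes_subset(ratings)  # {'u1': {'songA': 4.5}} -- 3.0 is not > 3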

-# Functions to compute similarity between items or between profiles
-# Source: http://www.guidetodatamining.com
-def manhattan(vector1, vector2):
-    """Computes the Manhattan distance."""
-    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))
-
-def nearestNeighbor(self, itemVector):
-    """return nearest neighbor to itemVector"""
-    return min([(
-        self.manhattan(itemVector, item[1]), item) for item in self.data
-    ])
-
-def classify(self, itemVector):
-    """Return class we think item Vector is in"""
-    return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])
+
+def eda_train(users_subset, max_gen=200):
+    # TRAINING
+    num_features = len(song_library.values()[0])
+    # Given parameters for EDA
+    population_size = len(users_subset)
+    fraction_of_population = int(round(0.5 * population_size))
+
+    # Generation of M individuals uniformly
+    np.random.seed(12345)
+    M = np.random.uniform(
+        0,
+        1,
+        population_size * num_features
+    )
+    M.shape = (-1, num_features)
+    profile_u = {}
+    i = 0
+    for userID in users_subset:
+        profile_u[userID] = M.tolist()[i]
+        i += 1
+
+    generation = 0
+    while generation < max_gen:
+        # Compute fitness values
+        users_fitness = {}
+        for userID in profile_u:
+            users_fitness[userID] = Fitness(
+                profile_u[userID],
+                users_subset[userID]
+            )
+        users_fitness_df = pd.DataFrame(
+            users_fitness.items(),
+            columns=["userID", "fitness"]
+        )
+
+        # Selection of best individuals based on fitness values
+        best_individuals = {}
+        users_fitness_df = users_fitness_df.sort(columns='fitness')
+        M_sel = users_fitness_df.tail(fraction_of_population)
+        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
+        for userID in M_sel_dict:
+            best_individuals[userID] = profile_u[userID]
+
+        # Calculate sample mean and standard deviation
+        D = np.array([])
+        for userID, features in best_individuals.iteritems():
+            D = np.append(D, features, axis=0)
+        D.shape = (-1, num_features)
+        D_mu = np.mean(D, axis=0)
+        D_sigma = np.std(D, axis=0, ddof=1)
+
+        # Sample M individuals
+        M = np.random.normal(
+            D_mu,
+            D_sigma,
+            (population_size, num_features)
+        )
+        #M = 1 / (D_sigma * np.sqrt(2 * np.pi)) * np.exp(- (M_range - D_mu) ** 2 / (2 * D_sigma ** 2))
+        #M.shape = (-1, len(items.values()[0]))
+        #M = D_sigma * np.random.normal(
+            #population_size,
+            #len(items.values()[0])
+        #) + D_mu
+        profile_u = {}
+        i = 0
+        for userID in users_subset:
+            profile_u[userID] = M.tolist()[i]
+            i += 1
+        generation += 1
+
+    return profile_u
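Each generation of `eda_train` follows the usual continuous-EDA recipe: score every candidate profile, keep the best half (ascending sort, then `tail`), fit an independent Gaussian per feature to the survivors, and resample the population from it. The same update condensed to array form (a sketch only; `fitness` stands for any per-individual scoring function):

    import numpy as np

    def eda_step(P, fitness, frac=0.5):
        # P is an (n_individuals, n_features) population array
        scores = np.array([fitness(ind) for ind in P])
        keep = P[np.argsort(scores)[-int(round(frac * len(P))):]]  # best half
        mu, sigma = keep.mean(axis=0), keep.std(axis=0, ddof=1)    # per-feature Gaussian
        return np.random.normal(mu, sigma, P.shape)                # resample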
+
+# Similarity matrix
+def cb_similarity(profileID, profile_data, test_data, N):
+    ''' Content-based: Similarity matrix '''
+    similarity = []
+    for songID in test_data[profileID]:
+        sim = adj_cos_sim(profile_data, song_library[songID])
+        similarity.append((sim, songID))
+    # Top-N recommendation
+    #similarity.sort(reverse=True)
+    #if len(similarity) > N:
+        #similarity = similarity[0:N]
+
+    #sim_matrix[userID] = {t[1]: t[0] for t in similarity}
+    return {t[1]: t[0] for t in similarity}
+
+def evaluate_eda(
+        profiles,
+        test_data,
+        N=10,
+        rating_threshold=3,
+        EDA_treshold=0.3):
+
+    ''' Evaluation '''
+
+    sim_matrix = {}
+    for userID, features in profiles.iteritems():
+        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)
+
+    # Content-Based: Evaluation
+    tp = 0.
+    fp = 0.
+    fn = 0.
+    tn = 0.
+    for userID, songID_sim in sim_matrix.iteritems():
+        for songID, sim_value in songID_sim.iteritems():
+            score = test_data[userID][songID]
+            if score > rating_threshold and sim_value >= EDA_treshold:
+                tp += 1
+            elif score <= rating_threshold and sim_value >= EDA_treshold:
+                fp += 1
+            elif score > rating_threshold and sim_value < EDA_treshold:
+                fn += 1
+            elif score <= rating_threshold and sim_value < EDA_treshold:
+                tn += 1
+
+    precision = tp / (tp + fp)
+    recall = tp / (tp + fn)
+    F1 = 2 * precision * recall / (precision + recall)
+    accuracy = (tp + tn) / (tp + fp + tn + fn)
+
+    return precision, recall, F1, accuracy
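The evaluation treats a held-out rating above `rating_threshold` as relevant and a profile-to-song similarity at or above `EDA_treshold` (sic) as retrieved, then reads precision, recall, F1 and accuracy off the confusion counts. Note that `tp + fp` (or `tp + fn`) can be zero on an unlucky fold, which would raise ZeroDivisionError; a guarded variant would be (an assumption, not in the original):

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    F1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0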
+
+#keys_a = set(users[userID].keys())
+#keys_b = set(test_data.keys())
+#intersection = keys_a & keys_b
+#if len(intersection) != 0:
+    #similarity = {}
+    #print {k: v for k, v in song_library_fold[0].iteritems() if k in songs}
+    #for songID in intersection:
+        #if songID == k:
+            #similarity[songID] = adj_cos_sim(
+                #profile[userID],
+                #test_data[songID]
+            #)
+    #max_sim = max(similarity, key=similarity.get)
+    #if max_sim >= EDA_treshold:
+        #sim_matrix[userID] = {max_sim: similarity[max_sim]}
+    #sim_matrix[userID] = similarity
+    #sim_matrix[userID] = {max_sim: similarity[max_sim]}
+
+#print len(sim_matrix)
+p = np.array([])
+f = np.array([])
+r = np.array([])
+a = np.array([])
+
+for i in range(len(users_train)):
+
+    profile_u = eda_train(users_likes_subset(users_train[i]))
+    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
+    p = np.append(p, pi)
+    r = np.append(r, ri)
+    f = np.append(f, fi)
+    a = np.append(a, ai)
+
+#precision = np.array(p)
+#rec = np.array(r)
+#F1 = np.array(f)
+#accuracy = np.array(a)
+
+print "Precision = %f3 ± %f3" % (p.mean(), p.std())
+print "Recall = %f3 ± %f3" % (r.mean(), r.std())
+print "F1 = %f3 ± %f3" % (f.mean(), f.std())
+print "Accuracy = %f3 ± %f3" % (a.mean(), a.std())
+
+'''# Collaborative-filtering: Similarity matrix
+sim_matrix_cf = {}
+count = 0
+for userID_1 in profile:
+    similarities = {}
+    for userID_2 in profile:
+        if userID_1 != userID_2:
+            similarities[userID_2] = adj_cos_sim(
+                profile[userID_1],
+                profile[userID_2]
+            )
+    #print similarities
+    sim_matrix_cf[userID_1] = similarities'''
+
+# Predicted rating
+#for userID in users:
+#    print np.array(users[userID].values()).mean()
+
+'''scores_above_threshold = {
+    songID: score for songID, score in songs.iteritems() if score > rating_threshold
+}'''
+
+'''for key, value in sorted(similarity.iteritems(), key=lambda (k,v): (v,k), reverse=True):
+    print "%s: %s" % (key, value)
+    break'''
+
+
+# Recommend new item
+
+'''
+def computeNearestNeighbor(itemName, itemVector, items):
+    """creates a sorted list of items based on their distance to item"""
+    distances = []
+    for otherItem in items:
+        if otherItem != itemName:
+            distance = adj_cos_sim(itemVector, items[otherItem])
+            distances.append((distance, otherItem))
+    # sort based on distance -- closest first
+    distances.sort(reverse=True)
+    return distances
+
+def classify(user, itemName, itemVector):
+    """Classify the itemName based on user ratings
+    Should really have items and users as parameters"""
+    # first find nearest neighbor
+    nearest = computeNearestNeighbor(itemName, itemVector, song_library)[0][1]
+    rating = users[user][nearest]
+    return rating
+'''
+# Source: guidetodatamining.com
+'''def computeSimilarity(band1, band2, userRatings):
+    averages = {}
+    for (key, ratings) in userRatings.items():
+        averages[key] = (float(sum(ratings.values())) / len(ratings.values()))
+    num = 0  # numerator
+    dem1 = 0  # first half of denominator
+    dem2 = 0
+    for (user, ratings) in userRatings.items():
+        if band1 in ratings and band2 in ratings:
+            avg = averages[user]
+            num += (ratings[band1] - avg) * (ratings[band2] - avg)
+            dem1 += (ratings[band1] - avg)**2
+            dem2 += (ratings[band2] - avg)**2
+    return num / (sqrt(dem1) * sqrt(dem2))'''
+
+'''
+sum_xy = 0
+sum_x2 = 0
+sum_y2 = 0
+n = 0
+for key in rating1:
+    if key in rating2:
+        n += 1
+        x = rating1[key]
+        y = rating2[key]
+        sum_xy += x * y
+if n == 0:
+    return 0
+
+# now compute denominator
+for key in rating1:
+    x = rating1[key]
+    sum_x2 += pow(x, 2)
+
+for key in rating2:
+    y = rating2[key]
+    sum_y2 += pow(y, 2)
+
+denominator = sqrt(sum_x2) * sqrt(sum_y2)
+if denominator == 0:
+    return 0
+else:
+    return sum_xy / denominator'''
+
@@ -70 +342 @@
 '''
 # Median
 # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
 def get_median(lst):
     return np.median(np.array(lst))
@@ -84 +356 @@
 def normalize_rating(ratings, median, asd):
     for i in range(len(ratings)):
         ratings[i] = (ratings[i] - median) / asd
     return ratings
 '''
-# Normalise user play count
-for userID in users:
-    song_play_count = pd.DataFrame(
-        users[userID].items(),
-        columns=["songID", "play_count"]
-    )
-    '''Coefficient of variation'''
-    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
-    #user_ratings = np.array(users[userID].values())
-    #cv = user_ratings.std()/user_ratings.mean()
-    #print userID, cv
-    if cv <= 0.5:
-        for songID, play_count in users[userID].items():
-            users[userID][songID] = 3
-    else:
-        song_play_count_q = pd.cut(
-            song_play_count["play_count"],
-            5,
-            labels=False
-        ) + 1
-        song_play_count.play_count = song_play_count_q
-        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
-        #print song_play_count
-    #median = get_median(user_ratings)
-    #asd = get_asd(user_ratings, median)
-    #for songID, play_count in users[userID].items():
-        #users[userID][songID] = (play_count - median) / asd
-
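The deleted normalisation mapped raw play counts onto a 1-5 pseudo-rating scale: users whose counts barely vary (coefficient of variation at or below 0.5) get a flat 3, everyone else is split into five equal-width bins. The `pd.cut` idiom on made-up counts:

    import pandas as pd
    counts = pd.Series([1, 2, 3, 8, 21])
    print pd.cut(counts, 5, labels=False) + 1  # equal-width bins -> 1, 1, 1, 2, 5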
-# Subset of most-liked items
-users_subset = {}
-for userID, songs in users.iteritems():
-    scores_above_threshold = {
-        songID: score for songID, score in songs.iteritems() if score > 2
-    }
-    users_subset[userID] = scores_above_threshold
-'''
-for songID, score in songs.iteritems():
-    print score > 0
-    if score > 0:
-        print {userID: {songID: score}}
-
-{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
-'''
-# Fitness function for EDA
-def Fitness(profile, user_subset):
-    fitness_value = 0
-    for songID, score in user_subset.iteritems():
-        fitness_value += log10(score * manhattan(profile, items[songID]))
-    return fitness_value
-
-# Given parameters for EDA
-population_size = len(users_subset)
-fraction_of_population = int(round(0.5 * population_size))
-
-# Generation of M individuals uniformly
-np.random.seed(len(users_subset))
-M = np.random.rand(population_size, len(items.values()[0]))
-#M.shape = (-1, len(items.values()[0]))
-profile = {}
-i = 0
-for userID in users_subset:
-    profile[userID] = M.tolist()[i]
-    i += 1
-
-# Compute fitness values
-users_fitness = {}
-for userID in profile:
-    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
-users_fitness_df = pd.DataFrame(
-    users_fitness.items(),
-    columns=["userID", "fitness"]
-)
-
-# Selection of best individuals based on fitness values
-best_individuals = {}
-users_fitness_df = users_fitness_df.sort(columns='fitness')
-M_sel = users_fitness_df.head(fraction_of_population)
-M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
-for userID in M_sel_dict:
-    best_individuals[userID] = profile[userID]
-
-# Calculate sample mean and standard deviation
-np.random.seed(1)
-g = mixture.GMM(n_components=10)
-# Generate random observations with two modes centered on 0
-# and 10 to use for training.
-obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
-g.fit(obs)
-np.round(g.weights_, 2)
-np.round(g.means_, 2)
-np.round(g.covars_, 2)
-g.predict([[0], [2], [9], [10]])
-np.round(g.score([[0], [2], [9], [10]]), 2)
-# Refit the model on new data (initial parameters remain the
-# same), this time with an even split between the two modes.
-g.fit(20 * [[0]] + 20 * [[10]])
-np.round(g.weights_, 2)
-
@@ -187 +361 @@
 
 '''
 # Pearson Correlation Coefficient
 def pearson(rating1, rating2):
     sum_xy = 0
@@ -211 +385 @@
         sqrt(sum_y2 - pow(sum_y, 2) / n)
     if denominator == 0:
         return 0
     else:
         return (sum_xy - (sum_x * sum_y) / n) / denominator
-
-# Cosine Similarity for test purposes
-def cosine_similarity(rating1, rating2):
-    sum_xy = 0
-    sum_x2 = 0
-    sum_y2 = 0
-    n = 0
-    for key in rating1:
-        if key in rating2:
-            n += 1
-            x = rating1[key]
-            y = rating2[key]
-            sum_xy += x * y
-    if n == 0:
-        return 0
-
-    # now compute denominator
-    for key in rating1:
-        x = rating1[key]
-        sum_x2 += pow(x, 2)
-
-    for key in rating2:
-        y = rating2[key]
-        sum_y2 += pow(y, 2)
-
-    denominator = sqrt(sum_x2) * sqrt(sum_y2)
-    if denominator == 0:
-        return 0
-    else:
-        return sum_xy / denominator
-
-
-def Fitness(profile, user_index):
-    sim = 0
-    sum_log = 0
-
-    features = profile.items()[user_index][1]
-    songs = users.items()[user_index][1]
-
-    for song, rating in songs.items():
-        sim = pearson(features, items[song])
-        print(sim)
-
-    for username, songs in users.items():
-        sim = pearson(profile, items[song])
-        #sum_log += log10(rating * sim)
-    return sim
-'''
+'''
+'''
+def buckets(filename, bucketName, separator, classColumn):
+    """the original data is in the file named filename
+    bucketName is the prefix for all the bucket names
+    separator is the character that divides the columns
+    (for ex., a tab or comma and classColumn is the column
+    that indicates the class"""
+    # put the data in 10 buckets
+    numberOfBuckets = 10
+    data = {}
+    # first read in the data and divide by category
+    with open(filename) as f:
+        lines = f.readlines()
+    for line in lines:
+        if separator != '\t':
+            line = line.replace(separator, '\t')
+        # first get the category
+        category = line.split()[classColumn]
+        data.setdefault(category, [])
+        data[category].append(line)
+    # initialize the buckets
+    buckets = []
+    for i in range(numberOfBuckets):
+        buckets.append([])
+    # now for each category put the data into the buckets
+    for k in song_library.keys():
+        #randomize order of instances for each class
+        print random.shuffle(song_library[k])
+        bNum = 0
+        # divide into buckets
+        for item in song_library[k]:
+            buckets[bNum].append(item)
+            bNum = (bNum + 1) % numberOfBuckets
+    # write to file
+    for bNum in range(numberOfBuckets):
+        f = open("%s-%02i" % (bucketName, bNum + 1), 'w')
+        for item in buckets[bNum]:
+            f.write(item)
+        f.close()
+'''
+
+'''# Functions to compute similarity between items or between profiles
+# Source: http://www.guidetodatamining.com
+def manhattan(vector1, vector2):
+    """Computes the Manhattan distance."""
+    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))'''