Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
diff Code/eda.py @ 24:68a62ca32441
Organized python scripts
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
---|---|
date | Sat, 15 Aug 2015 19:16:17 +0100 |
parents | 45e6f85d0ba4 |
children | fafc0b249a73 |
line wrap: on
line diff
--- a/Code/eda.py Tue Aug 11 14:23:42 2015 +0100 +++ b/Code/eda.py Sat Aug 15 19:16:17 2015 +0100 @@ -8,8 +8,11 @@ from math import sqrt, log10 import numpy as np +import pandas as pd from sklearn import mixture +#Fine tuning + #User-item dictionary users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, @@ -49,43 +52,140 @@ "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} -#Functions to compute similarity between items or between profiles +# Functions to compute similarity between items or between profiles # Source: http://www.guidetodatamining.com def manhattan(vector1, vector2): """Computes the Manhattan distance.""" - distance = 0 - total = 0 - n = len(vector1) - for i in range(n): - distance += abs(vector1[i] - vector2[i]) - return distance + return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) -def computeNearestNeighbor(itemName, itemVector, items): - """creates a sorted list of items based on their distance to item""" - distances = [] - for otherItem in items: - if otherItem != itemName: - distance = manhattan(itemVector, items[otherItem]) - distances.append((distance, otherItem)) - # sort based on distance -- closest first - distances.sort() - return distances +def nearestNeighbor(self, itemVector): + """return nearest neighbor to itemVector""" + return min([( + self.manhattan(itemVector, item[1]), item) for item in self.data + ]) -def classify(user, itemName, itemVector): - """Classify the itemName based on user ratings - Should really have items and users as parameters""" - # first find nearest neighbor - nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1] - rating = users[user][nearest] - return rating +def classify(self, itemVector): + """Return class we think item Vector is in""" + return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) +''' +# Median +# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python +def get_median(lst): + return np.median(np.array(lst)) -# Fitness function of EDA -def Fitness(profile, user): - nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1] - rating = users[user][nearest] - return rating +# Absolute Standard Deviation +def get_asd(lst, median): + sum = 0 + for item in lst: + sum += abs(item - median) + return sum / len(lst) - +# Normalisation rating with Modified Standard Score +def normalize_rating(ratings, median, asd): + for i in range(len(ratings)): + ratings[i] = (ratings[i] - median) / asd + return ratings +''' +# Normalise user play count +for userID in users: + song_play_count = pd.DataFrame( + users[userID].items(), + columns=["songID", "play_count"] + ) + '''Coefficient of variation''' + cv = song_play_count.play_count.std() / song_play_count.play_count.mean() + #user_ratings = np.array(users[userID].values()) + #cv = user_ratings.std()/user_ratings.mean() + #print userID, cv + if cv <= 0.5: + for songID, play_count in users[userID].items(): + users[userID][songID] = 3 + else: + song_play_count_q = pd.cut( + song_play_count["play_count"], + 5, + labels=False + ) + 1 + song_play_count.play_count = song_play_count_q + users[userID] = song_play_count.set_index('songID')['play_count'].to_dict() + #print song_play_count + #median = get_median(user_ratings) + #asd = get_asd(user_ratings, median) + #for songID, play_count in users[userID].items(): + #users[userID][songID] = (play_count - median) / asd + +# Subset of most-liked items +users_subset = {} +for userID, songs in users.iteritems(): + scores_above_threshold = { + songID: score for songID, score in songs.iteritems() if score > 2 + } + users_subset[userID]= scores_above_threshold + ''' + for songID, score in songs.iteritems(): + print score >0 + if score > 0: + print {userID: {songID: score}} + +{k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0} +''' +# Fitness function for EDA +def Fitness(profile, user_subset): + fitness_value = 0 + for songID, score in user_subset.iteritems(): + fitness_value += log10(score * manhattan(profile, items[songID])) + return fitness_value + +# Given parameters for EDA +population_size = len(users_subset) +fraction_of_population = int(round(0.5 * population_size)) + +# Generation of M individuals uniformly +np.random.seed(len(users_subset)) +M = np.random.rand(population_size, len(items.values()[0])) +#M.shape = (-1, len(items.values()[0])) +profile = {} +i = 0 +for userID in users_subset: + profile[userID] = M.tolist()[i] + i += 1 + +# Compute fitness values +users_fitness = {} +for userID in profile: + users_fitness[userID] = Fitness(profile[userID], users_subset[userID]) +users_fitness_df = pd.DataFrame( + users_fitness.items(), + columns=["userID", "fitness"] +) + +# Selection of best individuals based on fitness values +best_individuals = {} +users_fitness_df = users_fitness_df.sort(columns='fitness') +M_sel = users_fitness_df.head(fraction_of_population) +M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict() +for userID in M_sel_dict: + best_individuals[userID] = profile[userID] + +# Calculate sample mean and standard deviation +np.random.seed(1) +g = mixture.GMM(n_components=10) +# Generate random observations with two modes centered on 0 +# and 10 to use for training. +obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) +g.fit(obs) +np.round(g.weights_, 2) +np.round(g.means_, 2) +np.round(g.covars_, 2) +g.predict([[0], [2], [9], [10]]) +np.round(g.score([[0], [2], [9], [10]]), 2) +# Refit the model on new data (initial parameters remain the +# same), this time with an even split between the two modes. +g.fit(20 * [[0]] + 20 * [[10]]) +np.round(g.weights_, 2) + + +''' # Pearson Correlation Coefficient def pearson(rating1, rating2): sum_xy = 0 @@ -145,7 +245,7 @@ return sum_xy / denominator -''' + def Fitness(profile, user_index): sim = 0 sum_log = 0 @@ -163,40 +263,12 @@ #sum_log += log10(rating * sim) return sim ''' -# Generation of M individuals uniformly -population_size = len(users) -fraction_of_population = 0.5 -np.random.seed(len(users)) -M = np.random.uniform(size=population_size * len(items.values()[0])) -M.shape = (-1, len(items.values()[0])) -profile = {} -i = 0 -for row in M.tolist(): - profile["Profile" + str(i)] = M.tolist()[i] - i = i + 1 -''' -Calculate fitness values -''' -Fitness(profile, 0) -np.random.seed(1) -g = mixture.GMM(n_components=7) -# Generate random observations with two modes centered on 0 -# and 10 to use for training. -obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) -g.fit(obs) -np.round(g.weights_, 2) -np.round(g.means_, 2) -np.round(g.covars_, 2) -g.predict([[0], [2], [9], [10]]) -np.round(g.score([[0], [2], [9], [10]]), 2) -# Refit the model on new data (initial parameters remain the -# same), this time with an even split between the two modes. -g.fit(20 * [[0]] + 20 * [[10]]) -np.round(g.weights_, 2) + +