Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
view Code/eda.py @ 24:68a62ca32441
Organized python scripts
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
---|---|
date | Sat, 15 Aug 2015 19:16:17 +0100 |
parents | 45e6f85d0ba4 |
children | fafc0b249a73 |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano

Exploratory script: toy user/item data for a hybrid music recommender.
This part normalises each user's implicit play counts into a 1-5 rating
scale; the rest of the file runs an EDA (Estimation of Distribution
Algorithm) experiment and a GMM demo on top of these dictionaries.
"""

from math import sqrt, log10  # sqrt is only used by the commented-out helpers
import numpy as np
import pandas as pd
# scikit-learn is only needed by the GMM demo at the end of the script;
# import it defensively so the data preparation here still runs without it.
try:
    from sklearn import mixture
except ImportError:  # NOTE(review): the GMM demo below needs scikit-learn
    mixture = None

# Fine tuning
# User-item dictionary: userID -> {songID: play count (implicit feedback)}
users = {
    "Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
                 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
                 "SOAJZEP12A8C14379B": 2.0},
    "Bill": {"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5,
             "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0,
             "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0},
    "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0,
             "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0,
             "SOAGTJW12A6701F1F5": 5, "SOAKWCK12A8C139F81": 1.0},
    "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0,
            "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0,
            "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
            "SOAJZEP12A8C14379B": 2.0},
    "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0,
               "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0,
               "SOAJZEP12A8C14379B": 1.0},
    "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0,
               "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0,
               "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
               "SOAJZEP12A8C14379B": 4.0},
    "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0,
            "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0,
            "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0},
    "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0,
                 "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5,
                 "SOAKNZI12A58A79CAC": 3.0},
}

# songID -> hand-crafted 10-dimensional feature vector
items = {
    "SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
    "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
    "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
    "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
    "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
    "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
    "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
    "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1],
}


# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Return the Manhattan (L1) distance between two equal-length vectors."""
    return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))


# NOTE(review): the two functions below were copied from a classifier CLASS in
# guidetodatamining. As module-level functions they receive a `self` that has
# no `manhattan`, `data` or `normalizeVector` attributes, so they are not
# callable as written; kept verbatim for reference only.
def nearestNeighbor(self, itemVector):
    """return nearest neighbor to itemVector"""
    return min([(self.manhattan(itemVector, item[1]), item)
                for item in self.data])


def classify(self, itemVector):
    """Return class we think item Vector is in"""
    return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])


# Alternative normalisation (Modified Standard Score), kept for reference.
'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute Standard Deviation
def get_asd(lst, median):
    sum = 0
    for item in lst:
        sum += abs(item - median)
    return sum / len(lst)

# Normalisation rating with Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''

# Normalise each user's play counts into a 1-5 rating scale (in place).
for userID in users:
    song_play_count = pd.DataFrame(
        list(users[userID].items()),
        columns=["songID", "play_count"]
    )
    # Coefficient of variation of this user's play counts.
    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
    if cv <= 0.5:
        # Counts are nearly uniform: no preference signal, assign neutral 3.
        for songID in users[userID]:
            users[userID][songID] = 3
    else:
        # Significant spread: quantise counts into 5 equal-width bins and
        # shift to 1..5 (pd.cut integer codes are 0-based).
        song_play_count_q = pd.cut(
            song_play_count["play_count"], 5, labels=False
        ) + 1
        song_play_count.play_count = song_play_count_q
        users[userID] = \
            song_play_count.set_index('songID')['play_count'].to_dict()
# Subset of most-liked items: keep only songs each user rated above 2.
users_subset = {}
for userID, songs in users.items():  # FIX: dict.iteritems() removed in Py3
    scores_above_threshold = {
        songID: score for songID, score in songs.items() if score > 2
    }
    users_subset[userID] = scores_above_threshold
# Earlier experiments with the threshold filter, kept for reference.
'''
for songID, score in songs.iteritems():
    print score > 0
    if score > 0:
        print {userID: {songID: score}}
{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
'''


# Fitness function for EDA
def Fitness(profile, user_subset):
    """Fitness of one candidate profile against one user's liked songs.

    Sums log10(score * manhattan(profile, item features)) over the user's
    liked songs; a SMALLER value means the profile is closer (in L1
    distance) to the items the user likes.
    """
    fitness_value = 0
    for songID, score in user_subset.items():  # FIX: Py3 items()
        fitness_value += log10(score * manhattan(profile, items[songID]))
    return fitness_value


# Given parameters for EDA
population_size = len(users_subset)
fraction_of_population = int(round(0.5 * population_size))

# Generation of M individuals uniformly in [0, 1); seeded for repeatability.
np.random.seed(len(users_subset))
# FIX: dict.values()[0] is not subscriptable in Py3 -- take any feature
# vector to obtain the profile dimensionality.
feature_dim = len(next(iter(items.values())))
M = np.random.rand(population_size, feature_dim)

# Assign one candidate profile (one row of M) per user.
profile = {}
for i, userID in enumerate(users_subset):
    # FIX: was M.tolist()[i], which re-converted the whole matrix each pass.
    profile[userID] = M[i].tolist()

# Compute fitness values
users_fitness = {}
for userID in profile:
    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
users_fitness_df = pd.DataFrame(
    list(users_fitness.items()), columns=["userID", "fitness"]
)

# Selection of best individuals based on fitness values. Ascending sort +
# head keeps the LOWEST fitness values, i.e. profiles closest to the liked
# items under the Manhattan-based Fitness above.
best_individuals = {}
# FIX: DataFrame.sort(columns=...) was removed in pandas 0.20;
# sort_values is the supported equivalent (same default ascending order).
users_fitness_df = users_fitness_df.sort_values('fitness')
M_sel = users_fitness_df.head(fraction_of_population)
M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
for userID in M_sel_dict:
    best_individuals[userID] = profile[userID]

# Calculate sample mean and standard deviation
np.random.seed(1)
# NOTE(review): sklearn.mixture.GMM was deprecated in scikit-learn 0.18 and
# removed in 0.20; the replacement is mixture.GaussianMixture (attribute
# names differ, e.g. covars_ -> covariances_) -- confirm installed version.
g = mixture.GMM(n_components=10)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
# Training data: 100 samples around 0 and 300 around 10 (two clear modes).
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
# REPL-style inspection of the fitted mixture; the rounded results are
# computed but not stored or printed.
np.round(g.weights_, 2)
np.round(g.means_, 2)
# NOTE(review): covars_ is the old sklearn.mixture.GMM attribute name; the
# modern GaussianMixture API calls it covariances_ -- confirm sklearn version.
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)
# Unused rating-similarity helpers (Pearson, cosine) and an older Fitness
# draft, disabled by being wrapped in a module-level string.
'''
# Pearson Correlation Coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
        sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator

# Cosine Similarity for test purposes
def cosine_similarity(rating1, rating2):
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
    if n == 0:
        return 0
    # now compute denominator
    for key in rating1:
        x = rating1[key]
        sum_x2 += pow(x, 2)
    for key in rating2:
        y = rating2[key]
        sum_y2 += pow(y, 2)
    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    else:
        return sum_xy / denominator

def Fitness(profile, user_index):
    sim = 0
    sum_log = 0
    features = profile.items()[user_index][1]
    songs = users.items()[user_index][1]
    for song, rating in songs.items():
        sim = pearson(features, items[song])
        print(sim)
    for username, songs in users.items():
        for song, rating in songs.items():
            sim = pearson(profile, items[song])
            #sum_log += log10(rating * sim)
    return sim
'''