Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
view Code/eda.py @ 21:e68dbee1f6db
Modified code
New datasets
Updated report
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
---|---|
date | Tue, 11 Aug 2015 10:50:36 +0100 |
parents | ee13c193c76e |
children | 45e6f85d0ba4 |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from math import sqrt, log10

import numpy as np

try:
    from sklearn import mixture
except ImportError:
    # scikit-learn is only needed by the Gaussian-mixture demo at the bottom
    # of this script; the similarity and fitness code runs without it.
    mixture = None

# User-item dictionary: user name -> {song id: rating}
users = {
    "Angelica": {
        "SOAJJPC12AB017D63F": 3.5,
        "SOAKIXJ12AC3DF7152": 2.0,
        "SOAKPFH12A8C13BA4A": 4.5,
        "SOAGTJW12A6701F1F5": 5.0,
        "SOAKWCK12A8C139F81": 1.5,
        "SOAKNZI12A58A79CAC": 2.5,
        "SOAJZEP12A8C14379B": 2.0,
    },
    "Bill": {
        "SOAJJPC12AB017D63F": 2.0,
        "SOAKIXJ12AC3DF7152": 3.5,
        "SOAHQFM12A8C134B65": 4.0,
        "SOAGTJW12A6701F1F5": 2.0,
        "SOAKWCK12A8C139F81": 3.5,
        "SOAJZEP12A8C14379B": 3.0,
    },
    "Chan": {
        "SOAJJPC12AB017D63F": 5.0,
        "SOAKIXJ12AC3DF7152": 1.0,
        "SOAHQFM12A8C134B65": 1.0,
        "SOAKPFH12A8C13BA4A": 3.0,
        "SOAGTJW12A6701F1F5": 5,
        "SOAKWCK12A8C139F81": 1.0,
    },
    "Dan": {
        "SOAJJPC12AB017D63F": 3.0,
        "SOAKIXJ12AC3DF7152": 4.0,
        "SOAHQFM12A8C134B65": 4.5,
        "SOAGTJW12A6701F1F5": 3.0,
        "SOAKWCK12A8C139F81": 4.5,
        "SOAKNZI12A58A79CAC": 4.0,
        "SOAJZEP12A8C14379B": 2.0,
    },
    "Hailey": {
        "SOAKIXJ12AC3DF7152": 4.0,
        "SOAHQFM12A8C134B65": 1.0,
        "SOAKPFH12A8C13BA4A": 4.0,
        "SOAKNZI12A58A79CAC": 4.0,
        "SOAJZEP12A8C14379B": 1.0,
    },
    "Jordyn": {
        "SOAKIXJ12AC3DF7152": 4.5,
        "SOAHQFM12A8C134B65": 4.0,
        "SOAKPFH12A8C13BA4A": 5.0,
        "SOAGTJW12A6701F1F5": 5.0,
        "SOAKWCK12A8C139F81": 4.5,
        "SOAKNZI12A58A79CAC": 4.0,
        "SOAJZEP12A8C14379B": 4.0,
    },
    "Sam": {
        "SOAJJPC12AB017D63F": 5.0,
        "SOAKIXJ12AC3DF7152": 2.0,
        "SOAKPFH12A8C13BA4A": 3.0,
        "SOAGTJW12A6701F1F5": 5.0,
        "SOAKWCK12A8C139F81": 4.0,
        "SOAKNZI12A58A79CAC": 5.0,
    },
    "Veronica": {
        "SOAJJPC12AB017D63F": 3.0,
        "SOAKPFH12A8C13BA4A": 5.0,
        "SOAGTJW12A6701F1F5": 4.0,
        "SOAKWCK12A8C139F81": 2.5,
        "SOAKNZI12A58A79CAC": 3.0,
    },
}

# Song id -> list of ten numeric values; Fitness() compares these vectors
# with a profile's vector.
items = {
    "SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
    "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
    "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
    "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
    "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
    "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
    "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
    "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1],
}


# ---------------------------------------------------------------------------
# Functions to compute similarity between items or between profiles
# ---------------------------------------------------------------------------

def _common_pairs(rating1, rating2):
    """Return the list of ``(x, y)`` value pairs shared by both arguments.

    Two dicts are paired on the keys they have in common (iterating
    ``rating1``).  Two sequences are paired by position and truncated to the
    shorter one.  Mixing a dict with a sequence raises ``TypeError`` because
    there is no meaningful way to align them.
    """
    first_is_dict = isinstance(rating1, dict)
    second_is_dict = isinstance(rating2, dict)
    if first_is_dict and second_is_dict:
        return [(rating1[key], rating2[key])
                for key in rating1 if key in rating2]
    if first_is_dict or second_is_dict:
        raise TypeError(
            "rating1 and rating2 must both be dicts or both be sequences")
    return list(zip(rating1, rating2))


def _all_values(rating):
    """Return every value of a dict, or the sequence itself."""
    return rating.values() if isinstance(rating, dict) else rating


# Pearson Correlation Coefficient
# Source: http://www.guidetodatamining.com
def pearson(rating1, rating2):
    """Return the Pearson correlation of the values two ratings share.

    Parameters
    ----------
    rating1, rating2 : dict or sequence of numbers
        Either two dicts (correlated over their common keys) or two
        sequences such as lists (correlated position by position).

    Returns
    -------
    number
        The correlation coefficient, or 0 when there is no shared value or
        when either side has zero variance.

    Raises
    ------
    TypeError
        If one argument is a dict and the other is not.
    """
    pairs = _common_pairs(rating1, rating2)
    if not pairs:
        return 0

    # Float count so the divisions below never truncate on Python 2.
    n = float(len(pairs))
    sum_x = sum(x for x, _ in pairs)
    sum_y = sum(y for _, y in pairs)
    sum_xy = sum(x * y for x, y in pairs)
    sum_x2 = sum(pow(x, 2) for x, _ in pairs)
    sum_y2 = sum(pow(y, 2) for _, y in pairs)

    # Rounding can push a mathematically non-negative term slightly below
    # zero, which would make sqrt() raise ValueError; clamp it instead.
    spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0.0)
    spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0.0)
    denominator = sqrt(spread_x) * sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator


# Cosine Similarity for test purposes
def cosine_similarity(rating1, rating2):
    """Return the cosine similarity between two ratings.

    The dot product runs over the shared values (common keys for dicts,
    matching positions for sequences) while each norm is taken over *all*
    values of the respective argument.

    Returns 0 when nothing is shared or when either norm is zero.  Raises
    ``TypeError`` if one argument is a dict and the other is not.
    """
    pairs = _common_pairs(rating1, rating2)
    if not pairs:
        return 0

    sum_xy = sum(x * y for x, y in pairs)
    sum_x2 = sum(pow(x, 2) for x in _all_values(rating1))
    sum_y2 = sum(pow(y, 2) for y in _all_values(rating2))
    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    return sum_xy / denominator


# ---------------------------------------------------------------------------
# Fitness function of EDA
# ---------------------------------------------------------------------------

def Fitness(profile, user_index):
    """Print the similarity between one profile and each song of one user.

    Parameters
    ----------
    profile : dict
        Profile name -> list of numbers.  The profile at position
        ``user_index`` (in dict iteration order) is evaluated.
    user_index : int
        Position, in the iteration order of the module-level ``users``
        dict, of the user whose rated songs are compared with the profile.

    Returns
    -------
    number
        The Pearson similarity computed for the last song of that user, or
        0 if the user has no songs.

    Raises
    ------
    IndexError
        If ``user_index`` is out of range for ``profile`` or ``users``.
    KeyError
        If a rated song is missing from the module-level ``items`` dict.
    """
    # Dict views cannot be indexed on Python 3, so materialise them first.
    features = list(profile.values())[user_index]
    songs = list(users.values())[user_index]

    sim = 0
    for song in songs:
        sim = pearson(features, items[song])
        print(sim)
    # NOTE: the rating-weighted aggregate sketched by the original author,
    # sum(log10(rating * sim)), is still not computed here; it is undefined
    # whenever a similarity is zero or negative.
    return sim


# ---------------------------------------------------------------------------
# Generation of M individuals uniformly
# ---------------------------------------------------------------------------

population_size = len(users)
fraction_of_population = 0.5  # not used by the code in this file section
np.random.seed(len(users))

# One row per individual, one column per entry of a song vector.
_n_features = len(next(iter(items.values())))
M = np.random.uniform(size=(population_size, _n_features))
profile = {"Profile" + str(i): row for i, row in enumerate(M.tolist())}

# ---------------------------------------------------------------------------
# Calculate fitness values
# ---------------------------------------------------------------------------

Fitness(profile, 0)

# ---------------------------------------------------------------------------
# Gaussian-mixture demo (skipped when scikit-learn is not installed)
# ---------------------------------------------------------------------------

if mixture is not None:
    np.random.seed(1)
    # mixture.GMM was removed in scikit-learn 0.20 in favour of
    # mixture.GaussianMixture; use whichever this installation provides.
    _mixture_cls = getattr(mixture, "GaussianMixture", None) or mixture.GMM
    g = _mixture_cls(n_components=7)

    # Generate random observations with two modes centered on 0
    # and 10 to use for training.
    obs = np.concatenate((np.random.randn(100, 1),
                          10 + np.random.randn(300, 1)))
    g.fit(obs)

    # The values below are computed and discarded, exactly as in the
    # original script; they only exercise the fitted model.
    np.round(g.weights_, 2)
    np.round(g.means_, 2)
    # The attribute is ``covariances_`` on GaussianMixture, ``covars_`` on GMM.
    np.round(g.covariances_ if hasattr(g, "covariances_") else g.covars_, 2)
    g.predict([[0], [2], [9], [10]])
    np.round(g.score([[0], [2], [9], [10]]), 2)

    # Refit the model on new data, this time with an even split between
    # the two modes.
    g.fit(20 * [[0]] + 20 * [[10]])
    np.round(g.weights_, 2)