Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
view Code/eda.py @ 24:68a62ca32441
Organized python scripts
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
---|---|
date | Sat, 15 Aug 2015 19:16:17 +0100 |
parents | 45e6f85d0ba4 |
children | fafc0b249a73 |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano

Exploratory script: toy user/item data for a hybrid music recommender.
This part normalises each user's implicit play counts into a 1-5 rating
scale; the rest of the file runs an EDA (Estimation of Distribution
Algorithm) experiment and a GMM demo on top of these dictionaries.
"""

from math import sqrt, log10  # sqrt is only used by the commented-out helpers
import numpy as np
import pandas as pd
# scikit-learn is only needed by the GMM demo at the end of the script;
# import it defensively so the data preparation here still runs without it.
try:
    from sklearn import mixture
except ImportError:  # NOTE(review): the GMM demo below needs scikit-learn
    mixture = None

# Fine tuning
# User-item dictionary: userID -> {songID: play count (implicit feedback)}
users = {
    "Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
                 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
                 "SOAJZEP12A8C14379B": 2.0},
    "Bill": {"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5,
             "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0,
             "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0},
    "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0,
             "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0,
             "SOAGTJW12A6701F1F5": 5, "SOAKWCK12A8C139F81": 1.0},
    "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0,
            "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0,
            "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
            "SOAJZEP12A8C14379B": 2.0},
    "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0,
               "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0,
               "SOAJZEP12A8C14379B": 1.0},
    "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0,
               "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0,
               "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
               "SOAJZEP12A8C14379B": 4.0},
    "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0,
            "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0,
            "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0},
    "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0,
                 "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5,
                 "SOAKNZI12A58A79CAC": 3.0},
}

# songID -> hand-crafted 10-dimensional feature vector
items = {
    "SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
    "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
    "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
    "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
    "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
    "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
    "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
    "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1],
}


# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Return the Manhattan (L1) distance between two equal-length vectors."""
    return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))


# NOTE(review): the two functions below were copied from a classifier CLASS in
# guidetodatamining. As module-level functions they receive a `self` that has
# no `manhattan`, `data` or `normalizeVector` attributes, so they are not
# callable as written; kept verbatim for reference only.
def nearestNeighbor(self, itemVector):
    """return nearest neighbor to itemVector"""
    return min([(self.manhattan(itemVector, item[1]), item)
                for item in self.data])


def classify(self, itemVector):
    """Return class we think item Vector is in"""
    return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])


# Alternative normalisation (Modified Standard Score), kept for reference.
'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute Standard Deviation
def get_asd(lst, median):
    sum = 0
    for item in lst:
        sum += abs(item - median)
    return sum / len(lst)

# Normalisation rating with Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''

# Normalise each user's play counts into a 1-5 rating scale (in place).
for userID in users:
    song_play_count = pd.DataFrame(
        list(users[userID].items()),
        columns=["songID", "play_count"]
    )
    # Coefficient of variation of this user's play counts.
    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
    if cv <= 0.5:
        # Counts are nearly uniform: no preference signal, assign neutral 3.
        for songID in users[userID]:
            users[userID][songID] = 3
    else:
        # Significant spread: quantise counts into 5 equal-width bins and
        # shift to 1..5 (pd.cut integer codes are 0-based).
        song_play_count_q = pd.cut(
            song_play_count["play_count"], 5, labels=False
        ) + 1
        song_play_count.play_count = song_play_count_q
        users[userID] = \
            song_play_count.set_index('songID')['play_count'].to_dict()
# Subset of most-liked items: keep only songs each user rated above 2.
users_subset = {}
for userID, songs in users.items():  # FIX: dict.iteritems() removed in Py3
    scores_above_threshold = {
        songID: score for songID, score in songs.items() if score > 2
    }
    users_subset[userID] = scores_above_threshold
# Earlier experiments with the threshold filter, kept for reference.
'''
for songID, score in songs.iteritems():
    print score > 0
    if score > 0:
        print {userID: {songID: score}}
{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
'''


# Fitness function for EDA
def Fitness(profile, user_subset):
    """Fitness of one candidate profile against one user's liked songs.

    Sums log10(score * manhattan(profile, item features)) over the user's
    liked songs; a SMALLER value means the profile is closer (in L1
    distance) to the items the user likes.
    """
    fitness_value = 0
    for songID, score in user_subset.items():  # FIX: Py3 items()
        fitness_value += log10(score * manhattan(profile, items[songID]))
    return fitness_value


# Given parameters for EDA
population_size = len(users_subset)
fraction_of_population = int(round(0.5 * population_size))

# Generation of M individuals uniformly in [0, 1); seeded for repeatability.
np.random.seed(len(users_subset))
# FIX: dict.values()[0] is not subscriptable in Py3 -- take any feature
# vector to obtain the profile dimensionality.
feature_dim = len(next(iter(items.values())))
M = np.random.rand(population_size, feature_dim)

# Assign one candidate profile (one row of M) per user.
profile = {}
for i, userID in enumerate(users_subset):
    # FIX: was M.tolist()[i], which re-converted the whole matrix each pass.
    profile[userID] = M[i].tolist()

# Compute fitness values
users_fitness = {}
for userID in profile:
    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
users_fitness_df = pd.DataFrame(
    list(users_fitness.items()), columns=["userID", "fitness"]
)

# Selection of best individuals based on fitness values. Ascending sort +
# head keeps the LOWEST fitness values, i.e. profiles closest to the liked
# items under the Manhattan-based Fitness above.
best_individuals = {}
# FIX: DataFrame.sort(columns=...) was removed in pandas 0.20;
# sort_values is the supported equivalent (same default ascending order).
users_fitness_df = users_fitness_df.sort_values('fitness')
M_sel = users_fitness_df.head(fraction_of_population)
M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
for userID in M_sel_dict:
    best_individuals[userID] = profile[userID]

# Calculate sample mean and standard deviation
np.random.seed(1)
# NOTE(review): sklearn.mixture.GMM was deprecated in scikit-learn 0.18 and
# removed in 0.20; the replacement is mixture.GaussianMixture (attribute
# names differ, e.g. covars_ -> covariances_) -- confirm installed version.
g = mixture.GMM(n_components=10)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
# Training data: 100 samples around 0 and 300 around 10 (two clear modes).
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
# REPL-style inspection of the fitted mixture; the rounded results are
# computed but not stored or printed.
np.round(g.weights_, 2)
np.round(g.means_, 2)
# NOTE(review): covars_ is the old sklearn.mixture.GMM attribute name; the
# modern GaussianMixture API calls it covariances_ -- confirm sklearn version.
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)
# Unused rating-similarity helpers (Pearson, cosine) and an older Fitness
# draft, disabled by being wrapped in a module-level string.
'''
# Pearson Correlation Coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
        sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator

# Cosine Similarity for test purposes
def cosine_similarity(rating1, rating2):
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
    if n == 0:
        return 0
    # now compute denominator
    for key in rating1:
        x = rating1[key]
        sum_x2 += pow(x, 2)
    for key in rating2:
        y = rating2[key]
        sum_y2 += pow(y, 2)
    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    else:
        return sum_xy / denominator

def Fitness(profile, user_index):
    sim = 0
    sum_log = 0
    features = profile.items()[user_index][1]
    songs = users.items()[user_index][1]
    for song, rating in songs.items():
        sim = pearson(features, items[song])
        print(sim)
    for username, songs in users.items():
        for song, rating in songs.items():
            sim = pearson(profile, items[song])
            #sum_log += log10(rating * sim)
    return sim
'''