view Code/eda.py @ 25:fafc0b249a73

Final code
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sun, 23 Aug 2015 16:47:54 +0100
parents 68a62ca32441
children e4bcfe00abf4
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

Learn per-user profile vectors with an Estimation of Distribution
Algorithm (EDA) and evaluate content-based recommendation
(precision, recall, F1, accuracy) over cross-validation folds.

@author: paulochiliguano
"""


from math import log, sqrt
import numpy as np
import pandas as pd
import cPickle as pickle

# Item-vector dictionary: songID -> feature vector (genre probabilities)
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'genre_classification/genre_prob.pkl', 'rb') as f:
    song_library = pickle.load(f)

# Load training and test data (one entry per cross-validation fold)
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)
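
# Assumed data shapes (inferred from usage below): users_train[i] and
# users_test[i] are dicts of the form {userID: {songID: score}}, one
# pair per fold, and each song_library[songID] is a fixed-length list
# of floats.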

# Cosine Similarity
def cosine_similarity(vector1, vector2):
    '''Cosine of the angle between two vectors; undefined (division by
    zero) if either vector has zero length.'''
    dot_product = sum(map(lambda x, y: x * y, vector1, vector2))
    length_x = sqrt(sum(map(lambda x: x ** 2, vector1)))
    length_y = sqrt(sum(map(lambda y: y ** 2, vector2)))
    return dot_product / (length_x * length_y)
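
# Worked example: cosine_similarity([1, 0], [1, 1]) = 1 / sqrt(2)
# ≈ 0.7071; orthogonal vectors give 0, parallel vectors give 1.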

# Adjusted Cosine Similarity
def adj_cos_sim(vector_i, vector_j):
    '''Cosine similarity after mean-centring each vector, so a constant
    offset in either vector does not inflate the score.'''
    avrg_w_i = float(sum(vector_i)) / len(vector_i)
    avrg_w_j = float(sum(vector_j)) / len(vector_j)
    num = sum(map(
        lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j),
        vector_i,
        vector_j)
    )
    dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i))
    dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j))
    return num / (sqrt(dem1) * sqrt(dem2))
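
# Worked example: adj_cos_sim([1, 2, 3], [2, 4, 6]) = 1.0, because both
# vectors deviate from their own means in the same direction at every
# position. It is undefined for constant vectors (zero deviations).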

# Fitness function for EDA: log-likelihood of a profile given the
# user's liked songs, scoring each song by rating * cosine similarity
def Fitness(profile_u, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        sim = cosine_similarity(profile_u, song_library[songID])
        if sim <= 0:
            # Floor at log(sys.float_info.min) ≈ -708 so a non-positive
            # similarity cannot crash log()
            fitness_value += -708
        else:
            fitness_value += log(score * sim)
    return fitness_value
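
# Worked example (toy values): two liked songs with score 4 and
# similarities 0.5 and 0.25 give Fitness = log(2) + log(1) ≈ 0.693;
# profiles closer to a user's liked songs score higher.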

def users_likes_subset(users, rating_threshold=3):
    """Keep, per user, only the songs rated above rating_threshold."""
    users_subset = {}
    for userID, songs in users.iteritems():
        users_subset[userID] = {
            songID: score
            for songID, score in songs.iteritems()
            if score > rating_threshold
        }
    return users_subset
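
# Worked example: users_likes_subset({'u1': {'s1': 5, 's2': 2}})
# returns {'u1': {'s1': 5}} with the default threshold of 3.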

def eda_train(users_subset, max_gen=200):
    """Learn one profile vector per user with an Estimation of
    Distribution Algorithm: score the population by fitness, keep the
    best half, refit a Gaussian and resample, for max_gen generations."""
    num_features = len(song_library.values()[0])
    # Given parameters for EDA
    population_size = len(users_subset)
    fraction_of_population = int(round(0.5 * population_size))

    # Initial population: M individuals (one per user) drawn uniformly
    # from [0, 1) in each feature
    np.random.seed(12345)
    M = np.random.uniform(0, 1, population_size * num_features)
    M.shape = (-1, num_features)
    profile_u = {}
    for i, userID in enumerate(users_subset):
        profile_u[userID] = M.tolist()[i]

    generation = 0
    while generation < max_gen:
        # Compute fitness values
        users_fitness = {}
        for userID in profile_u:
            users_fitness[userID] = Fitness(
                profile_u[userID],
                users_subset[userID]
            )
        users_fitness_df = pd.DataFrame(
            users_fitness.items(),
            columns=["userID", "fitness"]
        )
    
        # Truncation selection: sort ascending by fitness, so the tail
        # holds the best half of the population
        best_individuals = {}
        users_fitness_df = users_fitness_df.sort(columns='fitness')
        M_sel = users_fitness_df.tail(fraction_of_population)
        M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
        for userID in M_sel_dict:
            best_individuals[userID] = profile_u[userID]
        
        # Fit a Gaussian to the selected individuals: per-feature
        # sample mean and unbiased (ddof=1) standard deviation
        D = np.array(best_individuals.values())
        D_mu = np.mean(D, axis=0)
        D_sigma = np.std(D, axis=0, ddof=1)
    
        # Resample the whole population from the fitted Gaussian
        M = np.random.normal(D_mu, D_sigma, (population_size, num_features))
        profile_u = {}
        for i, userID in enumerate(users_subset):
            profile_u[userID] = M.tolist()[i]
        generation += 1
    
    return profile_u
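
# Usage sketch (toy identifiers, assuming 's1' and 's2' exist in
# song_library): eda_train({'u1': {'s1': 5}, 'u2': {'s2': 4}}, max_gen=10)
# returns one learnt feature vector per user, e.g. {'u1': [...], ...}.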

# Similarity matrix
def cb_similarity(profileID, profile_data, test_data, N):
    '''Content-based: adjusted-cosine similarity between one user
    profile and every song that user rated in the test fold. N is kept
    for a Top-N cut-off that is currently disabled.'''
    similarity = []
    for songID in test_data[profileID]:
        sim = adj_cos_sim(profile_data, song_library[songID])
        similarity.append((sim, songID))
    # Top-N recommendation (disabled)
    #similarity.sort(reverse=True)
    #if len(similarity) > N:
        #similarity = similarity[0:N]
    return {t[1]: t[0] for t in similarity}
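
# Usage sketch (toy identifiers): cb_similarity('u1', profiles['u1'],
# users_test[0], N=10) returns {songID: similarity} for every song that
# 'u1' rated in that fold; adjusted-cosine values lie in [-1, 1].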

def evaluate_eda(
        profiles,
        test_data,
        N=10,
        rating_threshold=3,
        EDA_threshold=0.3):
    '''Evaluation: a song counts as relevant if its rating exceeds
    rating_threshold, and as recommended if its profile similarity is
    at least EDA_threshold; the confusion-matrix counts follow.'''

    sim_matrix = {}
    for userID, features in profiles.iteritems():
        sim_matrix[userID] = cb_similarity(userID, features, test_data, N)

    # Content-Based: Evaluation
    tp = 0.
    fp = 0.
    fn = 0.
    tn = 0.
    for userID, songID_sim in sim_matrix.iteritems():
        for songID, sim_value in songID_sim.iteritems():
            score = test_data[userID][songID]
            if score > rating_threshold and sim_value >= EDA_threshold:
                tp += 1
            elif score <= rating_threshold and sim_value >= EDA_threshold:
                fp += 1
            elif score > rating_threshold and sim_value < EDA_threshold:
                fn += 1
            else:
                tn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + tn + fn)

    return precision, recall, F1, accuracy
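
# Worked example (toy counts, not from the dataset): tp=6, fp=2, fn=3,
# tn=9 gives precision = 6/8 = 0.75, recall = 6/9 ≈ 0.667, F1 ≈ 0.706
# and accuracy = 15/20 = 0.75.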

# Cross-validated evaluation: train profiles on each fold and score
# them on the matching held-out fold
p = np.array([])
r = np.array([])
f = np.array([])
a = np.array([])

for i in range(len(users_train)):
    profile_u = eda_train(users_likes_subset(users_train[i]))
    pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i])
    p = np.append(p, pi)
    r = np.append(r, ri)
    f = np.append(f, fi)
    a = np.append(a, ai)


print "Precision = %f3 ± %f3" % (p.mean(), p.std())
print "Recall = %f3 ± %f3" % (r.mean(), r.std())
print "F1 = %f3 ± %f3" % (f.mean(), f.std())
print "Accuracy = %f3 ± %f3" % (a.mean(), a.std())