p@15: # -*- coding: utf-8 -*- p@15: """ p@15: Created on Wed Jul 22 17:42:09 2015 p@15: p@15: @author: paulochiliguano p@15: """ p@15: p@16: p@25: from math import log, sqrt p@15: import numpy as np p@24: import pandas as pd p@25: import cPickle as pickle p@25: #import random p@15: p@25: # Item-vector dictionary p@25: f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ p@25: genre_classification/genre_prob.pkl', 'rb') p@25: song_library = pickle.load(f) p@25: f.close() p@24: p@25: # Load training and test data p@25: f = file('/Users/paulochiliguano/Documents/msc-project/dataset/\ p@25: cross_validation.pkl', 'rb') p@25: users_train, users_test = pickle.load(f) p@25: f.close() p@25: p@25: # Cosine Similarity p@25: def cosine_similarity(vector1, vector2): p@25: dot_product = sum(map(lambda x, y: x * y, vector1, vector2)) p@25: length_x = sqrt(sum(map(lambda x: x ** 2, vector1))) p@25: length_y = sqrt(sum(map(lambda y: y ** 2, vector2))) p@25: return dot_product / (length_x * length_y) p@25: p@25: # Adjusted Cosine Similarity p@25: def adj_cos_sim(vector_i, vector_j): p@25: avrg_w_i = (float(sum(vector_i)) / len(vector_i)) p@25: avrg_w_j = (float(sum(vector_j)) / len(vector_j)) p@25: num = sum(map( p@25: lambda w_i, w_j: (w_i - avrg_w_i) * (w_j - avrg_w_j), p@25: vector_i, p@25: vector_j) p@25: ) p@25: dem1 = sum(map(lambda w_i: (w_i - avrg_w_i) ** 2, vector_i)) p@25: dem2 = sum(map(lambda w_j: (w_j - avrg_w_j) ** 2, vector_j)) p@25: return num / (sqrt(dem1) * sqrt(dem2)) p@25: p@25: # Fitness function for EDA p@25: def Fitness(profile_u, user_subset): p@25: fitness_value = 0 p@25: for songID, score in user_subset.iteritems(): p@25: #print cosine_similarity(profile, song_library[songID]) p@25: sim = cosine_similarity(profile_u, song_library[songID]) p@25: if sim <= 0: p@25: fitness_value += -708 p@25: #math.log(sys.float_info.min) p@25: else: p@25: fitness_value += log(score * sim) p@25: #fitness_value += log(score * manhattan(profile, song_library[songID])) p@25: #fitness_value += score * cosine_similarity(profile, song_library[songID]) p@25: return fitness_value p@25: p@25: def users_likes_subset(users, rating_threshold=3): p@25: # Subset of most-liked items p@25: users_subset = {} p@25: for userID, songs in users.iteritems(): p@25: scores_above_threshold = { p@25: songID: score for songID, score in songs.iteritems() if score > rating_threshold p@15: } p@25: users_subset[userID]= scores_above_threshold p@25: p@25: #for songID, score in songs.iteritems(): p@25: #print score >0 p@25: #if score > 0: p@25: #print {userID: {songID: score}} p@15: p@25: #{k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0} p@25: p@25: return users_subset p@15: p@25: def eda_train(users_subset, max_gen=200): p@25: # TRAINING p@25: num_features = len(song_library.values()[0]) p@25: # Given parameters for EDA p@25: population_size = len(users_subset) p@25: fraction_of_population = int(round(0.5 * population_size)) p@23: p@25: # Generation of M individuals uniformly p@25: np.random.seed(12345) p@25: M = np.random.uniform( p@25: 0, p@25: 1, p@25: population_size * num_features p@25: ) p@25: M.shape = (-1, num_features) p@25: profile_u = {} p@25: i = 0 p@25: for userID in users_subset: p@25: profile_u[userID] = M.tolist()[i] p@25: i += 1 p@23: p@25: generation = 0 p@25: while generation < max_gen: p@25: # Compute fitness values p@25: users_fitness = {} p@25: for userID in profile_u: p@25: users_fitness[userID] = Fitness( p@25: profile_u[userID], p@25: users_subset[userID] p@25: ) p@25: users_fitness_df = pd.DataFrame( p@25: users_fitness.items(), p@25: columns=["userID", "fitness"] p@25: ) p@25: p@25: # Selection of best individuals based on fitness values p@25: best_individuals = {} p@25: users_fitness_df = users_fitness_df.sort(columns='fitness') p@25: M_sel = users_fitness_df.tail(fraction_of_population) p@25: M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict() p@25: for userID in M_sel_dict: p@25: best_individuals[userID] = profile_u[userID] p@25: p@25: # Calculate sample mean and standard deviation p@25: D = np.array([]) p@25: for userID, features in best_individuals.iteritems(): p@25: D = np.append(D, features, axis=0) p@25: D.shape = (-1, num_features) p@25: D_mu = np.mean(D, axis=0) p@25: D_sigma = np.std(D, axis=0, ddof=1) p@25: p@25: # Sample M individuals p@25: M = np.random.normal( p@25: D_mu, p@25: D_sigma, p@25: (population_size, num_features) p@25: ) p@25: #M = 1 / (D_sigma * np.sqrt(2 * np.pi)) * np.exp(- (M_range - D_mu) ** 2 / (2 * D_sigma ** 2)) p@25: p@25: #M.shape = (-1, len(items.values()[0])) p@25: #M = D_sigma * np.random.normal( p@25: #population_size, p@25: #len(items.values()[0]) p@25: #) + D_mu p@25: profile_u = {} p@25: i = 0 p@25: for userID in users_subset: p@25: profile_u[userID] = M.tolist()[i] p@25: i += 1 p@25: generation += 1 p@25: p@25: return profile_u p@25: p@25: # Similarity matrix p@25: def cb_similarity(profileID, profile_data, test_data, N): p@25: ''' Content-based: Similarity matrix ''' p@25: similarity = [] p@25: for songID in test_data[profileID]: p@25: sim = adj_cos_sim(profile_data, song_library[songID]) p@25: similarity.append((sim, songID)) p@25: # Top-N recommendation p@25: #similarity.sort(reverse=True) p@25: #if len(similarity) > N: p@25: #similarity = similarity[0:N] p@25: p@25: #sim_matrix[userID] = {t[1]: t[0] for t in similarity} p@25: return {t[1]: t[0] for t in similarity} p@25: p@25: def evaluate_eda( p@25: profiles, p@25: test_data, p@25: N=10, p@25: rating_threshold=3, p@25: EDA_treshold=0.3): p@25: p@25: ''' Evaluation ''' p@25: p@25: sim_matrix = {} p@25: for userID, features in profiles.iteritems(): p@25: sim_matrix[userID] = cb_similarity(userID, features, test_data, N) p@25: p@25: # Content-Based: Evaluation p@25: tp = 0. p@25: fp = 0. p@25: fn = 0. p@25: tn = 0. p@25: for userID, songID_sim in sim_matrix.iteritems(): p@25: for songID, sim_value in songID_sim.iteritems(): p@25: score = test_data[userID][songID] p@25: if score > rating_threshold and sim_value >= EDA_treshold: p@25: tp += 1 p@25: elif score <= rating_threshold and sim_value >= EDA_treshold: p@25: fp += 1 p@25: elif score > rating_threshold and sim_value < EDA_treshold: p@25: fn += 1 p@25: elif score <= rating_threshold and sim_value < EDA_treshold: p@25: tn += 1 p@25: p@25: precision = tp / (tp + fp) p@25: recall = tp / (tp + fn) p@25: F1 = 2 * precision * recall / (precision + recall) p@25: accuracy = (tp + tn) / (tp + fp + tn + fn) p@25: p@25: return precision, recall, F1, accuracy p@25: p@25: #keys_a = set(users[userID].keys()) p@25: #keys_b = set(test_data.keys()) p@25: #intersection = keys_a & keys_b p@25: #if len(intersection) != 0: p@25: #similarity = {} p@25: #print {k: v for k,v in song_library_fold[0].iteritems() if k in songs} p@25: #for songID in intersection: p@25: #if songID == k: p@25: #similarity[songID] = adj_cos_sim( p@25: #profile[userID], p@25: #test_data[songID] p@25: #) p@25: #max_sim = max(similarity, key=similarity.get) p@25: #if max_sim >= EDA_treshold: p@25: #sim_matrix[userID] = {max_sim: similarity[max_sim]} p@25: #sim_matrix[userID] = similarity p@25: #sim_matrix[userID] = {max_sim: similarity[max_sim]} p@25: p@25: #print len(sim_matrix) p@25: p = np.array([]) p@25: f = np.array([]) p@25: r = np.array([]) p@25: a = np.array([]) p@25: p@25: for i in range(len(users_train)): p@25: p@25: profile_u = eda_train(users_likes_subset(users_train[i])) p@25: pi, ri, fi, ai = evaluate_eda(profile_u, users_test[i]) p@25: p = np.append(p, pi) p@25: r = np.append(r, ri) p@25: f = np.append(f, fi) p@25: a = np.append(a, ai) p@25: p@25: #precision = np.array(p) p@25: #rec = np.array(r) p@25: #F1 = np.array(f) p@25: #accuracy = np.array(a) p@25: p@25: print "Precision = %f3 ± %f3" % (p.mean(), p.std()) p@25: print "Recall = %f3 ± %f3" % (r.mean(), r.std()) p@25: print "F1 = %f3 ± %f3" % (f.mean(), f.std()) p@25: print "Accuracy = %f3 ± %f3" % (a.mean(), a.std()) p@25: p@25: '''# Collaborative-filtering: Similarity matrix p@25: sim_matrix_cf = {} p@25: count = 0 p@25: for userID_1 in profile: p@25: similarities = {} p@25: for userID_2 in profile: p@25: if userID_1 != userID_2: p@25: similarities[userID_2] = adj_cos_sim( p@25: profile[userID_1], p@25: profile[userID_2] p@25: ) p@25: #print similarities p@25: sim_matrix_cf[userID_1] = similarities''' p@25: p@25: # Predicted rating p@25: #for userID in users: p@25: # print np.array(users[userID].values()).mean() p@25: p@25: '''scores_above_threshold = { p@25: songID: score for songID, score in songs.iteritems() if score > rating_threshold p@25: }''' p@25: p@25: '''for key, value in sorted(similarity.iteritems(), key=lambda (k,v): (v,k), reverse=True): p@25: print "%s: %s" % (key, value) p@25: break''' p@25: p@25: p@25: # Recommend new item p@25: p@25: ''' p@25: def computeNearestNeighbor(itemName, itemVector, items): p@25: """creates a sorted list of items based on their distance to item""" p@25: distances = [] p@25: for otherItem in items: p@25: if otherItem != itemName: p@25: distance = adj_cos_sim(itemVector, items[otherItem]) p@25: distances.append((distance, otherItem)) p@25: # sort based on distance -- closest first p@25: distances.sort(reverse=True) p@25: return distances p@25: p@25: def classify(user, itemName, itemVector): p@25: """Classify the itemName based on user ratings p@25: Should really have items and users as parameters""" p@25: # first find nearest neighbor p@25: nearest = computeNearestNeighbor(itemName, itemVector, song_library)[0][1] p@25: rating = users[user][nearest] p@25: return rating p@25: ''' p@25: # Source: guidetodatamining.com p@25: '''def computeSimilarity(band1, band2, userRatings): p@25: averages = {} p@25: for (key, ratings) in userRatings.items(): p@25: averages[key] = (float(sum(ratings.values())) / len(ratings.values())) p@25: num = 0 # numerator p@25: dem1 = 0 # first half of denominator p@25: dem2 = 0 p@25: for (user, ratings) in userRatings.items(): p@25: if band1 in ratings and band2 in ratings: p@25: avg = averages[user] p@25: num += (ratings[band1] - avg) * (ratings[band2] - avg) p@25: dem1 += (ratings[band1] - avg)**2 p@25: dem2 += (ratings[band2] - avg)**2 p@25: return num / (sqrt(dem1) * sqrt(dem2))''' p@25: p@25: ''' p@25: sum_xy = 0 p@25: sum_x2 = 0 p@25: sum_y2 = 0 p@25: n = 0 p@25: for key in rating1: p@25: if key in rating2: p@25: n += 1 p@25: x = rating1[key] p@25: y = rating2[key] p@25: sum_xy += x * y p@25: if n == 0: p@25: return 0 p@25: p@25: # now compute denominator p@25: for key in rating1: p@25: x = rating1[key] p@25: sum_x2 += pow(x, 2) p@25: p@25: for key in rating2: p@25: y = rating2[key] p@25: sum_y2 += pow(y, 2) p@25: p@25: denominator = sqrt(sum_x2) * sqrt(sum_y2) p@25: if denominator == 0: p@25: return 0 p@25: else: p@25: return sum_xy / denominator''' p@25: p@24: ''' p@24: # Median p@24: # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python p@24: def get_median(lst): p@24: return np.median(np.array(lst)) p@23: p@24: # Absolute Standard Deviation p@24: def get_asd(lst, median): p@24: sum = 0 p@24: for item in lst: p@24: sum += abs(item - median) p@24: return sum / len(lst) p@23: p@24: # Normalisation rating with Modified Standard Score p@24: def normalize_rating(ratings, median, asd): p@24: for i in range(len(ratings)): p@24: ratings[i] = (ratings[i] - median) / asd p@24: return ratings p@24: ''' p@24: p@24: ''' p@17: # Pearson Correlation Coefficient p@17: def pearson(rating1, rating2): p@17: sum_xy = 0 p@17: sum_x = 0 p@17: sum_y = 0 p@17: sum_x2 = 0 p@17: sum_y2 = 0 p@17: n = 0 p@17: for key in rating1: p@17: if key in rating2: p@17: n += 1 p@17: x = rating1[key] p@17: y = rating2[key] p@17: sum_xy += x * y p@17: sum_x += x p@17: sum_y += y p@17: sum_x2 += pow(x, 2) p@17: sum_y2 += pow(y, 2) p@17: if n == 0: p@17: return 0 p@17: # now compute denominator p@17: denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \ p@17: sqrt(sum_y2 - pow(sum_y, 2) / n) p@17: if denominator == 0: p@17: return 0 p@17: else: p@17: return (sum_xy - (sum_x * sum_y) / n) / denominator p@25: ''' p@25: ''' p@25: def buckets(filename, bucketName, separator, classColumn): p@25: """the original data is in the file named filename p@25: bucketName is the prefix for all the bucket names p@25: separator is the character that divides the columns p@25: (for ex., a tab or comma and classColumn is the column p@25: that indicates the class""" p@25: # put the data in 10 buckets p@25: numberOfBuckets = 10 p@25: data = {} p@25: # first read in the data and divide by category p@25: with open(filename) as f: p@25: lines = f.readlines() p@25: for line in lines: p@25: if separator != '\t': p@25: line = line.replace(separator, '\t') p@25: # first get the category p@25: category = line.split()[classColumn] p@25: data.setdefault(category, []) p@25: data[category].append(line) p@25: # initialize the buckets p@25: buckets = [] p@25: for i in range(numberOfBuckets): p@25: buckets.append([]) p@25: # now for each category put the data into the buckets p@25: for k in song_library.keys(): p@25: #randomize order of instances for each class p@25: print random.shuffle(song_library[k]) p@25: bNum = 0 p@25: # divide into buckets p@25: for item in song_library[k]: p@25: buckets[bNum].append(item) p@25: bNum = (bNum + 1) % numberOfBuckets p@25: # write to file p@25: for bNum in range(numberOfBuckets): p@25: f = open("%s-%02i" % (bucketName, bNum + 1), 'w') p@25: for item in buckets[bNum]: p@25: f.write(item) p@25: f.close() p@17: ''' p@15: p@25: '''# Functions to compute similarity between items or between profiles p@25: # Source: http://www.guidetodatamining.com p@25: def manhattan(vector1, vector2): p@25: """Computes the Manhattan distance.""" p@25: return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))'''