comparison Code/eda.py @ 24:68a62ca32441
Organized python scripts
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
date | Sat, 15 Aug 2015 19:16:17 +0100 |
parents | 45e6f85d0ba4 |
children | fafc0b249a73 |
23:45e6f85d0ba4 | 24:68a62ca32441 |
---|---|
6 """ | 6 """ |
7 | 7 |
8 | 8 |
9 from math import sqrt, log10 | 9 from math import sqrt, log10 |
10 import numpy as np | 10 import numpy as np |
| 11 import pandas as pd |
11 from sklearn import mixture | 12 from sklearn import mixture |
| 13 |
| 14 #Fine tuning |
12 | 15 |
13 #User-item dictionary | 16 #User-item dictionary |
14 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, | 17 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, |
15 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, | 18 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, |
16 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5, | 19 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5, |
47 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1], | 50 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1], |
48 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1], | 51 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1], |
49 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], | 52 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], |
50 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} | 53 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} |
51 | 54 |
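Note: the script hinges on two parallel dictionaries: `users` maps each user to a `{songID: play count}` dict, and `items` maps each song ID to a 10-element feature vector. A minimal sketch of the invariant the distance code below relies on (the `_toy` names are ours; the values are copied from the dicts above):

```python
# users: user -> {songID: play count}; items: songID -> feature vector
users_toy = {"Angelica": {"SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5}}
items_toy = {
    "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
    "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
}

# Every rated song needs a feature vector, and all vectors must share a
# length, or the element-wise distance functions below get ragged input.
for songs in users_toy.values():
    for song_id in songs:
        assert len(items_toy[song_id]) == 10
```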
52 #Functions to compute similarity between items or between profiles | 55 # Functions to compute similarity between items or between profiles |
53 # Source: http://www.guidetodatamining.com | 56 # Source: http://www.guidetodatamining.com |
54 def manhattan(vector1, vector2): | 57 def manhattan(vector1, vector2): |
55 """Computes the Manhattan distance.""" | 58 """Computes the Manhattan distance.""" |
56 distance = 0 | 59 return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) |
57 total = 0 | 60 |
58 n = len(vector1) | 61 def nearestNeighbor(self, itemVector): |
59 for i in range(n): | 62 """return nearest neighbor to itemVector""" |
60 distance += abs(vector1[i] - vector2[i]) | 63 return min([( |
61 return distance | 64 self.manhattan(itemVector, item[1]), item) for item in self.data |
62 | 65 ]) |
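The refactor in this hunk collapses the accumulator loop into a one-liner; both compute the L1 (Manhattan) distance, the sum of |v1_i - v2_i| over coordinates. (The new `nearestNeighbor`/`classify` take `self` as if they were methods of the guidetodatamining classifier class, though no class appears in this hunk.) A quick self-contained check that the rewrite preserves behaviour:

```python
def manhattan_loop(vector1, vector2):
    # pre-commit version: explicit accumulator
    distance = 0
    for v1, v2 in zip(vector1, vector2):
        distance += abs(v1 - v2)
    return distance

def manhattan_map(vector1, vector2):
    # post-commit version: the same L1 distance as a single expression;
    # two-iterable map() works in both Python 2 and 3
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

a, b = [1, 4, 5, 3.5], [2, 2, 5, 1.5]
assert manhattan_loop(a, b) == manhattan_map(a, b) == 5.0
```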
63 def computeNearestNeighbor(itemName, itemVector, items): | 66 |
64 """creates a sorted list of items based on their distance to item""" | 67 def classify(self, itemVector): |
65 distances = [] | 68 """Return class we think item Vector is in""" |
66 for otherItem in items: | 69 return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) |
67 if otherItem != itemName: | 70 ''' |
68 distance = manhattan(itemVector, items[otherItem]) | 71 # Median |
69 distances.append((distance, otherItem)) | 72 # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python |
70 # sort based on distance -- closest first | 73 def get_median(lst): |
71 distances.sort() | 74 return np.median(np.array(lst)) |
72 return distances | 75 |
73 | 76 # Absolute Standard Deviation |
74 def classify(user, itemName, itemVector): | 77 def get_asd(lst, median): |
75 """Classify the itemName based on user ratings | 78 sum = 0 |
76 Should really have items and users as parameters""" | 79 for item in lst: |
77 # first find nearest neighbor | 80 sum += abs(item - median) |
78 nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1] | 81 return sum / len(lst) |
79 rating = users[user][nearest] | 82 |
80 return rating | 83 # Normalisation rating with Modified Standard Score |
81 | 84 def normalize_rating(ratings, median, asd): |
82 # Fitness function of EDA | 85 for i in range(len(ratings)): |
83 def Fitness(profile, user): | 86 ratings[i] = (ratings[i] - median) / asd |
84 nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1] | 87 return ratings |
85 rating = users[user][nearest] | 88 ''' |
86 return rating | 89 # Normalise user play count |
87 | 90 for userID in users: |
88 | 91 song_play_count = pd.DataFrame( |
| 92 users[userID].items(), |
| 93 columns=["songID", "play_count"] |
| 94 ) |
| 95 '''Coefficient of variation''' |
| 96 cv = song_play_count.play_count.std() / song_play_count.play_count.mean() |
| 97 #user_ratings = np.array(users[userID].values()) |
| 98 #cv = user_ratings.std()/user_ratings.mean() |
| 99 #print userID, cv |
| 100 if cv <= 0.5: |
| 101 for songID, play_count in users[userID].items(): |
| 102 users[userID][songID] = 3 |
| 103 else: |
| 104 song_play_count_q = pd.cut( |
| 105 song_play_count["play_count"], |
| 106 5, |
| 107 labels=False |
| 108 ) + 1 |
| 109 song_play_count.play_count = song_play_count_q |
| 110 users[userID] = song_play_count.set_index('songID')['play_count'].to_dict() |
| 111 #print song_play_count |
| 112 #median = get_median(user_ratings) |
| 113 #asd = get_asd(user_ratings, median) |
| 114 #for songID, play_count in users[userID].items(): |
| 115 #users[userID][songID] = (play_count - median) / asd |
| 116 |
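The block above turns raw play counts into 1-5 ratings: if the coefficient of variation (std/mean) of a user's counts is at most 0.5, the user gets a flat rating of 3 for every song; otherwise `pd.cut` slices the counts into five equal-width bins labelled 1-5. A standalone sketch of that rule (the function name and toy counts are ours):

```python
import pandas as pd

def quantise_play_counts(play_counts):
    """Mirror of the normalisation above: flat 3 for near-uniform
    listeners, otherwise five equal-width pd.cut bins labelled 1-5."""
    s = pd.Series(play_counts)
    cv = s.std() / s.mean()              # coefficient of variation
    if cv <= 0.5:
        return {song: 3 for song in play_counts}
    return (pd.cut(s, 5, labels=False) + 1).to_dict()

print(quantise_play_counts({"songA": 1, "songB": 12, "songC": 40}))
# e.g. {'songA': 1, 'songB': 2, 'songC': 5}
```

Because the bins are equal-width rather than quantile-based, one heavily played song can squash everything else into the lowest bin; `pd.qcut` would be the quantile alternative.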
| 117 # Subset of most-liked items |
| 118 users_subset = {} |
| 119 for userID, songs in users.iteritems(): |
| 120 scores_above_threshold = { |
| 121 songID: score for songID, score in songs.iteritems() if score > 2 |
| 122 } |
| 123 users_subset[userID]= scores_above_threshold |
| 124 ''' |
| 125 for songID, score in songs.iteritems(): |
| 126 print score >0 |
| 127 if score > 0: |
| 128 print {userID: {songID: score}} |
| 129 |
| 130 {k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0} |
| 131 ''' |
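`users_subset` keeps, per user, only the songs scored above 2 (the "most-liked" items the fitness function will look at). `dict.iteritems()` is Python 2; under Python 3 the same step reads:

```python
# Python 3 spelling of the subset step (iteritems() no longer exists)
users = {"u1": {"s1": 3, "s2": 1}, "u2": {"s3": 5}}   # toy data
users_subset = {
    user_id: {song: score for song, score in songs.items() if score > 2}
    for user_id, songs in users.items()
}
assert users_subset == {"u1": {"s1": 3}, "u2": {"s3": 5}}
```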
| 132 # Fitness function for EDA |
| 133 def Fitness(profile, user_subset): |
| 134 fitness_value = 0 |
| 135 for songID, score in user_subset.iteritems(): |
| 136 fitness_value += log10(score * manhattan(profile, items[songID])) |
| 137 return fitness_value |
| 138 |
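The EDA's fitness for a candidate profile sums log10(score × Manhattan distance) over that user's liked songs. As written it raises a math domain error whenever a profile coincides with an item vector (distance 0, so log10(0)); a guarded sketch, where the `eps` floor is our addition, not in the commit:

```python
from math import log10

def manhattan(v1, v2):
    return sum(abs(a - b) for a, b in zip(v1, v2))

def fitness(profile, user_subset, items, eps=1e-12):
    # eps keeps log10 defined when a profile lands exactly on an item
    total = 0.0
    for song_id, score in user_subset.items():
        total += log10(score * manhattan(profile, items[song_id]) + eps)
    return total

items = {"s1": [1.0, 2.0], "s2": [0.0, 5.0]}
print(fitness([0.5, 0.5], {"s1": 4, "s2": 3}, items))  # ~2.08
```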
| 139 # Given parameters for EDA |
| 140 population_size = len(users_subset) |
| 141 fraction_of_population = int(round(0.5 * population_size)) |
| 142 |
| 143 # Generation of M individuals uniformly |
| 144 np.random.seed(len(users_subset)) |
| 145 M = np.random.rand(population_size, len(items.values()[0])) |
| 146 #M.shape = (-1, len(items.values()[0])) |
| 147 profile = {} |
| 148 i = 0 |
| 149 for userID in users_subset: |
| 150 profile[userID] = M.tolist()[i] |
| 151 i += 1 |
| 152 |
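The initial population is drawn uniformly from [0, 1): one row per user, one column per item feature, and each row becomes that user's candidate taste profile. Note that `items.values()[0]` only works in Python 2 (`dict.values()` is not indexable in Python 3, where `next(iter(items.values()))` does the same job). A compact sketch with hypothetical user IDs:

```python
import numpy as np

user_ids = ["u1", "u2", "u3", "u4"]      # stand-ins for users_subset keys
n_features = 10                          # length of each item vector
population_size = len(user_ids)

np.random.seed(population_size)          # same seeding idea as the commit
M = np.random.rand(population_size, n_features)   # uniform on [0, 1)

# One candidate profile per user, as the loop above builds.
profile = {uid: row for uid, row in zip(user_ids, M.tolist())}
assert len(profile["u1"]) == n_features
```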
| 153 # Compute fitness values |
| 154 users_fitness = {} |
| 155 for userID in profile: |
| 156 users_fitness[userID] = Fitness(profile[userID], users_subset[userID]) |
| 157 users_fitness_df = pd.DataFrame( |
| 158 users_fitness.items(), |
| 159 columns=["userID", "fitness"] |
| 160 ) |
| 161 |
| 162 # Selection of best individuals based on fitness values |
| 163 best_individuals = {} |
| 164 users_fitness_df = users_fitness_df.sort(columns='fitness') |
| 165 M_sel = users_fitness_df.head(fraction_of_population) |
| 166 M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict() |
| 167 for userID in M_sel_dict: |
| 168 best_individuals[userID] = profile[userID] |
| 169 |
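Selection here is truncation selection: sort by fitness and keep the best half (`fraction_of_population`). Two notes: `DataFrame.sort(columns=...)` was current pandas in 2015 but has since been removed in favour of `sort_values`, and the ascending sort plus `head()` keeps the *smallest* fitness values, which under this distance-based score appears to mean the profiles closest to the liked items. A modern-pandas sketch with made-up fitness numbers:

```python
import pandas as pd

users_fitness = {"u1": 2.1, "u2": 0.7, "u3": 1.5, "u4": 3.0}  # toy values
df = pd.DataFrame(list(users_fitness.items()), columns=["userID", "fitness"])

df = df.sort_values("fitness")        # modern spelling of DataFrame.sort
survivors = df.head(len(df) // 2)     # truncation selection: keep best half
print(survivors["userID"].tolist())   # -> ['u2', 'u3']
```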
| 170 # Calculate sample mean and standard deviation |
| 171 np.random.seed(1) |
| 172 g = mixture.GMM(n_components=10) |
| 173 # Generate random observations with two modes centered on 0 |
| 174 # and 10 to use for training. |
| 175 obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) |
| 176 g.fit(obs) |
| 177 np.round(g.weights_, 2) |
| 178 np.round(g.means_, 2) |
| 179 np.round(g.covars_, 2) |
| 180 g.predict([[0], [2], [9], [10]]) |
| 181 np.round(g.score([[0], [2], [9], [10]]), 2) |
| 182 # Refit the model on new data (initial parameters remain the |
| 183 # same), this time with an even split between the two modes. |
| 184 g.fit(20 * [[0]] + 20 * [[10]]) |
| 185 np.round(g.weights_, 2) |
| 186 |
| 187 |
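This last block is the distribution-estimation step of the EDA, demonstrated on synthetic two-mode data rather than on the selected profiles (it is essentially the scikit-learn GMM documentation example). `sklearn.mixture.GMM` was deprecated in scikit-learn 0.18 and later removed; the current equivalent is `GaussianMixture`, whose attribute is `covariances_` rather than `covars_`. A sketch of the same experiment against the modern API; we also use `n_components=2`, matching the two modes, where the commit uses 10:

```python
import numpy as np
from sklearn.mixture import GaussianMixture  # replaces the removed mixture.GMM

np.random.seed(1)
# Two modes centred on 0 and 10, as in the synthetic data above.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))

g = GaussianMixture(n_components=2, random_state=1).fit(obs)
print(np.round(g.weights_, 2))           # mixing weights, near 0.25 / 0.75
print(np.round(g.means_, 2))             # component means, near 0 and 10
print(np.round(g.covariances_, 2))       # covariances_ replaces covars_
print(g.predict([[0], [2], [9], [10]]))  # hard assignments for new points
```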
| 188 ''' |
89 # Pearson Correlation Coefficient | 189 # Pearson Correlation Coefficient |
90 def pearson(rating1, rating2): | 190 def pearson(rating1, rating2): |
91 sum_xy = 0 | 191 sum_xy = 0 |
92 sum_x = 0 | 192 sum_x = 0 |
93 sum_y = 0 | 193 sum_y = 0 |
143 return 0 | 243 return 0 |
144 else: | 244 else: |
145 return sum_xy / denominator | 245 return sum_xy / denominator |
146 | 246 |
147 | 247 |
148 ''' | 248 |
149 def Fitness(profile, user_index): | 249 def Fitness(profile, user_index): |
150 sim = 0 | 250 sim = 0 |
151 sum_log = 0 | 251 sum_log = 0 |
152 | 252 |
153 features = profile.items()[user_index][1] | 253 features = profile.items()[user_index][1] |
161 for song, rating in songs.items(): | 261 for song, rating in songs.items(): |
162 sim = pearson(profile, items[song]) | 262 sim = pearson(profile, items[song]) |
163 #sum_log += log10(rating * sim) | 263 #sum_log += log10(rating * sim) |
164 return sim | 264 return sim |
165 ''' | 265 ''' |
166 # Generation of M individuals uniformly | 266 |
167 population_size = len(users) | 267 |
168 fraction_of_population = 0.5 | 268 |
169 np.random.seed(len(users)) | 269 |
170 M = np.random.uniform(size=population_size * len(items.values()[0])) | 270 |
171 M.shape = (-1, len(items.values()[0])) | 271 |
172 profile = {} | 272 |
173 i = 0 | 273 |
174 for row in M.tolist(): | 274 |
175 profile["Profile" + str(i)] = M.tolist()[i] | |
176 i = i + 1 | |
177 | |
178 ''' | |
179 Calculate fitness values | |
180 ''' | |
181 Fitness(profile, 0) | |
182 | |
183 | |
184 | |
185 | |
186 | |
187 | |
188 np.random.seed(1) | |
189 g = mixture.GMM(n_components=7) | |
190 # Generate random observations with two modes centered on 0 | |
191 # and 10 to use for training. | |
192 obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) | |
193 g.fit(obs) | |
194 np.round(g.weights_, 2) | |
195 np.round(g.means_, 2) | |
196 np.round(g.covars_, 2) | |
197 g.predict([[0], [2], [9], [10]]) | |
198 np.round(g.score([[0], [2], [9], [10]]), 2) | |
199 # Refit the model on new data (initial parameters remain the | |
200 # same), this time with an even split between the two modes. | |
201 g.fit(20 * [[0]] + 20 * [[10]]) | |
202 np.round(g.weights_, 2) |