annotate Code/eda.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
parents 45e6f85d0ba4
children fafc0b249a73

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""


from math import sqrt, log10
import numpy as np
import pandas as pd
from sklearn import mixture

# Fine tuning

# User-item dictionary
users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                      "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
                      "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
                      "SOAJZEP12A8C14379B": 2.0},
         "Bill": {"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5,
                  "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0,
                  "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0},
         "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0,
                  "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0,
                  "SOAGTJW12A6701F1F5": 5.0, "SOAKWCK12A8C139F81": 1.0},
         "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0,
                 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0,
                 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                 "SOAJZEP12A8C14379B": 2.0},
         "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0,
                    "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 1.0},
         "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0,
                    "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0,
                    "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 4.0},
         "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0,
                 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0,
                 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0},
         "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0,
                      "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5,
                      "SOAKNZI12A58A79CAC": 3.0}
         }

# Item feature vectors: one 10-dimensional vector per song
items = {"SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
         "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
         "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
         "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
         "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
         "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
         "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
         "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]}

# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Computes the Manhattan distance."""
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

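# Quick worked example: the Manhattan distance between the first two item
# vectors above is
# manhattan(items["SOAJJPC12AB017D63F"], items["SOAKIXJ12AC3DF7152"])
# = 0.5 + 1 + 1.5 + 0 + 3 + 3 + 0 + 0 + 0 + 0 = 9.0
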
def nearestNeighbor(itemVector, data):
    """Return the nearest neighbour to itemVector.

    Adapted from the class-based classifier at guidetodatamining.com:
    the original method took self and searched self.data, a list of
    (label, vector) tuples; here the data list is passed explicitly.
    """
    return min([(manhattan(itemVector, item[1]), item) for item in data])

def classify(itemVector, data):
    """Return the class we think itemVector is in.

    The original version also normalised itemVector first with a
    normalizeVector helper that is not defined in this script.
    """
    return nearestNeighbor(itemVector, data)[1][0]
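
# Hypothetical usage sketch for the two helpers above (the labels and the
# data list are illustrative only, not part of the original script):
#data = [("rock", items["SOAJJPC12AB017D63F"]),
#        ("jazz", items["SOAKIXJ12AC3DF7152"])]
#classify(items["SOAKPFH12A8C13BA4A"], data)  # -> "jazz" (distance 5.0 vs 8.0)
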
'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute Standard Deviation
def get_asd(lst, median):
    sum = 0
    for item in lst:
        sum += abs(item - median)
    return sum / len(lst)

# Rating normalisation with the Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''
# Normalise user play counts to a 1-5 rating scale
for userID in users:
    song_play_count = pd.DataFrame(
        users[userID].items(),
        columns=["songID", "play_count"]
    )
    # Coefficient of variation: low dispersion means the user plays
    # every song about equally often
    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
    #user_ratings = np.array(users[userID].values())
    #cv = user_ratings.std()/user_ratings.mean()
    #print userID, cv
    if cv <= 0.5:
        # Low variation: assign a constant rating of 3 to every song
        for songID, play_count in users[userID].items():
            users[userID][songID] = 3
    else:
        # Otherwise quantise the play counts into five equal-width bins
        song_play_count_q = pd.cut(
            song_play_count["play_count"],
            5,
            labels=False
        ) + 1
        song_play_count.play_count = song_play_count_q
        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
        #print song_play_count
        #median = get_median(user_ratings)
        #asd = get_asd(user_ratings, median)
        #for songID, play_count in users[userID].items():
        #    users[userID][songID] = (play_count - median) / asd

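# Illustrative check of the quantisation above: pd.cut splits the play-count
# range into 5 equal-width bins, so e.g.
# (pd.cut(pd.Series([1, 20, 40, 60, 100]), 5, labels=False) + 1).tolist()
# -> [1, 1, 2, 3, 5]
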
# Subset of most-liked items: keep only songs rated above 2
users_subset = {}
for userID, songs in users.iteritems():
    scores_above_threshold = {
        songID: score for songID, score in songs.iteritems() if score > 2
    }
    users_subset[userID] = scores_above_threshold
'''
for songID, score in songs.iteritems():
    print score > 0
    if score > 0:
        print {userID: {songID: score}}

{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
'''
# Fitness function for EDA: sum of log10(rating * distance) over the
# user's most-liked items (note: log10 fails if a distance is exactly 0)
def Fitness(profile, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        fitness_value += log10(score * manhattan(profile, items[songID]))
    return fitness_value

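# Worked example with hypothetical numbers: a song rated 4 whose item vector
# lies at Manhattan distance 9 from the profile contributes
# log10(4 * 9) = log10(36) ~= 1.56 to the fitness value.
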
# Given parameters for EDA
population_size = len(users_subset)
fraction_of_population = int(round(0.5 * population_size))

# Generation of M individuals drawn uniformly from [0, 1): one row per
# user, one column per item feature, i.e. shape (8, 10) here
np.random.seed(len(users_subset))
M = np.random.rand(population_size, len(items.values()[0]))
#M.shape = (-1, len(items.values()[0]))
profile = {}
i = 0
for userID in users_subset:
    profile[userID] = M.tolist()[i]
    i += 1

# Compute fitness values
users_fitness = {}
for userID in profile:
    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
users_fitness_df = pd.DataFrame(
    users_fitness.items(),
    columns=["userID", "fitness"]
)

# Selection of the best individuals based on fitness values (the sort is
# ascending, so the head holds the lowest-fitness individuals)
best_individuals = {}
users_fitness_df = users_fitness_df.sort(columns='fitness')
M_sel = users_fitness_df.head(fraction_of_population)
M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
for userID in M_sel_dict:
    best_individuals[userID] = profile[userID]

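# A minimal sketch of the classic EDA update step (variable names are
# illustrative; this step is not wired into the original flow): compute the
# per-feature sample mean and standard deviation of the selected individuals
# and draw the next generation from the resulting Gaussians.
#M_best = np.array(best_individuals.values())
#mu, sigma = M_best.mean(axis=0), M_best.std(axis=0)
#M_next = np.random.normal(mu, sigma, size=(population_size, M_best.shape[1]))
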
# Estimate the distribution of the selected individuals. The block below is
# adapted from the scikit-learn (0.16-era) GMM documentation example and is
# kept here as a reference for the mixture-based approach.
np.random.seed(1)
g = mixture.GMM(n_components=10)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
np.round(g.weights_, 2)
np.round(g.means_, 2)
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)
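
# A hedged sketch of applying the same mixture API to the selected profiles
# (an assumption about the intended next step, not in the original script):
#g_profiles = mixture.GMM(n_components=2)
#g_profiles.fit(np.array(best_individuals.values()))
#next_generation = g_profiles.sample(population_size)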

'''
# Pearson Correlation Coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
                  sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator

# Cosine Similarity for test purposes
def cosine_similarity(rating1, rating2):
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
    if n == 0:
        return 0

    # now compute denominator
    for key in rating1:
        x = rating1[key]
        sum_x2 += pow(x, 2)

    for key in rating2:
        y = rating2[key]
        sum_y2 += pow(y, 2)

    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    else:
        return sum_xy / denominator


def Fitness(profile, user_index):
    sim = 0
    sum_log = 0

    features = profile.items()[user_index][1]
    songs = users.items()[user_index][1]

    for song, rating in songs.items():
        sim = pearson(features, items[song])
        print(sim)

    for username, songs in users.items():
        for song, rating in songs.items():
            sim = pearson(profile, items[song])
            #sum_log += log10(rating * sim)
    return sim
'''