annotate Code/eda.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
parents 45e6f85d0ba4
children fafc0b249a73

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""


from math import sqrt, log10
import numpy as np
import pandas as pd
from sklearn import mixture

# Fine tuning

# User-item dictionary
users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                      "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
                      "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
                      "SOAJZEP12A8C14379B": 2.0},
         "Bill": {"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5,
                  "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0,
                  "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0},
         "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0,
                  "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0,
                  "SOAGTJW12A6701F1F5": 5.0, "SOAKWCK12A8C139F81": 1.0},
         "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0,
                 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0,
                 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                 "SOAJZEP12A8C14379B": 2.0},
         "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0,
                    "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 1.0},
         "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0,
                    "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0,
                    "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 4.0},
         "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0,
                 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0,
                 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0},
         "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0,
                      "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5,
                      "SOAKNZI12A58A79CAC": 3.0}
         }

# Item feature vectors: one 10-dimensional vector per song
items = {"SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
         "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
         "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
         "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
         "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
         "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
         "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
         "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]}

# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Computes the Manhattan distance."""
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

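# Quick worked example: the Manhattan distance between the first two item
# vectors above is
# manhattan(items["SOAJJPC12AB017D63F"], items["SOAKIXJ12AC3DF7152"])
# = 0.5 + 1 + 1.5 + 0 + 3 + 3 + 0 + 0 + 0 + 0 = 9.0
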
def nearestNeighbor(itemVector, data):
    """Return the nearest neighbour to itemVector.

    Adapted from the class-based classifier at guidetodatamining.com:
    the original method took self and searched self.data, a list of
    (label, vector) tuples; here the data list is passed explicitly.
    """
    return min([(manhattan(itemVector, item[1]), item) for item in data])

def classify(itemVector, data):
    """Return the class we think itemVector is in.

    The original version also normalised itemVector first with a
    normalizeVector helper that is not defined in this script.
    """
    return nearestNeighbor(itemVector, data)[1][0]
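
# Hypothetical usage sketch for the two helpers above (the labels and the
# data list are illustrative only, not part of the original script):
#data = [("rock", items["SOAJJPC12AB017D63F"]),
#        ("jazz", items["SOAKIXJ12AC3DF7152"])]
#classify(items["SOAKPFH12A8C13BA4A"], data)  # -> "jazz" (distance 5.0 vs 8.0)
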
'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute Standard Deviation
def get_asd(lst, median):
    sum = 0
    for item in lst:
        sum += abs(item - median)
    return sum / len(lst)

# Rating normalisation with the Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''
# Normalise user play counts to a 1-5 rating scale
for userID in users:
    song_play_count = pd.DataFrame(
        users[userID].items(),
        columns=["songID", "play_count"]
    )
    # Coefficient of variation: low dispersion means the user plays
    # every song about equally often
    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
    #user_ratings = np.array(users[userID].values())
    #cv = user_ratings.std()/user_ratings.mean()
    #print userID, cv
    if cv <= 0.5:
        # Low variation: assign a constant rating of 3 to every song
        for songID, play_count in users[userID].items():
            users[userID][songID] = 3
    else:
        # Otherwise quantise the play counts into five equal-width bins
        song_play_count_q = pd.cut(
            song_play_count["play_count"],
            5,
            labels=False
        ) + 1
        song_play_count.play_count = song_play_count_q
        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
        #print song_play_count
        #median = get_median(user_ratings)
        #asd = get_asd(user_ratings, median)
        #for songID, play_count in users[userID].items():
        #    users[userID][songID] = (play_count - median) / asd

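# Illustrative check of the quantisation above: pd.cut splits the play-count
# range into 5 equal-width bins, so e.g.
# (pd.cut(pd.Series([1, 20, 40, 60, 100]), 5, labels=False) + 1).tolist()
# -> [1, 1, 2, 3, 5]
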
# Subset of most-liked items: keep only songs rated above 2
users_subset = {}
for userID, songs in users.iteritems():
    scores_above_threshold = {
        songID: score for songID, score in songs.iteritems() if score > 2
    }
    users_subset[userID] = scores_above_threshold
'''
for songID, score in songs.iteritems():
    print score > 0
    if score > 0:
        print {userID: {songID: score}}

{k: v for k, v in users.iteritems() for i, j in v.iteritems() if j > 0}
'''
# Fitness function for EDA: sum of log10(rating * distance) over the
# user's most-liked items (note: log10 fails if a distance is exactly 0)
def Fitness(profile, user_subset):
    fitness_value = 0
    for songID, score in user_subset.iteritems():
        fitness_value += log10(score * manhattan(profile, items[songID]))
    return fitness_value

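# Worked example with hypothetical numbers: a song rated 4 whose item vector
# lies at Manhattan distance 9 from the profile contributes
# log10(4 * 9) = log10(36) ~= 1.56 to the fitness value.
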
# Given parameters for EDA
population_size = len(users_subset)
fraction_of_population = int(round(0.5 * population_size))

# Generation of M individuals drawn uniformly from [0, 1): one row per
# user, one column per item feature, i.e. shape (8, 10) here
np.random.seed(len(users_subset))
M = np.random.rand(population_size, len(items.values()[0]))
#M.shape = (-1, len(items.values()[0]))
profile = {}
i = 0
for userID in users_subset:
    profile[userID] = M.tolist()[i]
    i += 1

# Compute fitness values
users_fitness = {}
for userID in profile:
    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
users_fitness_df = pd.DataFrame(
    users_fitness.items(),
    columns=["userID", "fitness"]
)

# Selection of the best individuals based on fitness values (the sort is
# ascending, so the head holds the lowest-fitness individuals)
best_individuals = {}
users_fitness_df = users_fitness_df.sort(columns='fitness')
M_sel = users_fitness_df.head(fraction_of_population)
M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
for userID in M_sel_dict:
    best_individuals[userID] = profile[userID]

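# A minimal sketch of the classic EDA update step (variable names are
# illustrative; this step is not wired into the original flow): compute the
# per-feature sample mean and standard deviation of the selected individuals
# and draw the next generation from the resulting Gaussians.
#M_best = np.array(best_individuals.values())
#mu, sigma = M_best.mean(axis=0), M_best.std(axis=0)
#M_next = np.random.normal(mu, sigma, size=(population_size, M_best.shape[1]))
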
# Estimate the distribution of the selected individuals. The block below is
# adapted from the scikit-learn (0.16-era) GMM documentation example and is
# kept here as a reference for the mixture-based approach.
np.random.seed(1)
g = mixture.GMM(n_components=10)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
np.round(g.weights_, 2)
np.round(g.means_, 2)
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)
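
# A hedged sketch of applying the same mixture API to the selected profiles
# (an assumption about the intended next step, not in the original script):
#g_profiles = mixture.GMM(n_components=2)
#g_profiles.fit(np.array(best_individuals.values()))
#next_generation = g_profiles.sample(population_size)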

'''
# Pearson Correlation Coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
                  sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator

# Cosine Similarity for test purposes
def cosine_similarity(rating1, rating2):
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
    if n == 0:
        return 0

    # now compute denominator
    for key in rating1:
        x = rating1[key]
        sum_x2 += pow(x, 2)

    for key in rating2:
        y = rating2[key]
        sum_y2 += pow(y, 2)

    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    else:
        return sum_xy / denominator


def Fitness(profile, user_index):
    sim = 0
    sum_log = 0

    features = profile.items()[user_index][1]
    songs = users.items()[user_index][1]

    for song, rating in songs.items():
        sim = pearson(features, items[song])
        print(sim)

    for username, songs in users.items():
        for song, rating in songs.items():
            sim = pearson(profile, items[song])
            #sum_log += log10(rating * sim)
    return sim
'''