comparison Code/eda.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
parents 45e6f85d0ba4
children fafc0b249a73
comparison
equal deleted inserted replaced
23:45e6f85d0ba4 24:68a62ca32441
6 """ 6 """
7 7
8 8
9 from math import sqrt, log10 9 from math import sqrt, log10
10 import numpy as np 10 import numpy as np
11 import pandas as pd
11 from sklearn import mixture 12 from sklearn import mixture
13
14 #Fine tuning
12 15
13 #User-item dictionary 16 #User-item dictionary
14 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, 17 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
15 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, 18 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
16 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5, 19 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
47 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1], 50 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
48 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1], 51 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
49 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], 52 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
50 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} 53 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]}
51 54
52 #Functions to compute similarity between items or between profiles 55 # Functions to compute similarity between items or between profiles
53 # Source: http://www.guidetodatamining.com 56 # Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Compute the Manhattan (L1) distance between two equal-length vectors.

    Args:
        vector1, vector2: sequences of numbers (ratings / feature values),
            expected to have the same length.

    Returns:
        Sum of absolute coordinate-wise differences.
    """
    # zip truncates at the shorter sequence; unlike the old two-argument
    # map(lambda ...) form, this never pads with None under Python 2 and
    # behaves identically on Python 3.
    return sum(abs(v1 - v2) for v1, v2 in zip(vector1, vector2))
def nearestNeighbor(self, itemVector):
    """return nearest neighbor to itemVector

    NOTE(review): written method-style (takes self, uses self.manhattan and
    self.data) although no enclosing class is visible here -- presumably
    pasted from a classifier class; confirm how it is called.
    """
    scored = []
    for entry in self.data:
        scored.append((self.manhattan(itemVector, entry[1]), entry))
    return min(scored)
def classify(self, itemVector):
    """Return class we think item Vector is in"""
    # Normalize first, then look up the closest stored item; the nearest
    # neighbor is returned as (distance, (label, vector)).
    normalized = self.normalizeVector(itemVector)
    nearest = self.nearestNeighbor(normalized)
    return nearest[1][0]
# NOTE(review): dead code from an earlier revision (median / absolute
# standard deviation / modified-standard-score normalisation), disabled by
# wrapping it in a discarded string literal. Remove once the pandas-based
# play-count normalisation is validated.
'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute Standard Deviation
def get_asd(lst, median):
    sum = 0
    for item in lst:
        sum += abs(item - median)
    return sum / len(lst)

# Normalisation rating with Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''
# Normalise each user's play counts into 1-5 ratings.
for userID in users:
    # list(...) keeps the DataFrame constructor happy on both Python 2
    # (list already) and Python 3 (dict view).
    song_play_count = pd.DataFrame(
        list(users[userID].items()),
        columns=["songID", "play_count"]
    )
    # Coefficient of variation of the play counts: a low spread means the
    # user plays every song about equally often, so there is no preference
    # signal to quantise.
    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
    if cv <= 0.5:
        # Flat listening profile: assign the neutral rating to every song.
        for songID in users[userID]:
            users[userID][songID] = 3
    else:
        # Quantise play counts into 5 equal-width bins -> ratings 1..5
        # (pd.cut labels are 0-based, hence the +1).
        song_play_count_q = pd.cut(
            song_play_count["play_count"],
            5,
            labels=False
        ) + 1
        song_play_count.play_count = song_play_count_q
        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
116
# Subset of most-liked items: keep only songs whose normalised rating is
# above the neutral value of 2 assigned by the normalisation step.
users_subset = {}
for userID, songs in users.items():  # .items() is py2/py3-portable
    users_subset[userID] = {
        songID: score for songID, score in songs.items() if score > 2
    }
# Fitness function for EDA
def Fitness(profile, user_subset):
    """Fitness of a candidate profile against one user's liked songs.

    Args:
        profile: candidate feature vector (list of floats).
        user_subset: dict mapping songID -> score for the user's liked items.

    Returns:
        Sum of log10(score * manhattan(profile, item_vector)) over the
        user's liked items; item vectors come from the module-level `items`.

    NOTE(review): log10 raises ValueError (math domain error) if the
    profile coincides with an item vector (distance 0) -- confirm intended.
    """
    fitness_value = 0
    for songID, score in user_subset.items():  # .items() is py2/py3-portable
        fitness_value += log10(score * manhattan(profile, items[songID]))
    return fitness_value
138
# Given parameters for EDA
population_size = len(users_subset)
fraction_of_population = int(round(0.5 * population_size))

# Generation of M individuals uniformly in [0, 1)
np.random.seed(len(users_subset))  # deterministic across runs
# next(iter(...)) works on both py2 lists and py3 dict views, unlike the
# old items.values()[0] indexing.
n_features = len(next(iter(items.values())))
M = np.random.rand(population_size, n_features)
M_rows = M.tolist()  # convert once, not on every loop iteration
profile = {}
for i, userID in enumerate(users_subset):
    profile[userID] = M_rows[i]
152
# Compute fitness values
users_fitness = {}
for userID in profile:
    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
users_fitness_df = pd.DataFrame(
    list(users_fitness.items()),  # list(...) for py3 dict views
    columns=["userID", "fitness"]
)

# Selection of best individuals based on fitness values.
# DataFrame.sort(columns=...) was removed in pandas 0.20; sort_values is
# the portable equivalent (same default ascending order, so head() below
# still selects the lowest-fitness individuals).
best_individuals = {}
users_fitness_df = users_fitness_df.sort_values(by='fitness')
M_sel = users_fitness_df.head(fraction_of_population)
M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
for userID in M_sel_dict:
    best_individuals[userID] = profile[userID]
169
# Calculate sample mean and standard deviation
# NOTE(review): this section looks like the scikit-learn GMM documentation
# example pasted verbatim -- every np.round / predict / score result below
# is discarded (no assignment or print), so it currently has no effect on
# the EDA above; confirm whether it is exploratory scaffolding.
# NOTE(review): mixture.GMM was deprecated and later removed from
# scikit-learn (replaced by GaussianMixture) -- verify installed version.
np.random.seed(1)
g = mixture.GMM(n_components=10)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
np.round(g.weights_, 2)
np.round(g.means_, 2)
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)
186
187
188 '''
89 # Pearson Correlation Coefficient 189 # Pearson Correlation Coefficient
90 def pearson(rating1, rating2): 190 def pearson(rating1, rating2):
91 sum_xy = 0 191 sum_xy = 0
92 sum_x = 0 192 sum_x = 0
93 sum_y = 0 193 sum_y = 0
143 return 0 243 return 0
144 else: 244 else:
145 return sum_xy / denominator 245 return sum_xy / denominator
146 246
147 247
148 ''' 248
149 def Fitness(profile, user_index): 249 def Fitness(profile, user_index):
150 sim = 0 250 sim = 0
151 sum_log = 0 251 sum_log = 0
152 252
153 features = profile.items()[user_index][1] 253 features = profile.items()[user_index][1]
161 for song, rating in songs.items(): 261 for song, rating in songs.items():
162 sim = pearson(profile, items[song]) 262 sim = pearson(profile, items[song])
163 #sum_log += log10(rating * sim) 263 #sum_log += log10(rating * sim)
164 return sim 264 return sim
165 ''' 265 '''
166 # Generation of M individuals uniformly 266
167 population_size = len(users) 267
168 fraction_of_population = 0.5 268
169 np.random.seed(len(users)) 269
170 M = np.random.uniform(size=population_size * len(items.values()[0])) 270
171 M.shape = (-1, len(items.values()[0])) 271
172 profile = {} 272
173 i = 0 273
174 for row in M.tolist(): 274
175 profile["Profile" + str(i)] = M.tolist()[i]
176 i = i + 1
177
178 '''
179 Calculate fitness values
180 '''
181 Fitness(profile, 0)
182
183
184
185
186
187
188 np.random.seed(1)
189 g = mixture.GMM(n_components=7)
190 # Generate random observations with two modes centered on 0
191 # and 10 to use for training.
192 obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
193 g.fit(obs)
194 np.round(g.weights_, 2)
195 np.round(g.means_, 2)
196 np.round(g.covars_, 2)
197 g.predict([[0], [2], [9], [10]])
198 np.round(g.score([[0], [2], [9], [10]]), 2)
199 # Refit the model on new data (initial parameters remain the
200 # same), this time with an even split between the two modes.
201 g.fit(20 * [[0]] + 20 * [[10]])
202 np.round(g.weights_, 2)