# -*- coding: utf-8 -*-
"""
Created on Wed Jul 22 17:42:09 2015

@author: paulochiliguano
"""

from math import sqrt, log10
import numpy as np
import pandas as pd
from sklearn import mixture

# Fine-tuning

# User-item dictionary: userID -> {songID: play count}
users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                      "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
                      "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
                      "SOAJZEP12A8C14379B": 2.0},
         "Bill": {"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5,
                  "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0,
                  "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0},
         "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0,
                  "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0,
                  "SOAGTJW12A6701F1F5": 5.0, "SOAKWCK12A8C139F81": 1.0},
         "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0,
                 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0,
                 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                 "SOAJZEP12A8C14379B": 2.0},
         "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0,
                    "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 1.0},
         "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0,
                    "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0,
                    "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 4.0},
         "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0,
                 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0,
                 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0},
         "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0,
                      "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5,
                      "SOAKNZI12A58A79CAC": 3.0}
         }

# Item profiles: songID -> vector of 10 content features
items = {"SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
         "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
         "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
         "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
         "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
         "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
         "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
         "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]}

# Functions to compute similarity between items or between profiles
# Source: http://www.guidetodatamining.com
def manhattan(vector1, vector2):
    """Compute the Manhattan distance between two equal-length vectors."""
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))
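
# Minimal sanity check (illustrative values, not from the dataset above):
# the Manhattan distance sums the absolute per-feature differences.
assert manhattan([1, 2, 3], [3, 5, 3]) == 5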

def nearestNeighbor(self, itemVector):
    """Return the nearest neighbour to itemVector as (distance, item)."""
    # Excerpted from the guidetodatamining Classifier class; unused in this
    # script, and `self.data` / `self.normalizeVector` are not defined here.
    return min(
        (self.manhattan(itemVector, item[1]), item) for item in self.data
    )

def classify(self, itemVector):
    """Return the class we think itemVector is in."""
    return self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]

'''
# Median
# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
def get_median(lst):
    return np.median(np.array(lst))

# Absolute standard deviation
def get_asd(lst, median):
    total = 0
    for item in lst:
        total += abs(item - median)
    return total / len(lst)

# Rating normalisation with the Modified Standard Score
def normalize_rating(ratings, median, asd):
    for i in range(len(ratings)):
        ratings[i] = (ratings[i] - median) / asd
    return ratings
'''

# Normalise user play counts: users whose counts barely vary get a flat
# rating of 3; otherwise counts are quantised into five equal-width bins
# labelled 1-5.
for userID in users:
    song_play_count = pd.DataFrame(
        list(users[userID].items()),
        columns=["songID", "play_count"]
    )
    # Coefficient of variation
    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
    if cv <= 0.5:
        for songID, play_count in users[userID].items():
            users[userID][songID] = 3
    else:
        song_play_count_q = pd.cut(
            song_play_count["play_count"],
            5,
            labels=False
        ) + 1
        song_play_count.play_count = song_play_count_q
        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
    # Alternative (commented): Modified Standard Score normalisation
    # user_ratings = np.array(list(users[userID].values()))
    # median = get_median(user_ratings)
    # asd = get_asd(user_ratings, median)
    # for songID, play_count in users[userID].items():
    #     users[userID][songID] = (play_count - median) / asd
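
# Illustrative check (hypothetical counts, not from the data above): widely
# spread play counts give cv > 0.5, so pd.cut maps them onto bins 1-5.
_demo = pd.Series([1, 3, 7, 20, 50], dtype=float)
_demo_cv = _demo.std() / _demo.mean()               # ~1.25 here, so quantise
_demo_ratings = pd.cut(_demo, 5, labels=False) + 1  # -> 1, 1, 1, 2, 5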

# Subset of most-liked items (normalised score above 2)
users_subset = {}
for userID, songs in users.items():
    scores_above_threshold = {
        songID: score for songID, score in songs.items() if score > 2
    }
    users_subset[userID] = scores_above_threshold
'''
Scratch exploration (kept from the original): the same filtering written as a
single comprehension would be
{k: {i: j for i, j in v.items() if j > 0} for k, v in users.items()}
'''

# Fitness function for the EDA: a candidate profile scores lower (better)
# when it is close to the items the user likes. Assumes the Manhattan
# distance is strictly positive so log10 is defined.
def Fitness(profile, user_subset):
    fitness_value = 0
    for songID, score in user_subset.items():
        fitness_value += log10(score * manhattan(profile, items[songID]))
    return fitness_value
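
# Hedged sanity check: fitness of a hypothetical flat profile against one
# user's liked songs. "Angelica" is in the toy data above; the profile itself
# is made up for illustration (lower fitness = closer to the liked items).
_flat_profile = [3.0] * len(next(iter(items.values())))
_flat_fitness = Fitness(_flat_profile, users_subset["Angelica"])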

# Given parameters for the EDA
population_size = len(users_subset)
fraction_of_population = int(round(0.5 * population_size))

# Generate M individuals uniformly at random, one candidate profile per user
np.random.seed(len(users_subset))
n_features = len(next(iter(items.values())))
M = np.random.rand(population_size, n_features)
profile = {}
for i, userID in enumerate(users_subset):
    profile[userID] = M[i].tolist()

# Compute fitness values
users_fitness = {}
for userID in profile:
    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
users_fitness_df = pd.DataFrame(
    list(users_fitness.items()),
    columns=["userID", "fitness"]
)

# Selection of the best individuals based on fitness values
# (ascending sort, so the head holds the lowest, i.e. best, fitness)
best_individuals = {}
users_fitness_df = users_fitness_df.sort_values('fitness')
M_sel = users_fitness_df.head(fraction_of_population)
M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
for userID in M_sel_dict:
    best_individuals[userID] = profile[userID]

# Calculate sample mean and standard deviation with a Gaussian mixture.
# This is the sklearn mixture example, kept as a placeholder; two components
# match the two modes of the generated data.
np.random.seed(1)
g = mixture.GaussianMixture(n_components=2)
# Generate random observations with two modes centred on 0
# and 10 to use for training.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
np.round(g.weights_, 2)
np.round(g.means_, 2)
np.round(g.covariances_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score_samples([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)
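
# Sketch of the missing EDA step (an assumption, not in the original script):
# estimate a per-feature Gaussian from the selected individuals and sample a
# new population from it. The names below are hypothetical.
best_matrix = np.array(list(best_individuals.values()))
feature_mean = best_matrix.mean(axis=0)
feature_std = best_matrix.std(axis=0)
new_population = np.random.normal(
    feature_mean,
    feature_std + 1e-9,  # guard against zero deviation on a feature
    size=(population_size, best_matrix.shape[1])
)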

'''
# Pearson correlation coefficient
def pearson(rating1, rating2):
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += pow(x, 2)
            sum_y2 += pow(y, 2)
    if n == 0:
        return 0
    # now compute the denominator
    denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * \
        sqrt(sum_y2 - pow(sum_y, 2) / n)
    if denominator == 0:
        return 0
    else:
        return (sum_xy - (sum_x * sum_y) / n) / denominator

# Cosine similarity, for test purposes
def cosine_similarity(rating1, rating2):
    sum_xy = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
    if n == 0:
        return 0

    # now compute the denominator
    for key in rating1:
        sum_x2 += pow(rating1[key], 2)
    for key in rating2:
        sum_y2 += pow(rating2[key], 2)

    denominator = sqrt(sum_x2) * sqrt(sum_y2)
    if denominator == 0:
        return 0
    else:
        return sum_xy / denominator

# Earlier draft of the fitness function, kept for reference
def Fitness(profile, user_index):
    sim = 0
    sum_log = 0

    features = list(profile.items())[user_index][1]
    songs = list(users.items())[user_index][1]

    for song, rating in songs.items():
        sim = pearson(features, items[song])
        print(sim)

    for username, songs in users.items():
        for song, rating in songs.items():
            sim = pearson(profile, items[song])
            # sum_log += log10(rating * sim)
    return sim
'''