comparison Code/eda.py @ 24:68a62ca32441
Organized python scripts
author | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk> |
date | Sat, 15 Aug 2015 19:16:17 +0100 |
parents | 45e6f85d0ba4 |
children | fafc0b249a73 |
23:45e6f85d0ba4 | 24:68a62ca32441 |
---|---|
6 """ | 6 """ |
7 | 7 |
8 | 8 |
9 from math import sqrt, log10 | 9 from math import sqrt, log10 |
10 import numpy as np | 10 import numpy as np |
| 11 import pandas as pd |
11 from sklearn import mixture | 12 from sklearn import mixture |
| 13 |
| 14 #Fine tuning |
12 | 15 |
13 #User-item dictionary | 16 #User-item dictionary |
14 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, | 17 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0, |
15 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, | 18 "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0, |
16 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5, | 19 "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5, |
47 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1], | 50 "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1], |
48 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1], | 51 "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1], |
49 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], | 52 "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1], |
50 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} | 53 "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]} |
51 | 54 |
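Note: the script hinges on two parallel dictionaries: `users` maps each user to a `{songID: play count}` dict, and `items` maps each song ID to a 10-element feature vector. A minimal sketch of the invariant the distance code below relies on (the `_toy` names are ours; the values are copied from the dicts above):

```python
# users: user -> {songID: play count}; items: songID -> feature vector
users_toy = {"Angelica": {"SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5}}
items_toy = {
    "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
    "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
}

# Every rated song needs a feature vector, and all vectors must share a
# length, or the element-wise distance functions below get ragged input.
for songs in users_toy.values():
    for song_id in songs:
        assert len(items_toy[song_id]) == 10
```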
52 #Functions to compute similarity between items or between profiles | 55 # Functions to compute similarity between items or between profiles |
53 # Source: http://www.guidetodatamining.com | 56 # Source: http://www.guidetodatamining.com |
54 def manhattan(vector1, vector2): | 57 def manhattan(vector1, vector2): |
55 """Computes the Manhattan distance.""" | 58 """Computes the Manhattan distance.""" |
56 distance = 0 | 59 return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2)) |
57 total = 0 | 60 |
58 n = len(vector1) | 61 def nearestNeighbor(self, itemVector): |
59 for i in range(n): | 62 """return nearest neighbor to itemVector""" |
60 distance += abs(vector1[i] - vector2[i]) | 63 return min([( |
61 return distance | 64 self.manhattan(itemVector, item[1]), item) for item in self.data |
62 | 65 ]) |
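The refactor in this hunk collapses the accumulator loop into a one-liner; both compute the L1 (Manhattan) distance, the sum of |v1_i - v2_i| over coordinates. (The new `nearestNeighbor`/`classify` take `self` as if they were methods of the guidetodatamining classifier class, though no class appears in this hunk.) A quick self-contained check that the rewrite preserves behaviour:

```python
def manhattan_loop(vector1, vector2):
    # pre-commit version: explicit accumulator
    distance = 0
    for v1, v2 in zip(vector1, vector2):
        distance += abs(v1 - v2)
    return distance

def manhattan_map(vector1, vector2):
    # post-commit version: the same L1 distance as a single expression;
    # two-iterable map() works in both Python 2 and 3
    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))

a, b = [1, 4, 5, 3.5], [2, 2, 5, 1.5]
assert manhattan_loop(a, b) == manhattan_map(a, b) == 5.0
```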
63 def computeNearestNeighbor(itemName, itemVector, items): | 66 |
64 """creates a sorted list of items based on their distance to item""" | 67 def classify(self, itemVector): |
65 distances = [] | 68 """Return class we think item Vector is in""" |
66 for otherItem in items: | 69 return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0]) |
67 if otherItem != itemName: | 70 ''' |
68 distance = manhattan(itemVector, items[otherItem]) | 71 # Median |
69 distances.append((distance, otherItem)) | 72 # http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python |
70 # sort based on distance -- closest first | 73 def get_median(lst): |
71 distances.sort() | 74 return np.median(np.array(lst)) |
72 return distances | 75 |
73 | 76 # Absolute Standard Deviation |
74 def classify(user, itemName, itemVector): | 77 def get_asd(lst, median): |
75 """Classify the itemName based on user ratings | 78 sum = 0 |
76 Should really have items and users as parameters""" | 79 for item in lst: |
77 # first find nearest neighbor | 80 sum += abs(item - median) |
78 nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1] | 81 return sum / len(lst) |
79 rating = users[user][nearest] | 82 |
80 return rating | 83 # Normalisation rating with Modified Standard Score |
81 | 84 def normalize_rating(ratings, median, asd): |
82 # Fitness function of EDA | 85 for i in range(len(ratings)): |
83 def Fitness(profile, user): | 86 ratings[i] = (ratings[i] - median) / asd |
84 nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1] | 87 return ratings |
85 rating = users[user][nearest] | 88 ''' |
86 return rating | 89 # Normalise user play count |
87 | 90 for userID in users: |
88 | 91 song_play_count = pd.DataFrame( |
| 92 users[userID].items(), |
| 93 columns=["songID", "play_count"] |
| 94 ) |
| 95 '''Coefficient of variation''' |
| 96 cv = song_play_count.play_count.std() / song_play_count.play_count.mean() |
| 97 #user_ratings = np.array(users[userID].values()) |
| 98 #cv = user_ratings.std()/user_ratings.mean() |
| 99 #print userID, cv |
| 100 if cv <= 0.5: |
| 101 for songID, play_count in users[userID].items(): |
| 102 users[userID][songID] = 3 |
| 103 else: |
| 104 song_play_count_q = pd.cut( |
| 105 song_play_count["play_count"], |
| 106 5, |
| 107 labels=False |
| 108 ) + 1 |
| 109 song_play_count.play_count = song_play_count_q |
| 110 users[userID] = song_play_count.set_index('songID')['play_count'].to_dict() |
| 111 #print song_play_count |
| 112 #median = get_median(user_ratings) |
| 113 #asd = get_asd(user_ratings, median) |
| 114 #for songID, play_count in users[userID].items(): |
| 115 #users[userID][songID] = (play_count - median) / asd |
| 116 |
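The block above turns raw play counts into 1-5 ratings: if the coefficient of variation (std/mean) of a user's counts is at most 0.5, the user gets a flat rating of 3 for every song; otherwise `pd.cut` slices the counts into five equal-width bins labelled 1-5. A standalone sketch of that rule (the function name and toy counts are ours):

```python
import pandas as pd

def quantise_play_counts(play_counts):
    """Mirror of the normalisation above: flat 3 for near-uniform
    listeners, otherwise five equal-width pd.cut bins labelled 1-5."""
    s = pd.Series(play_counts)
    cv = s.std() / s.mean()              # coefficient of variation
    if cv <= 0.5:
        return {song: 3 for song in play_counts}
    return (pd.cut(s, 5, labels=False) + 1).to_dict()

print(quantise_play_counts({"songA": 1, "songB": 12, "songC": 40}))
# e.g. {'songA': 1, 'songB': 2, 'songC': 5}
```

Because the bins are equal-width rather than quantile-based, one heavily played song can squash everything else into the lowest bin; `pd.qcut` would be the quantile alternative.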
| 117 # Subset of most-liked items |
| 118 users_subset = {} |
| 119 for userID, songs in users.iteritems(): |
| 120 scores_above_threshold = { |
| 121 songID: score for songID, score in songs.iteritems() if score > 2 |
| 122 } |
| 123 users_subset[userID]= scores_above_threshold |
| 124 ''' |
| 125 for songID, score in songs.iteritems(): |
| 126 print score >0 |
| 127 if score > 0: |
| 128 print {userID: {songID: score}} |
| 129 |
| 130 {k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0} |
| 131 ''' |
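`users_subset` keeps, per user, only the songs scored above 2 (the "most-liked" items the fitness function will look at). `dict.iteritems()` is Python 2; under Python 3 the same step reads:

```python
# Python 3 spelling of the subset step (iteritems() no longer exists)
users = {"u1": {"s1": 3, "s2": 1}, "u2": {"s3": 5}}   # toy data
users_subset = {
    user_id: {song: score for song, score in songs.items() if score > 2}
    for user_id, songs in users.items()
}
assert users_subset == {"u1": {"s1": 3}, "u2": {"s3": 5}}
```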
| 132 # Fitness function for EDA |
| 133 def Fitness(profile, user_subset): |
| 134 fitness_value = 0 |
| 135 for songID, score in user_subset.iteritems(): |
| 136 fitness_value += log10(score * manhattan(profile, items[songID])) |
| 137 return fitness_value |
| 138 |
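The EDA's fitness for a candidate profile sums log10(score × Manhattan distance) over that user's liked songs. As written it raises a math domain error whenever a profile coincides with an item vector (distance 0, so log10(0)); a guarded sketch, where the `eps` floor is our addition, not in the commit:

```python
from math import log10

def manhattan(v1, v2):
    return sum(abs(a - b) for a, b in zip(v1, v2))

def fitness(profile, user_subset, items, eps=1e-12):
    # eps keeps log10 defined when a profile lands exactly on an item
    total = 0.0
    for song_id, score in user_subset.items():
        total += log10(score * manhattan(profile, items[song_id]) + eps)
    return total

items = {"s1": [1.0, 2.0], "s2": [0.0, 5.0]}
print(fitness([0.5, 0.5], {"s1": 4, "s2": 3}, items))  # ~2.08
```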
| 139 # Given parameters for EDA |
| 140 population_size = len(users_subset) |
| 141 fraction_of_population = int(round(0.5 * population_size)) |
| 142 |
| 143 # Generation of M individuals uniformly |
| 144 np.random.seed(len(users_subset)) |
| 145 M = np.random.rand(population_size, len(items.values()[0])) |
| 146 #M.shape = (-1, len(items.values()[0])) |
| 147 profile = {} |
| 148 i = 0 |
| 149 for userID in users_subset: |
| 150 profile[userID] = M.tolist()[i] |
| 151 i += 1 |
| 152 |
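The initial population is drawn uniformly from [0, 1): one row per user, one column per item feature, and each row becomes that user's candidate taste profile. Note that `items.values()[0]` only works in Python 2 (`dict.values()` is not indexable in Python 3, where `next(iter(items.values()))` does the same job). A compact sketch with hypothetical user IDs:

```python
import numpy as np

user_ids = ["u1", "u2", "u3", "u4"]      # stand-ins for users_subset keys
n_features = 10                          # length of each item vector
population_size = len(user_ids)

np.random.seed(population_size)          # same seeding idea as the commit
M = np.random.rand(population_size, n_features)   # uniform on [0, 1)

# One candidate profile per user, as the loop above builds.
profile = {uid: row for uid, row in zip(user_ids, M.tolist())}
assert len(profile["u1"]) == n_features
```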
| 153 # Compute fitness values |
| 154 users_fitness = {} |
| 155 for userID in profile: |
| 156 users_fitness[userID] = Fitness(profile[userID], users_subset[userID]) |
| 157 users_fitness_df = pd.DataFrame( |
| 158 users_fitness.items(), |
| 159 columns=["userID", "fitness"] |
| 160 ) |
| 161 |
| 162 # Selection of best individuals based on fitness values |
| 163 best_individuals = {} |
| 164 users_fitness_df = users_fitness_df.sort(columns='fitness') |
| 165 M_sel = users_fitness_df.head(fraction_of_population) |
| 166 M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict() |
| 167 for userID in M_sel_dict: |
| 168 best_individuals[userID] = profile[userID] |
| 169 |
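Selection here is truncation selection: sort by fitness and keep the best half (`fraction_of_population`). Two notes: `DataFrame.sort(columns=...)` was current pandas in 2015 but has since been removed in favour of `sort_values`, and the ascending sort plus `head()` keeps the *smallest* fitness values, which under this distance-based score appears to mean the profiles closest to the liked items. A modern-pandas sketch with made-up fitness numbers:

```python
import pandas as pd

users_fitness = {"u1": 2.1, "u2": 0.7, "u3": 1.5, "u4": 3.0}  # toy values
df = pd.DataFrame(list(users_fitness.items()), columns=["userID", "fitness"])

df = df.sort_values("fitness")        # modern spelling of DataFrame.sort
survivors = df.head(len(df) // 2)     # truncation selection: keep best half
print(survivors["userID"].tolist())   # -> ['u2', 'u3']
```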
| 170 # Calculate sample mean and standard deviation |
| 171 np.random.seed(1) |
| 172 g = mixture.GMM(n_components=10) |
| 173 # Generate random observations with two modes centered on 0 |
| 174 # and 10 to use for training. |
| 175 obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) |
| 176 g.fit(obs) |
| 177 np.round(g.weights_, 2) |
| 178 np.round(g.means_, 2) |
| 179 np.round(g.covars_, 2) |
| 180 g.predict([[0], [2], [9], [10]]) |
| 181 np.round(g.score([[0], [2], [9], [10]]), 2) |
| 182 # Refit the model on new data (initial parameters remain the |
| 183 # same), this time with an even split between the two modes. |
| 184 g.fit(20 * [[0]] + 20 * [[10]]) |
| 185 np.round(g.weights_, 2) |
| 186 |
| 187 |
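This last block is the distribution-estimation step of the EDA, demonstrated on synthetic two-mode data rather than on the selected profiles (it is essentially the scikit-learn GMM documentation example). `sklearn.mixture.GMM` was deprecated in scikit-learn 0.18 and later removed; the current equivalent is `GaussianMixture`, whose attribute is `covariances_` rather than `covars_`. A sketch of the same experiment against the modern API; we also use `n_components=2`, matching the two modes, where the commit uses 10:

```python
import numpy as np
from sklearn.mixture import GaussianMixture  # replaces the removed mixture.GMM

np.random.seed(1)
# Two modes centred on 0 and 10, as in the synthetic data above.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))

g = GaussianMixture(n_components=2, random_state=1).fit(obs)
print(np.round(g.weights_, 2))           # mixing weights, near 0.25 / 0.75
print(np.round(g.means_, 2))             # component means, near 0 and 10
print(np.round(g.covariances_, 2))       # covariances_ replaces covars_
print(g.predict([[0], [2], [9], [10]]))  # hard assignments for new points
```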
| 188 ''' |
89 # Pearson Correlation Coefficient | 189 # Pearson Correlation Coefficient |
90 def pearson(rating1, rating2): | 190 def pearson(rating1, rating2): |
91 sum_xy = 0 | 191 sum_xy = 0 |
92 sum_x = 0 | 192 sum_x = 0 |
93 sum_y = 0 | 193 sum_y = 0 |
143 return 0 | 243 return 0 |
144 else: | 244 else: |
145 return sum_xy / denominator | 245 return sum_xy / denominator |
146 | 246 |
147 | 247 |
148 ''' | 248 |
149 def Fitness(profile, user_index): | 249 def Fitness(profile, user_index): |
150 sim = 0 | 250 sim = 0 |
151 sum_log = 0 | 251 sum_log = 0 |
152 | 252 |
153 features = profile.items()[user_index][1] | 253 features = profile.items()[user_index][1] |
161 for song, rating in songs.items(): | 261 for song, rating in songs.items(): |
162 sim = pearson(profile, items[song]) | 262 sim = pearson(profile, items[song]) |
163 #sum_log += log10(rating * sim) | 263 #sum_log += log10(rating * sim) |
164 return sim | 264 return sim |
165 ''' | 265 ''' |
166 # Generation of M individuals uniformly | 266 |
167 population_size = len(users) | 267 |
168 fraction_of_population = 0.5 | 268 |
169 np.random.seed(len(users)) | 269 |
170 M = np.random.uniform(size=population_size * len(items.values()[0])) | 270 |
171 M.shape = (-1, len(items.values()[0])) | 271 |
172 profile = {} | 272 |
173 i = 0 | 273 |
174 for row in M.tolist(): | 274 |
175 profile["Profile" + str(i)] = M.tolist()[i] | |
176 i = i + 1 | |
177 | |
178 ''' | |
179 Calculate fitness values | |
180 ''' | |
181 Fitness(profile, 0) | |
182 | |
183 | |
184 | |
185 | |
186 | |
187 | |
188 np.random.seed(1) | |
189 g = mixture.GMM(n_components=7) | |
190 # Generate random observations with two modes centered on 0 | |
191 # and 10 to use for training. | |
192 obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1))) | |
193 g.fit(obs) | |
194 np.round(g.weights_, 2) | |
195 np.round(g.means_, 2) | |
196 np.round(g.covars_, 2) | |
197 g.predict([[0], [2], [9], [10]]) | |
198 np.round(g.score([[0], [2], [9], [10]]), 2) | |
199 # Refit the model on new data (initial parameters remain the | |
200 # same), this time with an even split between the two modes. | |
201 g.fit(20 * [[0]] + 20 * [[10]]) | |
202 np.round(g.weights_, 2) |