diff Code/eda.py @ 24:68a62ca32441

Organized python scripts
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Sat, 15 Aug 2015 19:16:17 +0100
parents 45e6f85d0ba4
children fafc0b249a73
line wrap: on
line diff
--- a/Code/eda.py	Tue Aug 11 14:23:42 2015 +0100
+++ b/Code/eda.py	Sat Aug 15 19:16:17 2015 +0100
@@ -8,8 +8,11 @@
 
 from math import sqrt, log10
 import numpy as np
+import pandas as pd
 from sklearn import mixture
 
+#Fine tuning
+
 #User-item dictionary
 users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                       "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
@@ -49,43 +52,140 @@
          "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
          "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]}
 
-#Functions to compute similarity between items or between profiles
+# Functions to compute similarity between items or between profiles
 # Source: http://www.guidetodatamining.com
 def manhattan(vector1, vector2):
     """Computes the Manhattan distance."""
-    distance = 0
-    total = 0
-    n = len(vector1)
-    for i in range(n):
-        distance += abs(vector1[i] - vector2[i])
-    return distance
+    return sum(map(lambda v1, v2: abs(v1 - v2), vector1, vector2))  # sum of |v1 - v2| over paired elements
 
-def computeNearestNeighbor(itemName, itemVector, items):
-    """creates a sorted list of items based on their distance to item"""
-    distances = []
-    for otherItem in items:
-        if otherItem != itemName:
-            distance = manhattan(itemVector, items[otherItem])
-            distances.append((distance, otherItem))
-        # sort based on distance -- closest first
-        distances.sort()
-    return distances
+def nearestNeighbor(self, itemVector):  # NOTE(review): module-level def that takes self; presumably pasted from a class -- confirm
+    """Return the smallest (distance, item) tuple over self.data, where distance is self.manhattan(itemVector, item[1])."""
+    return min([(
+        self.manhattan(itemVector, item[1]), item) for item in self.data
+    ])
 
-def classify(user, itemName, itemVector):
-    """Classify the itemName based on user ratings
-    Should really have items and users as parameters"""
-    # first find nearest neighbor
-    nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1]
-    rating = users[user][nearest]
-    return rating
+def classify(self, itemVector):  # NOTE(review): relies on self.normalizeVector, which is not defined in the visible code
+    """Return element [0] of the item found by self.nearestNeighbor for the normalized itemVector (presumably its class label -- confirm)."""
+    return(self.nearestNeighbor(self.normalizeVector(itemVector))[1][0])
+'''
+# Median
+# http://stackoverflow.com/questions/24101524/finding-median-of-list-in-python
+def get_median(lst):
+    return np.median(np.array(lst))
 
-# Fitness function of EDA
-def Fitness(profile, user):
-    nearest = computeNearestNeighbor(itemName, itemVector, items)[0][1]
-    rating = users[user][nearest]
-    return rating
+# Absolute Standard Deviation
+def get_asd(lst, median):
+    sum = 0
+    for item in lst:
+        sum += abs(item - median)
+    return sum / len(lst)
 
-    
+# Normalisation rating with Modified Standard Score
+def normalize_rating(ratings, median, asd):
+    for i in range(len(ratings)):
+        ratings[i] = (ratings[i] - median) / asd
+    return ratings
+'''
+# Normalise user play count: each user's counts in users are replaced by scores
+for userID in users:
+    song_play_count = pd.DataFrame(  # one row per (songID, play_count) pair
+        users[userID].items(),
+        columns=["songID", "play_count"]
+    )
+    '''Coefficient of variation'''
+    cv = song_play_count.play_count.std() / song_play_count.play_count.mean()
+    #user_ratings = np.array(users[userID].values())
+    #cv = user_ratings.std()/user_ratings.mean()
+    #print userID, cv
+    if cv <= 0.5:  # low relative spread: every song gets the same score, 3
+        for songID, play_count in users[userID].items():
+            users[userID][songID] = 3
+    else:
+        song_play_count_q = pd.cut(  # split this user's counts into 5 bins
+            song_play_count["play_count"],
+            5,
+            labels=False
+        ) + 1  # labels=False yields bin indices 0-4; shift them to 1-5
+        song_play_count.play_count = song_play_count_q
+        users[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
+        #print song_play_count
+    #median = get_median(user_ratings)
+    #asd = get_asd(user_ratings, median)
+    #for songID, play_count in users[userID].items():
+        #users[userID][songID] = (play_count - median) / asd
+
+# Subset of most-liked items: per user, keep only the songs scored above 2
+users_subset = {}
+for userID, songs in users.iteritems():
+    scores_above_threshold = {
+        songID: score for songID, score in songs.iteritems() if score > 2
+    }
+    users_subset[userID]= scores_above_threshold  # may be an empty dict
+    '''
+    for songID, score in songs.iteritems():
+        print score >0
+        if score > 0:
+            print {userID: {songID: score}}
+
+{k: v for k, v in users.iteritems() for i,j in v.iteritems() if j > 0}
+'''
+# Fitness function for EDA
+def Fitness(profile, user_subset):
+    """Sum log10(score * manhattan(profile, items[songID])) over user_subset; log10 raises ValueError if a product is not positive."""
+    return sum(
+        log10(score * manhattan(profile, items[songID]))
+        for songID, score in user_subset.iteritems())
+
+# Given parameters for EDA: one individual per user; half of them are kept
+population_size = len(users_subset)
+fraction_of_population = int(round(0.5 * population_size))
+
+# Generation of M individuals uniformly in [0, 1): one row per user, one
+# column per item feature. Seeded so every run draws the same matrix.
+np.random.seed(len(users_subset))
+M = np.random.rand(population_size, len(items.values()[0]))
+# Convert M to nested lists once (it used to be converted on every loop
+# iteration) and give each user one row as its initial profile.
+profile = {}
+for userID, individual in zip(users_subset, M.tolist()):
+    profile[userID] = individual
+
+# Compute fitness values: each user's profile scored against that user's subset
+users_fitness = {}
+for userID in profile:
+    users_fitness[userID] = Fitness(profile[userID], users_subset[userID])
+users_fitness_df = pd.DataFrame(  # two columns: userID and its fitness value
+    users_fitness.items(),
+    columns=["userID", "fitness"]
+)
+
+# Selection of best individuals: the fraction_of_population lowest fitness values
+best_individuals = {}
+users_fitness_df = users_fitness_df.sort(columns='fitness')  # ascending by default
+M_sel = users_fitness_df.head(fraction_of_population)
+M_sel_dict = M_sel.set_index('userID')['fitness'].to_dict()
+for userID in M_sel_dict:  # only the keys are used; copy the selected profiles
+    best_individuals[userID] = profile[userID]
+
+# Calculate sample mean and standard deviation -- NOTE(review): the code below fits a GMM to synthetic data and never reads best_individuals
+np.random.seed(1)
+g = mixture.GMM(n_components=10)
+# Generate random observations with two modes centered on 0
+# and 10 to use for training.
+obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
+g.fit(obs) 
+np.round(g.weights_, 2)  # result is discarded (not assigned or printed)
+np.round(g.means_, 2)
+np.round(g.covars_, 2) 
+g.predict([[0], [2], [9], [10]]) 
+np.round(g.score([[0], [2], [9], [10]]), 2)
+# Refit the model on new data (initial parameters remain the
+# same), this time with an even split between the two modes.
+g.fit(20 * [[0]] +  20 * [[10]]) 
+np.round(g.weights_, 2)
+
+
+'''
 # Pearson Correlation Coefficient
 def pearson(rating1, rating2):
     sum_xy = 0
@@ -145,7 +245,7 @@
         return sum_xy / denominator
 
 
-'''
+
 def Fitness(profile, user_index):
     sim = 0
     sum_log = 0
@@ -163,40 +263,12 @@
             #sum_log += log10(rating * sim)
     return sim
 '''
-# Generation of M individuals uniformly
-population_size = len(users)
-fraction_of_population = 0.5
-np.random.seed(len(users))
-M = np.random.uniform(size=population_size * len(items.values()[0]))
-M.shape = (-1, len(items.values()[0]))
-profile = {}
-i = 0
-for row in M.tolist():
-    profile["Profile" + str(i)] = M.tolist()[i]
-    i = i + 1
 
-'''
-Calculate fitness values
-'''
-Fitness(profile, 0)
 
 
 
 
 
 
-np.random.seed(1)
-g = mixture.GMM(n_components=7)
-# Generate random observations with two modes centered on 0
-# and 10 to use for training.
-obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
-g.fit(obs) 
-np.round(g.weights_, 2)
-np.round(g.means_, 2)
-np.round(g.covars_, 2) 
-g.predict([[0], [2], [9], [10]]) 
-np.round(g.score([[0], [2], [9], [10]]), 2)
-# Refit the model on new data (initial parameters remain the
-# same), this time with an even split between the two modes.
-g.fit(20 * [[0]] +  20 * [[10]]) 
-np.round(g.weights_, 2)
+
+