annotate Code/eda.py @ 17:ee13c193c76e

Continue working on EDA
author Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date Tue, 28 Jul 2015 21:11:22 +0100
parents 68b8b088f50a
children e68dbee1f6db
rev   line source
p@15 1 # -*- coding: utf-8 -*-
p@15 2 """
p@15 3 Created on Wed Jul 22 17:42:09 2015
p@15 4
p@15 5 @author: paulochiliguano
p@15 6 """
p@15 7
p@16 8
p@17 9 from math import sqrt, log10
p@15 10 import numpy as np
p@15 11 from sklearn import mixture
p@15 12
# User-item dictionary: maps each username to a dict of
# {song_id (Echo Nest track ID): rating}. Ratings are floats on a
# roughly 1-5 scale; not every user has rated every song.
# (Fixed: Chan's rating of SOAGTJW12A6701F1F5 was the only int literal;
# normalized to 5.0 for consistency with every other rating.)
users = {"Angelica": {"SOAJJPC12AB017D63F": 3.5, "SOAKIXJ12AC3DF7152": 2.0,
                      "SOAKPFH12A8C13BA4A": 4.5, "SOAGTJW12A6701F1F5": 5.0,
                      "SOAKWCK12A8C139F81": 1.5, "SOAKNZI12A58A79CAC": 2.5,
                      "SOAJZEP12A8C14379B": 2.0},
         "Bill": {"SOAJJPC12AB017D63F": 2.0, "SOAKIXJ12AC3DF7152": 3.5,
                  "SOAHQFM12A8C134B65": 4.0, "SOAGTJW12A6701F1F5": 2.0,
                  "SOAKWCK12A8C139F81": 3.5, "SOAJZEP12A8C14379B": 3.0},
         "Chan": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 1.0,
                  "SOAHQFM12A8C134B65": 1.0, "SOAKPFH12A8C13BA4A": 3.0,
                  "SOAGTJW12A6701F1F5": 5.0, "SOAKWCK12A8C139F81": 1.0},
         "Dan": {"SOAJJPC12AB017D63F": 3.0, "SOAKIXJ12AC3DF7152": 4.0,
                 "SOAHQFM12A8C134B65": 4.5, "SOAGTJW12A6701F1F5": 3.0,
                 "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                 "SOAJZEP12A8C14379B": 2.0},
         "Hailey": {"SOAKIXJ12AC3DF7152": 4.0, "SOAHQFM12A8C134B65": 1.0,
                    "SOAKPFH12A8C13BA4A": 4.0, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 1.0},
         "Jordyn": {"SOAKIXJ12AC3DF7152": 4.5, "SOAHQFM12A8C134B65": 4.0,
                    "SOAKPFH12A8C13BA4A": 5.0, "SOAGTJW12A6701F1F5": 5.0,
                    "SOAKWCK12A8C139F81": 4.5, "SOAKNZI12A58A79CAC": 4.0,
                    "SOAJZEP12A8C14379B": 4.0},
         "Sam": {"SOAJJPC12AB017D63F": 5.0, "SOAKIXJ12AC3DF7152": 2.0,
                 "SOAKPFH12A8C13BA4A": 3.0, "SOAGTJW12A6701F1F5": 5.0,
                 "SOAKWCK12A8C139F81": 4.0, "SOAKNZI12A58A79CAC": 5.0},
         "Veronica": {"SOAJJPC12AB017D63F": 3.0, "SOAKPFH12A8C13BA4A": 5.0,
                      "SOAGTJW12A6701F1F5": 4.0, "SOAKWCK12A8C139F81": 2.5,
                      "SOAKNZI12A58A79CAC": 3.0}
         }
p@15 42
# Item feature vectors: maps each song_id to a fixed-length list of 10
# numeric features (presumably hand-assigned audio/content attributes on a
# 1-5 scale -- TODO confirm feature semantics; the last four entries are
# identical across all songs and therefore carry no discriminating power).
# Removed: a '''-quoted dead "profile" dict that duplicated one row eight
# times; the real `profile` dict is generated from np.random.uniform below.
items = {"SOAJJPC12AB017D63F": [2.5, 4, 3.5, 3, 5, 4, 1, 5, 4, 1],
         "SOAKIXJ12AC3DF7152": [2, 5, 5, 3, 2, 1, 1, 5, 4, 1],
         "SOAKPFH12A8C13BA4A": [1, 5, 4, 2, 4, 1, 1, 5, 4, 1],
         "SOAGTJW12A6701F1F5": [4, 5, 4, 4, 1, 5, 1, 5, 4, 1],
         "SOAKWCK12A8C139F81": [1, 4, 5, 3.5, 5, 1, 1, 5, 4, 1],
         "SOAKNZI12A58A79CAC": [1, 5, 3.5, 3, 4, 5, 1, 5, 4, 1],
         "SOAJZEP12A8C14379B": [5, 5, 4, 2, 1, 1, 1, 5, 4, 1],
         "SOAHQFM12A8C134B65": [2.5, 4, 4, 1, 1, 1, 1, 5, 4, 1]}
p@15 61
'''
Functions to compute similarity between items or between profiles
'''
# Pearson Correlation Coefficient
# Source: http://www.guidetodatamining.com
def pearson(rating1, rating2):
    """Approximate Pearson correlation between two {key: rating} dicts.

    Only keys present in BOTH dicts contribute. Returns 0 when the dicts
    share no keys or when either variance term is zero (instead of
    raising or returning NaN).

    Fixes over the original: the `/ n` divisions are forced to float so
    that all-integer ratings do not silently truncate under Python 2
    integer division, and the sqrt arguments are clamped at 0 to guard
    against tiny negative values from floating-point round-off.
    """
    sum_xy = 0.0
    sum_x = 0.0
    sum_y = 0.0
    sum_x2 = 0.0
    sum_y2 = 0.0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += x * x
            sum_y2 += y * y
    if n == 0:
        # No overlap: correlation is undefined; report no similarity.
        return 0
    # Denominator: product of the (un-normalized) standard deviations.
    # max(0.0, ...) clamps round-off noise so sqrt never sees a negative.
    denominator = sqrt(max(0.0, sum_x2 - pow(sum_x, 2) / float(n))) * \
                  sqrt(max(0.0, sum_y2 - pow(sum_y, 2) / float(n)))
    if denominator == 0:
        # Zero variance on either side: correlation undefined -> 0.
        return 0
    return (sum_xy - (sum_x * sum_y) / float(n)) / denominator
p@17 93
p@17 94 # Cosine Similarity for test purposes
p@17 95 def cosine_similarity(rating1, rating2):
p@17 96 sum_xy = 0
p@17 97 sum_x2 = 0
p@17 98 sum_y2 = 0
p@17 99 n = 0
p@17 100 for key in rating1:
p@17 101 if key in rating2:
p@17 102 n += 1
p@17 103 x = rating1[key]
p@17 104 y = rating2[key]
p@17 105 sum_xy += x * y
p@17 106 if n == 0:
p@17 107 return 0
p@17 108
p@17 109 # now compute denominator
p@17 110 for key in rating1:
p@17 111 x = rating1[key]
p@17 112 sum_x2 += pow(x, 2)
p@17 113
p@17 114 for key in rating2:
p@17 115 y = rating2[key]
p@17 116 sum_y2 += pow(y, 2)
p@17 117
p@17 118 denominator = sqrt(sum_x2) * sqrt(sum_y2)
p@17 119 if denominator == 0:
p@17 120 return 0
p@17 121 else:
p@17 122 return sum_xy / denominator
p@17 123
'''
Fitness function of EDA
'''
def Fitness(profile, user_index):
    # Work-in-progress fitness evaluation for the EDA.
    # NOTE(review): `profile.items()[user_index]` / `users.items()[user_index]`
    # only work on Python 2 (dict views are not subscriptable on Python 3),
    # and plain dicts here have arbitrary ordering, so `user_index` does not
    # reliably select a specific profile/user pair -- confirm intent.
    sim = 0
    sum_log = 0

    # Feature vector of the user_index-th profile and the rating dict of the
    # user_index-th user (positional pairing; see ordering NOTE above).
    features = profile.items()[user_index][1]
    songs = users.items()[user_index][1]

    # Debug pass: print the similarity between the selected profile's
    # features and each song the selected user rated (`rating` is unused).
    for song, rating in songs.items():
        sim = pearson(features, items[song])
        print(sim)

    # NOTE(review): this loop passes the whole `profile` dict (not a feature
    # list) to pearson, rebinds `songs` from the loop above, and overwrites
    # `sim` on every iteration, so only the last value is returned; the
    # log-likelihood accumulation is still disabled. Presumably unfinished.
    for username, songs in users.items():
        for song, rating in songs.items():
            sim = pearson(profile, items[song])
            #sum_log += log10(rating * sim)
    return sim
p@17 144
'''
Generation of M individuals uniformly
'''
# One candidate profile per user; each profile is a feature vector drawn
# uniformly from [1, 5), matching the length of the item feature vectors.
population_size = len(users)
fraction_of_population = 0.5  # NOTE(review): unused here; presumably for a later selection step
# Seed with len(users): arbitrary but deterministic, so runs are reproducible.
np.random.seed(len(users))
# len of any item's feature vector; list(...) keeps this working on both
# Python 2 (values() is a list) and Python 3 (values() is a view).
n_features = len(list(items.values())[0])
M = np.random.uniform(1, 5, population_size * n_features)
M.shape = (-1, n_features)
# Fixed: the original looped `for row in M.tolist()` but ignored `row`,
# re-called M.tolist() on every iteration, and kept a manual counter.
profile = {}
for i, row in enumerate(M.tolist()):
    profile["Profile" + str(i)] = row
p@15 158
'''
Calculate fitness values
'''
# Evaluate the (work-in-progress) fitness of the 0th profile/user pair.
Fitness(profile, 0)

# Gaussian-mixture scaffold; presumably a first step toward clustering the
# generated profiles -- TODO confirm. NOTE(review): the data below has two
# modes but the model asks for 7 components, and every np.round(...) result
# is discarded (no assignment or print) -- this looks like interactive-
# session code pasted into the script.
np.random.seed(1)
g = mixture.GMM(n_components=7)
# Generate random observations with two modes centered on 0
# and 10 to use for training.
obs = np.concatenate((np.random.randn(100, 1), 10 + np.random.randn(300, 1)))
g.fit(obs)
np.round(g.weights_, 2)
np.round(g.means_, 2)
np.round(g.covars_, 2)
g.predict([[0], [2], [9], [10]])
np.round(g.score([[0], [2], [9], [10]]), 2)
# Refit the model on new data (initial parameters remain the
# same), this time with an even split between the two modes.
g.fit(20 * [[0]] + 20 * [[10]])
np.round(g.weights_, 2)