# -*- coding: utf-8 -*-
"""
Created on Tue Aug 18 01:41:18 2015

@author: paulochiliguano
"""

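# This script prepares the collaborative-filtering ground truth: it maps
# each user's raw play counts to 1-5 ratings, keeps only songs whose audio
# clips were downloaded from 7digital, and writes ten random 80/20
# train/test splits to cross_validation.pkl.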
import pandas as pd
import cPickle as pickle
import numpy as np

# Normalise a user's play counts to 1-5 ratings
def normalise_user_play_count(user_df, cv_value=0.5, similar_score=3):
    """Map a user's play counts to discrete ratings.

    cv: coefficient of variation, a normalised measure of dispersion of a
    probability distribution. Users whose cv does not exceed cv_value are
    treated as having homogeneous listening habits.
    """
    cv = user_df.plays.std() / user_df.plays.mean()

    if cv <= cv_value:
        # Homogeneous listening habits: every song gets the same score
        user_df.plays = similar_score
        print "Homogeneous"
    else:
        # Rank songs by play count and compute the complementary cumulative
        # share: for each song, ccs is the fraction of the user's total
        # plays contributed by the songs ranked below it.
        user_df = user_df.sort(columns='plays', ascending=False)
        user_df['ccs'] = 1 - user_df.plays.cumsum() / float(user_df.plays.sum())

        # Quantise ccs into five rating bands (head of the ranking -> 5,
        # tail -> 1)
        user_df.loc[user_df.ccs >= 0.8, 'plays'] = 5
        user_df.loc[(user_df.ccs < 0.8) & (user_df.ccs >= 0.6), 'plays'] = 4
        user_df.loc[(user_df.ccs < 0.6) & (user_df.ccs >= 0.4), 'plays'] = 3
        user_df.loc[(user_df.ccs < 0.4) & (user_df.ccs >= 0.2), 'plays'] = 2
        user_df.loc[user_df.ccs < 0.2, 'plays'] = 1

        user_df = user_df.drop('ccs', 1)
    return user_df

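# Worked example (hypothetical user): play counts 10, 9, ..., 1 give
# cv ~ 0.55 > 0.5, so the ccs bands apply and the songs receive the
# ratings 5, 4, 3, 2, 2, 1, 1, 1, 1, 1 in descending play-count order.
# A user with counts 10, 10, 9, 10 has cv ~ 0.05, so every song scores 3.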
# User-item data frame
users_df = pd.read_pickle('/Users/paulochiliguano/Documents/msc-project/\
dataset/CF_dataset.pkl')

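# (Assumed schema, inferred from the code below: one row per user-song
# pair with columns 'user', 'song' and 'plays' holding raw play counts.)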
# SongIDs of downloaded audio clips
filename = '/Users/paulochiliguano/Documents/msc-project/dataset/7digital/\
CF_dataset_7digital.txt'
with open(filename, 'rb') as f:
    available_clips = [line.strip().split('\t')[0] for line in f]

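# (The 7digital index is assumed to be tab-separated with the songID in
# the first field; only that field is kept here.)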
# Ground truth: keep only tracks whose audio clips are available
users_df = users_df[users_df.song.isin(available_clips)]

# Optional: keep only users with at least 50 ratings
#users_df = users_df.groupby('user').filter(lambda x: len(x) >= 50)

# Normalise each user's play counts into ratings
users_norm_df = pd.DataFrame()
for k, v in users_df.groupby("user"):
    norm = normalise_user_play_count(v)
    users_norm_df = users_norm_df.append(norm)

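# Build the cross-validation splits: in each of the ten trials, roughly
# 20% of every user's rated songs are held out as the test set.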
trial = 10
users_train = []
users_test = []
for i in range(trial):
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    for k, v in users_norm_df.groupby("user"):
        # Sample ~20% of this user's songs (without replacement) for testing
        test_index = np.random.choice(
            v.index,
            int(len(v.index) / 5),
            replace=False
        )

        test_df = test_df.append(v.loc[test_index])
        train_df = train_df.append(v.loc[~v.index.isin(test_index)])

    # Store each trial as nested dicts: {user: {song: rating}}
    users_train.append({})
    for k, v in train_df.groupby("user"):
        users_train[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

    users_test.append({})
    for k, v in test_df.groupby("user"):
        users_test[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

# Save training and test sets
with open('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'wb') as f:
    pickle.dump((users_train, users_test), f, protocol=pickle.HIGHEST_PROTOCOL)
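# To reload the splits elsewhere (sketch, mirroring the dump above):
#   with open('.../cross_validation.pkl', 'rb') as f:
#       users_train, users_test = pickle.load(f)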