annotate Code/split_dataset.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents e4bcfe00abf4
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 18 01:41:18 2015

@author: paulochiliguano
"""

import pickle

import numpy as np
import pandas as pd


# Normalise a user's play counts to a discrete 1-5 rating scale
def normalise_user_play_count(user_df, cv_value=0.5, max_score=5,
                              similar_score=3):
    '''cv: coefficient of variation, a normalised measure of dispersion of a
    probability distribution'''
    cv = user_df.plays.std() / user_df.plays.mean()

    if cv <= cv_value:
        # Homogeneous listening habits: every track gets the same score
        user_df.plays = similar_score
        print("Homogeneous")
        #for songID, play_count in user[userID].items():
            #user[userID][songID] = 3
    else:
        # Complementary cumulative distribution of the play counts
        user_df = user_df.sort_values(by='plays', ascending=False)
        user_df['ccs'] = 1 - user_df.plays.cumsum() / float(user_df.plays.sum())

        # Map the upper tail of the distribution to the higher ratings
        user_df.loc[user_df.ccs >= 0.8, 'plays'] = 5
        user_df.loc[(user_df.ccs < 0.8) & (user_df.ccs >= 0.6), 'plays'] = 4
        user_df.loc[(user_df.ccs < 0.6) & (user_df.ccs >= 0.4), 'plays'] = 3
        user_df.loc[(user_df.ccs < 0.4) & (user_df.ccs >= 0.2), 'plays'] = 2
        user_df.loc[user_df.ccs < 0.2, 'plays'] = 1

        user_df = user_df.drop(columns='ccs')
        #song_play_count_q = pd.cut(
            #user_df["plays"],
            #max_score,
            #labels=False
        #) + 1
        #user_df.plays = song_play_count_q
        #user[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
    return user_df
#for userID in user:
    #song_play_count = pd.DataFrame(
        #user[userID].items(),
        #columns=["songID", "play_count"]
    #)
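
# Worked example (hypothetical numbers, not taken from the dataset): for a
# user with play counts [40, 30, 20, 10], cv = 12.91 / 25 ≈ 0.52 > 0.5, so the
# CCDF branch applies. Sorted in descending order, the cumulative shares are
# [0.4, 0.7, 0.9, 1.0], hence ccs = [0.6, 0.3, 0.1, 0.0], and the thresholds
# above map the plays to the ratings [4, 2, 1, 1].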

# User-item data frame
users_df = pd.read_pickle(
    '/Users/paulochiliguano/Documents/msc-project/dataset/CF_dataset.pkl'
)

# Normalise users' ratings
#users_norm_df = pd.DataFrame()
#for k, v in users_df.groupby("user"):
    #users_norm_df = users_norm_df.append(normalise_user_play_count(v))

# SongIDs of downloaded audio clips
filename = ('/Users/paulochiliguano/Documents/msc-project/dataset/7digital/'
            'CF_dataset_7digital.txt')
with open(filename, 'r') as f:
    available_clips = [line.strip().split('\t')[0] for line in f]

# Ground truth restricted to tracks whose audio clips are available
#users_ground_truth_df = users_norm_df[users_norm_df.song.isin(available_clips)]
users_df = users_df[users_df.song.isin(available_clips)]

# Users with more than 50 ratings
#users_df = users_df.groupby('user').filter(lambda x: len(x) >= 50)

# Normalise users' ratings
users_norm_df = pd.DataFrame()
for k, v in users_df.groupby("user"):
    norm = normalise_user_play_count(v)
    users_norm_df = pd.concat([users_norm_df, norm])
    # counts = norm['plays'].value_counts()
    # if counts[counts.index == 5].values > 0:
    #     users_norm_df = users_norm_df.append(norm)

#for k, v in users_norm_df.groupby('user'):
    # counts = v['plays'].value_counts()
    # df = v.loc[v['plays'].isin(counts[counts >= 5].index), :]
    # print(df)

trial = 10
users_train = []
users_test = []
#highest_rating = [4, 5]
#lowest_rating = [1, 2, 3]
for i in range(trial):
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    for k, v in users_norm_df.groupby("user"):
        # likes = v.loc[v['plays'].isin(highest_rating)]
        # dislikes = v.loc[v['plays'].isin(lowest_rating)]
        # test_like_index = np.random.choice(
        #     likes.index,
        #     1,
        #     replace=False
        # )
        # test_dislike_index = np.random.choice(
        #     dislikes.index,
        #     1,
        #     replace=False
        # )
        # test_index = np.append(test_like_index, test_dislike_index)
        # test_index = test_like_index
        # Hold out roughly 20% of each user's ratings for testing
        test_index = np.random.choice(
            v.index,
            int(len(v.index) / 5),
            replace=False
        )

        test_df = pd.concat([test_df, v.loc[test_index]])
        train_df = pd.concat([train_df, v.loc[~v.index.isin(test_index)]])

    # Nested dicts of the form {user: {song: rating}} for this trial
    users_train.append({})
    for k, v in train_df.groupby("user"):
        users_train[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

    users_test.append({})
    for k, v in test_df.groupby("user"):
        users_test[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

# Save training and test sets
filename = ('/Users/paulochiliguano/Documents/msc-project/dataset/'
            'cross_validation.pkl')
with open(filename, 'wb') as f:
    pickle.dump((users_train, users_test), f, protocol=pickle.HIGHEST_PROTOCOL)
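
# A minimal sketch (assuming the same path as above) of how a downstream
# script could reload the saved folds; users_train[i] and users_test[i] are
# {user: {song: rating}} dicts for trial i:
#
#     with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
#               'cross_validation.pkl', 'rb') as f:
#         users_train, users_test = pickle.load(f)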