annotate Code/split_dataset.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sat, 09 Jul 2022 00:50:43 -0500
parents e4bcfe00abf4
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 18 01:41:18 2015

@author: paulochiliguano
"""

import pickle

import numpy as np
import pandas as pd


# Normalise a user's play counts to a discrete 1-5 rating scale
def normalise_user_play_count(user_df, cv_value=0.5, max_score=5,
                              similar_score=3):
    '''cv: coefficient of variation, a normalised measure of dispersion of a
    probability distribution'''
    cv = user_df.plays.std() / user_df.plays.mean()

    if cv <= cv_value:
        # Homogeneous listening habits: every track gets the same score
        user_df.plays = similar_score
        print("Homogeneous")
        #for songID, play_count in user[userID].items():
            #user[userID][songID] = 3
    else:
        # Complementary cumulative distribution of the play counts
        user_df = user_df.sort_values(by='plays', ascending=False)
        user_df['ccs'] = 1 - user_df.plays.cumsum() / float(user_df.plays.sum())

        # Map the upper tail of the distribution to the higher ratings
        user_df.loc[user_df.ccs >= 0.8, 'plays'] = 5
        user_df.loc[(user_df.ccs < 0.8) & (user_df.ccs >= 0.6), 'plays'] = 4
        user_df.loc[(user_df.ccs < 0.6) & (user_df.ccs >= 0.4), 'plays'] = 3
        user_df.loc[(user_df.ccs < 0.4) & (user_df.ccs >= 0.2), 'plays'] = 2
        user_df.loc[user_df.ccs < 0.2, 'plays'] = 1

        user_df = user_df.drop(columns='ccs')
        #song_play_count_q = pd.cut(
            #user_df["plays"],
            #max_score,
            #labels=False
        #) + 1
        #user_df.plays = song_play_count_q
        #user[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
    return user_df
#for userID in user:
    #song_play_count = pd.DataFrame(
        #user[userID].items(),
        #columns=["songID", "play_count"]
    #)
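
# Worked example (hypothetical numbers, not taken from the dataset): for a
# user with play counts [40, 30, 20, 10], cv = 12.91 / 25 ≈ 0.52 > 0.5, so the
# CCDF branch applies. Sorted in descending order, the cumulative shares are
# [0.4, 0.7, 0.9, 1.0], hence ccs = [0.6, 0.3, 0.1, 0.0], and the thresholds
# above map the plays to the ratings [4, 2, 1, 1].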

# User-item data frame
users_df = pd.read_pickle(
    '/Users/paulochiliguano/Documents/msc-project/dataset/CF_dataset.pkl'
)

# Normalise users' ratings
#users_norm_df = pd.DataFrame()
#for k, v in users_df.groupby("user"):
    #users_norm_df = users_norm_df.append(normalise_user_play_count(v))

# SongIDs of downloaded audio clips
filename = ('/Users/paulochiliguano/Documents/msc-project/dataset/7digital/'
            'CF_dataset_7digital.txt')
with open(filename, 'r') as f:
    available_clips = [line.strip().split('\t')[0] for line in f]

# Ground truth restricted to tracks whose audio clips are available
#users_ground_truth_df = users_norm_df[users_norm_df.song.isin(available_clips)]
users_df = users_df[users_df.song.isin(available_clips)]

# Users with more than 50 ratings
#users_df = users_df.groupby('user').filter(lambda x: len(x) >= 50)

# Normalise users' ratings
users_norm_df = pd.DataFrame()
for k, v in users_df.groupby("user"):
    norm = normalise_user_play_count(v)
    users_norm_df = pd.concat([users_norm_df, norm])
    # counts = norm['plays'].value_counts()
    # if counts[counts.index == 5].values > 0:
    #     users_norm_df = users_norm_df.append(norm)

#for k, v in users_norm_df.groupby('user'):
    # counts = v['plays'].value_counts()
    # df = v.loc[v['plays'].isin(counts[counts >= 5].index), :]
    # print(df)

trial = 10
users_train = []
users_test = []
#highest_rating = [4, 5]
#lowest_rating = [1, 2, 3]
for i in range(trial):
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    for k, v in users_norm_df.groupby("user"):
        # likes = v.loc[v['plays'].isin(highest_rating)]
        # dislikes = v.loc[v['plays'].isin(lowest_rating)]
        # test_like_index = np.random.choice(
        #     likes.index,
        #     1,
        #     replace=False
        # )
        # test_dislike_index = np.random.choice(
        #     dislikes.index,
        #     1,
        #     replace=False
        # )
        # test_index = np.append(test_like_index, test_dislike_index)
        # test_index = test_like_index
        # Hold out roughly 20% of each user's ratings for testing
        test_index = np.random.choice(
            v.index,
            int(len(v.index) / 5),
            replace=False
        )

        test_df = pd.concat([test_df, v.loc[test_index]])
        train_df = pd.concat([train_df, v.loc[~v.index.isin(test_index)]])

    # Nested dicts of the form {user: {song: rating}} for this trial
    users_train.append({})
    for k, v in train_df.groupby("user"):
        users_train[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

    users_test.append({})
    for k, v in test_df.groupby("user"):
        users_test[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

# Save training and test sets
filename = ('/Users/paulochiliguano/Documents/msc-project/dataset/'
            'cross_validation.pkl')
with open(filename, 'wb') as f:
    pickle.dump((users_train, users_test), f, protocol=pickle.HIGHEST_PROTOCOL)
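
# A minimal sketch (assuming the same path as above) of how a downstream
# script could reload the saved folds; users_train[i] and users_test[i] are
# {user: {song: rating}} dicts for trial i:
#
#     with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
#               'cross_validation.pkl', 'rb') as f:
#         users_train, users_test = pickle.load(f)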