hybrid-music-recommender-using-content-based-and-social-information: view Code/split_dataset.py @ 25:fafc0b249a73
Final code
author   | Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date     | Sun, 23 Aug 2015 16:47:54 +0100
parents  |
children | e4bcfe00abf4
line source
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 18 01:41:18 2015

@author: paulochiliguano
"""

import pandas as pd
import cPickle as pickle
#import random
import numpy as np


# Normalise a user's play counts into 1-5 ratings
def normalise_user_play_count(user_df, cv_value=0.5, max_score=5,
                              similar_score=3):
    '''cv: Coefficient of variation, a normalised measure of dispersion
    of a probability distribution'''
    cv = user_df.plays.std() / user_df.plays.mean()
    if cv <= cv_value:
        # Homogeneous listening habits: every song gets the same score
        user_df.plays = similar_score
        print "Homogeneous"
        #for songID, play_count in user[userID].items():
            #user[userID][songID] = 3
    else:
        # Complementary cumulative distribution of play counts,
        # binned into scores 5 (top of the distribution) down to 1
        user_df = user_df.sort(columns='plays', ascending=False)
        user_df['ccs'] = 1 - user_df.plays.cumsum() / float(user_df.plays.sum())
        user_df.loc[user_df.ccs >= 0.8, 'plays'] = 5
        user_df.loc[(user_df.ccs < 0.8) & (user_df.ccs >= 0.6), 'plays'] = 4
        user_df.loc[(user_df.ccs < 0.6) & (user_df.ccs >= 0.4), 'plays'] = 3
        user_df.loc[(user_df.ccs < 0.4) & (user_df.ccs >= 0.2), 'plays'] = 2
        user_df.loc[user_df.ccs < 0.2, 'plays'] = 1
        user_df = user_df.drop('ccs', 1)
        #song_play_count_q = pd.cut(
            #user_df["plays"],
            #max_score,
            #labels=False
        #) + 1
        #user_df.plays = song_play_count_q
        #user[userID] = song_play_count.set_index('songID')['play_count'].to_dict()
    return user_df

#for userID in user:
    #song_play_count = pd.DataFrame(
        #user[userID].items(),
        #columns=["songID", "play_count"]
    #)

# User-item data frame
users_df = pd.read_pickle('/Users/paulochiliguano/Documents/msc-project/\
dataset/CF_dataset.pkl')

# Normalise users' ratings (moved below, after filtering)
#users_norm_df = pd.DataFrame()
#for k, v in users_df.groupby("user"):
    #users_norm_df = users_norm_df.append(normalise_user_play_count(v))

# SongIDs of downloaded audio clips
filename = '/Users/paulochiliguano/Documents/msc-project/dataset/7digital/\
CF_dataset_7digital.txt'
with open(filename, 'rb') as f:
    available_clips = [line.strip().split('\t')[0] for line in f]

# Keep only ratings for tracks whose audio clips are available
#users_ground_truth_df = users_norm_df[users_norm_df.song.isin(available_clips)]
users_df = users_df[users_df.song.isin(available_clips)]

# Keep only users with at least 50 ratings
users_df = users_df.groupby('user').filter(lambda x: len(x) >= 50)

# Normalise users' ratings
users_norm_df = pd.DataFrame()
for k, v in users_df.groupby("user"):
    users_norm_df = users_norm_df.append(normalise_user_play_count(v))

# Repeated random sub-sampling: 10 trials, with 20% of each user's
# ratings held out for testing in every trial
trial = 10
users_train = []
users_test = []
#songs_train = []
#songs_test = []
for i in range(trial):
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    for k, v in users_norm_df.groupby("user"):
        test_index = np.random.choice(
            v.index,
            int(len(v.index) / 5),
            replace=False
        )
        #test_index = np.random.choice(
            #v.index,
            #1,
            #replace=False
        #)
        test_df = test_df.append(v.loc[test_index])
        train_df = train_df.append(v.loc[~v.index.isin(test_index)])
    users_train.append({})
    for k, v in train_df.groupby("user"):
        users_train[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }
    users_test.append({})
    for k, v in test_df.groupby("user"):
        users_test[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }
    #songs_test.append([])
    #songs_test[i] = list(users_test.drop_duplicates(subset='song').song)

# Filtered song library
# Save training and test sets
f = open('/Users/paulochiliguano/Documents/msc-project/dataset/\
cross_validation.pkl', 'wb')
pickle.dump((users_train, users_test), f, protocol=pickle.HIGHEST_PROTOCOL)
f.close()
# Ground truth dictionary
#users_ground_truth = {}
#for k, v in users_norm_df.groupby("user"):
    #users_ground_truth[k] = {
        #x: y["plays"].values[0] for x, y in v.groupby("song")
    #}

# Ground truth dictionary
#users_ground_truth = {}
#for k, v in users_ground_truth_df.groupby("user"):
    #users_ground_truth[k] = {
        #x: y["plays"].values[0] for x, y in v.groupby("song")
    #}

'''
# Dataset for training/test
songIDs = song_library.keys()
dataset_keys = set(songIDs)
random.shuffle(songIDs)
folds = 10
fold_size = int(round(len(songIDs) / folds))
song_library_test = []
song_library_train = []
users_train = []
users_test = []
for i in range(folds):
    song_library_test.append([])
    song_library_test[i] = {
        k: song_library[k] for k in songIDs[
            (i * fold_size):((i + 1) * fold_size)
        ]
    }
    test_keys = set(song_library_test[i].keys())
    train_keys = dataset_keys - test_keys
    song_library_train.append([])
    song_library_train[i] = {k: song_library[k] for k in train_keys}
    users_train_df = users_ground_truth_df[
        users_ground_truth_df.song.isin(train_keys)
    ]
    users_train.append([])
    users_train[i] = {}
    for k, v in users_train_df.groupby("user"):
        users_train[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }
    users_test_df = users_ground_truth_df[
        users_ground_truth_df.song.isin(test_keys)
    ]
    users_test.append([])
    users_test[i] = {}
    for k, v in users_test_df.groupby("user"):
        users_test[i][k] = {
            x: y["plays"].values[0] for x, y in v.groupby("song")
        }

for k, v in users_norm_df.groupby("user"):
    test_index = np.random.choice(v.index, int(len(v.index)/10), replace=False)
    print v.plays.count(), test_index.shape
'''
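
For orientation, a minimal standalone sketch of the rating scheme that normalise_user_play_count implements, run on hypothetical play counts; it uses plain numpy only, and np.digitize stands in for the chain of .loc assignments with the same 0.2-wide ccs bins:

import numpy as np

# Hypothetical play counts for one user, listed in descending order
plays = np.array([50, 45, 40, 35, 30, 25, 20, 15, 10, 5])
cv = plays.std(ddof=1) / plays.mean()   # ddof=1 matches pandas Series.std()
if cv <= 0.5:
    # Homogeneous listening habits: every song gets the same score
    ratings = np.full(len(plays), 3, dtype=int)
else:
    # Complementary cumulative share of plays, binned into scores 1..5
    ccs = 1 - plays.cumsum() / float(plays.sum())
    ratings = np.digitize(ccs, [0.2, 0.4, 0.6, 0.8]) + 1
print ratings   # cv ~ 0.55 here, so the CCDF branch gives [5 4 3 2 2 1 1 1 1 1]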
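
A downstream script can reload the persisted splits; a minimal sketch, assuming the same Python 2 environment and the output path used above:

import cPickle as pickle

with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    users_train, users_test = pickle.load(f)

# users_train[i] and users_test[i] map userID -> {songID: rating} for trial i
print len(users_train), 'trials'
print len(users_train[0]), 'users in the first training split'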
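
Since the 20% test sample is drawn per user without replacement, each trial's train and test dictionaries should be disjoint, assuming every (user, song) pair occurs once in users_norm_df; a quick sanity check along those lines:

# One assert per (trial, user); fails loudly if a song leaked into both sets
for i in range(len(users_train)):
    for user, test_items in users_test[i].iteritems():
        shared = set(test_items) & set(users_train[i].get(user, {}))
        assert not shared, 'trial %d, user %s overlap: %r' % (i, user, shared)
print 'No train/test overlap found'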