# -*- coding: utf-8 -*-
"""
Created on Tue Aug 18 01:41:18 2015

@author: paulochiliguano
"""

import pandas as pd
import cPickle as pickle
import numpy as np


def normalise_user_play_count(user_df, cv_value=0.5, similar_score=3):
    """Map a user's raw play counts onto a 1-5 rating scale.

    cv: coefficient of variation, a normalised measure of dispersion of a
    probability distribution. Users whose play counts have cv <= cv_value
    are treated as having homogeneous listening habits and every song is
    given the constant similar_score; otherwise the complementary
    cumulative share of plays is quantised into five bands.
    """
    cv = user_df.plays.std() / user_df.plays.mean()

    if cv <= cv_value:
        # Homogeneous listening habits: flatten all ratings.
        user_df.plays = similar_score
        print "Homogeneous"
    else:
        # Complementary cumulative distribution of play counts
        # (use .sort(columns=...) instead of sort_values in pandas < 0.17).
        user_df = user_df.sort_values(by='plays', ascending=False)
        user_df['ccs'] = 1 - user_df.plays.cumsum() / float(user_df.plays.sum())

        # Quantise the complementary cumulative share into ratings 5..1.
        user_df.loc[user_df.ccs >= 0.8, 'plays'] = 5
        user_df.loc[(user_df.ccs < 0.8) & (user_df.ccs >= 0.6), 'plays'] = 4
        user_df.loc[(user_df.ccs < 0.6) & (user_df.ccs >= 0.4), 'plays'] = 3
        user_df.loc[(user_df.ccs < 0.4) & (user_df.ccs >= 0.2), 'plays'] = 2
        user_df.loc[user_df.ccs < 0.2, 'plays'] = 1

        user_df = user_df.drop('ccs', axis=1)
    return user_df

# User-item data frame
users_df = pd.read_pickle(
    '/Users/paulochiliguano/Documents/msc-project/dataset/CF_dataset.pkl')

# SongIDs of downloaded audio clips
filename = ('/Users/paulochiliguano/Documents/msc-project/dataset/7digital/'
            'CF_dataset_7digital.txt')
with open(filename, 'rb') as f:
    available_clips = [line.strip().split('\t')[0] for line in f]

# Ground truth restricted to tracks whose audio clips are available
users_df = users_df[users_df.song.isin(available_clips)]

# Optionally restrict to users with at least 50 ratings
#users_df = users_df.groupby('user').filter(lambda x: len(x) >= 50)

# Normalise every user's play counts into ratings
users_norm_df = pd.DataFrame()
for k, v in users_df.groupby("user"):
    users_norm_df = users_norm_df.append(normalise_user_play_count(v))
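# Sanity check (illustrative addition, not part of the original pipeline):
# after normalisation every rating should sit on the 1-5 scale, with
# homogeneous listeners flattened to similar_score.
assert users_norm_df.plays.between(1, 5).all(), 'rating outside the 1-5 scale'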
# Build `trial` random 80/20 train/test splits per user
trial = 10
users_train = []
users_test = []
for i in range(trial):
    test_df = pd.DataFrame()
    train_df = pd.DataFrame()
    for k, v in users_norm_df.groupby("user"):
        # Hold out ~20% of each user's rated songs for testing
        test_index = np.random.choice(
            v.index,
            len(v.index) // 5,
            replace=False
        )
        test_df = test_df.append(v.loc[test_index])
        train_df = train_df.append(v.loc[~v.index.isin(test_index)])

    # Nested dicts {user: {song: rating}} for the recommender
    users_train.append({})
    for k, v in train_df.groupby("user"):
        users_train[i][k] = v.set_index('song')['plays'].to_dict()

    users_test.append({})
    for k, v in test_df.groupby("user"):
        users_test[i][k] = v.set_index('song')['plays'].to_dict()

# Save training and test sets
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'wb') as f:
    pickle.dump((users_train, users_test), f, protocol=pickle.HIGHEST_PROTOCOL)
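# Round-trip check (illustrative sketch, not in the original script): reload
# the pickle and confirm one train/test pair came back per trial.
with open('/Users/paulochiliguano/Documents/msc-project/dataset/'
          'cross_validation.pkl', 'rb') as f:
    _train_chk, _test_chk = pickle.load(f)
assert len(_train_chk) == trial and len(_test_chk) == trial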