hybrid-music-recommender-using-content-based-and-social-information
changeset 14:c63dac455296
Forgotten WMF code
author    Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date      Tue, 21 Jul 2015 11:54:30 +0100
parents   0a0d6203638a
children  2e3c57fba632 c0a08cbdfacd
files     Code/latent_vectors.py Code/read_taste_profile.py
diffstat  2 files changed, 33 insertions(+), 29 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/latent_vectors.py	Tue Jul 21 11:54:30 2015 +0100
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 20 13:37:43 2015
+
+@author: Paulo
+"""
+
+
+import pandas as pd
+import csv
+import itertools
+import numpy as np
+import wmf
+
+# Read songID of downloaded audio clips
+with open('/homes/pchilguano/dataset/audio_files.txt', 'rb') as input1:
+    available = list(csv.reader(input1))
+    chain1 = list(itertools.chain(*available))
+
+# Sparse user-item matrix
+result = pd.DataFrame()
+for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory = False, delim_whitespace=False, chunksize=10000, names=['user','song','plays'], header=None):
+    chunk = chunk[chunk.song.isin(chain1)]
+    result = result.append(chunk.pivot(index='user', columns='song', values='plays')
+                           , ignore_index=True)
+    print (result.shape)
+sresult = result.to_sparse()
+sresult.to_pickle('/homes/pchilguano/dataset/taste_profile_sparse.pkl')
+
+# Weight Matrix Factorization
+B = np.load("test_matrix.pkl")
+S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)
+U, V = wmf.factorize(S, num_factors=40, lambda_reg=1e-5, num_iterations=2, init_std=0.01, verbose=True, dtype='float32', recompute_factors=wmf.recompute_factors_bias)
\ No newline at end of file
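A note on the new WMF step: the wmf module imported above appears to be Sander Dieleman's weighted matrix factorization script (wmf.py), where log_surplus_confidence_matrix turns the sparse play-count matrix B into a confidence matrix S = alpha * log(1 + B / epsilon), and factorize runs alternating least squares on S to produce user factors U and item factors V. Below is a minimal sketch of how the returned factors could be used to rank songs for a user. It assumes plain factor vectors (recompute_factors_bias reserves factor columns for bias terms, which a faithful scorer would handle separately), and the helper name top_n_songs is hypothetical, not part of wmf.py:

    import numpy as np

    def top_n_songs(U, V, user_index, n=10):
        # Predicted preference for each song is the inner product of the
        # user's latent vector with that song's latent vector.
        # NOTE: illustrative sketch; assumes U, V come from wmf.factorize
        # without separate bias handling.
        scores = U[user_index].dot(V.T)
        # Column indices (songs) sorted by descending score, truncated to n.
        return np.argsort(scores)[::-1][:n]

    # e.g. the ten highest-scoring songs for the first user:
    # print(top_n_songs(U, V, user_index=0))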
--- a/Code/read_taste_profile.py	Tue Jul 21 10:03:34 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,29 +0,0 @@
-import csv
-import pandas as pd
-import itertools
-
-# Read songIDs from Million Song Dataset songID-trackID mismatches
-with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
-    writer = csv.writer(out, delimiter=',')
-    next = f.readline()
-    while next != "":
-        writer.writerow([next[8:26]])
-        #print(next[8:26])
-        next = f.readline()
-
-# Read unique songIDs from Taste Profile dataset
-location = r'~/dataset/train_triplets.txt'
-df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'])
-ddf = df.drop_duplicates(subset = 'song')
-ddf.to_csv('train_triplets_songID.csv',columns=['song'], header=False, index=False)
-
-# Delete songIDs mismatches from Taste Profile Dataset
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
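The deleted script above had two fragile spots worth noting: it shadowed the built-in next with its readline loop, and it pulled song IDs out of sid_mismatches.txt with a hard-coded slice (next[8:26]). For reference, a minimal sketch of the same set-difference step in more idiomatic Python; the file names mirror the originals, but the rewrite itself is illustrative, not code from this repository:

    import csv

    # Song IDs flagged as songID-trackID mismatches in the Million Song Dataset.
    with open('sid_mismatches_songID.txt') as f:
        mismatched = {row[0] for row in csv.reader(f) if row}

    # Every unique song ID in the Taste Profile dataset.
    with open('train_triplets_songID.csv') as f:
        all_songs = {row[0] for row in csv.reader(f) if row}

    # Keep only songs without a known mismatch; sort for deterministic output.
    with open('echonest_songID.txt', 'w', newline='') as out:
        writer = csv.writer(out)
        for song_id in sorted(all_songs - mismatched):
            writer.writerow([song_id])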