# HG changeset patch
# User Paulo Chiliguano
# Date 1437476070 -3600
# Node ID c63dac455296239fcc5164fb424d286eacd84baf
# Parent 0a0d6203638af01dab84281a5e4a661293457c64
Forgotten WMF code

diff -r 0a0d6203638a -r c63dac455296 Code/latent_vectors.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Code/latent_vectors.py	Tue Jul 21 11:54:30 2015 +0100
@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 20 13:37:43 2015
+
+@author: Paulo
+"""
+
+
+import pandas as pd
+import csv
+import itertools
+import numpy as np
+import wmf
+
+# Read songIDs of the downloaded audio clips
+with open('/homes/pchilguano/dataset/audio_files.txt', 'rb') as input1:
+    available = list(csv.reader(input1))
+    chain1 = list(itertools.chain(*available))
+
+# Build the sparse user-item matrix in chunks
+result = pd.DataFrame()
+for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory=False, delim_whitespace=False, chunksize=10000, names=['user', 'song', 'plays'], header=None):
+    chunk = chunk[chunk.song.isin(chain1)]
+    result = result.append(chunk.pivot(index='user', columns='song', values='plays'),
+                           ignore_index=True)
+    print(result.shape)
+sresult = result.to_sparse()
+sresult.to_pickle('/homes/pchilguano/dataset/taste_profile_sparse.pkl')
+
+# Weighted Matrix Factorization
+B = np.load("test_matrix.pkl")
+S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)
+U, V = wmf.factorize(S, num_factors=40, lambda_reg=1e-5, num_iterations=2, init_std=0.01, verbose=True, dtype='float32', recompute_factors=wmf.recompute_factors_bias)
\ No newline at end of file
diff -r 0a0d6203638a -r c63dac455296 Code/read_taste_profile.py
--- a/Code/read_taste_profile.py	Tue Jul 21 10:03:34 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,29 +0,0 @@
-import csv
-import pandas as pd
-import itertools
-
-# Read songIDs from the Million Song Dataset songID-trackID mismatch list
-with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
-    writer = csv.writer(out, delimiter=',')
-    line = f.readline()
-    while line != "":
-        writer.writerow([line[8:26]])
-        #print(line[8:26])
-        line = f.readline()
-
-# Read unique songIDs from the Taste Profile dataset
-location = r'~/dataset/train_triplets.txt'
-df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user', 'song', 'plays'])
-ddf = df.drop_duplicates(subset='song')
-ddf.to_csv('train_triplets_songID.csv', columns=['song'], header=False, index=False)
-
-# Remove mismatched songIDs from the Taste Profile dataset
-with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
-    l1 = list(csv.reader(input1))
-    chain1 = list(itertools.chain(*l1))
-    l2 = list(csv.reader(input2))
-    chain2 = list(itertools.chain(*l2))
-    l3 = set(chain2) - set(chain1)
-    wr = csv.writer(myfile, delimiter=',')
-    for item in l3:
-        wr.writerow([item])
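
Note on the user-item matrix step: the pivot-and-append loop in latent_vectors.py materialises each chunk as a dense user-by-song frame before sparsifying, and wmf.factorize consumes a scipy.sparse matrix rather than the pickled SparseDataFrame. A minimal sketch of building the play-count matrix B directly in sparse form, assuming the same comma-separated triplet file and the chain1 list of available songIDs from the script above:

    import pandas as pd
    import scipy.sparse

    # Assumes train_triplets_wo_mismatches.csv and `chain1` as defined in
    # latent_vectors.py above; both names come from that script.
    triplets = pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
                           names=['user', 'song', 'plays'], header=None)
    triplets = triplets[triplets.song.isin(set(chain1))]
    # Categorical codes give each user/song a stable integer row/column index.
    users = triplets['user'].astype('category')
    songs = triplets['song'].astype('category')
    B = scipy.sparse.coo_matrix(
        (triplets['plays'].astype('float32').values,
         (users.cat.codes.values, songs.cat.codes.values))).tocsr()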
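Note on the factorization step: wmf.log_surplus_confidence_matrix converts raw play counts into the confidence weights that wmf.factorize minimises against. Assuming the imported wmf module is Sander Dieleman's weighted matrix factorization code, the transform touches only the nonzero entries, roughly:

    import numpy as np

    # Sketch of the log "surplus confidence" weighting (an assumption about
    # the imported wmf module): each observed play count b gets weight
    # alpha * log(1 + b / epsilon); unobserved entries stay at zero.
    def log_surplus_confidence(B, alpha=2.0, epsilon=1e-6):
        S = B.copy()
        S.data = alpha * np.log(1 + S.data / epsilon)
        return S

With alpha=2.0 and epsilon=1e-6 as in the patch, even a single play carries a large confidence relative to the implicit zeros, the usual regime for the implicit-feedback weighting of Hu, Koren and Volinsky.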