view Code/latent_vectors.py @ 18:c0a08cbdfacd

First script
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Tue, 28 Jul 2015 20:58:57 +0100
parents c63dac455296
children 2e3c57fba632
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 20 13:37:43 2015

@author: Paulo
"""


import pandas as pd
import csv
import itertools
import numpy as np
import wmf

# Read songID of downloaded audio clips
# Every row of the file is parsed by csv.reader; the rows are then flattened
# into chain1, a single list holding every field in file order.
with open('/homes/pchilguano/dataset/audio_files.txt', 'rb') as id_file:
    available = list(csv.reader(id_file))
# The rows are fully materialised above, so flattening no longer needs the
# file to be open.
chain1 = list(itertools.chain.from_iterable(available))
    
# Sparse user-item matrix
# Stream the play-count triplets in chunks and keep only the rows whose song
# is listed in chain1.  The filtered long-format chunks are collected in a
# list and concatenated once: growing a DataFrame with append() inside the
# loop re-copies everything accumulated so far on every iteration.
kept_chunks = []
for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory=False, chunksize=10000, names=['user', 'song', 'plays'], header=None):
    kept_chunks.append(chunk[chunk.song.isin(chain1)])
triplets = pd.concat(kept_chunks, ignore_index=True)
# Pivot once over all kept triplets so that each user is exactly one row,
# labelled by its user ID.  Pivoting per chunk and appending with
# ignore_index=True discarded the user labels and split any user whose
# triplets straddled a chunk boundary into several unrelated rows.
# pivot() raises ValueError if a (user, song) pair occurs more than once.
result = triplets.pivot(index='user', columns='song', values='plays')
print (result.shape)
sresult = result.to_sparse()
sresult.to_pickle('/homes/pchilguano/dataset/taste_profile_sparse.pkl')

# Weighted Matrix Factorization
# NOTE(review): the input is "test_matrix.pkl" resolved against the current
# working directory, not the taste_profile_sparse.pkl pickle written earlier
# in this script -- confirm that this is the intended matrix.
# NOTE(review): the .pkl extension suggests a pickled object; only load such
# files from a trusted source.
B = np.load("test_matrix.pkl")
# Confidence matrix derived from B by the wmf helper.
S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)
# Factorize S into the two factor matrices U and V (presumably user and item
# factors respectively -- confirm against the wmf module).
U, V = wmf.factorize(
    S,
    num_factors=40,
    lambda_reg=1e-5,
    num_iterations=2,
    init_std=0.01,
    verbose=True,
    dtype='float32',
    recompute_factors=wmf.recompute_factors_bias,
)