Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
view Code/read_taste_profile.py @ 7:4de098e10bbb
Read list of files and trying to chunk dataset
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Sun, 12 Jul 2015 23:56:25 +0100 |
parents | 566c90064778 |
children | 28f15e232028 |
line wrap: on
line source
"""Build the track list for the audio dataset and aggregate the taste profile.

Steps performed when the file is run as a script:

1. Walk the audio directory, derive one track id per ``.h5`` file and write
   the ids to a one-column CSV file.
2. Read that CSV back into ``your_list``.
3. Stream the ``user song plays`` triplets file in chunks into an HDF5 store.
4. Sum the play counts per (user, song) pair chunk by chunk into ``res``.
5. Reshape the totals into a sparse user x song matrix, ``sdf``.

NOTE(review): the CSV helpers use Python 3 text-mode file handling
(``newline=''``); the original ``'wb'``/``'rb'`` modes only work on Python 2.
"""
import os
import csv

import pandas as pd
import numpy as np

TRACK_LIST_CSV = '/homes/pchilguano/dataset/cal10kHDF5.csv'
AUDIO_DIR = "/homes/pchilguano/dataset/cal10k"
STORE_PATH = '/homes/pchilguano/dataset/store.h5'
TRIPLET_COLUMNS = ['user', 'song', 'plays']
STORE_KEY = 'df'
# Minimum widths reserved for the string columns of the HDF5 table.  A table
# fixes its string width when it is created, so a later chunk holding a longer
# id than the first chunk would otherwise be rejected.
# NOTE(review): 64 characters is a generous guess -- confirm against the data.
MIN_ITEMSIZE = {'user': 64, 'song': 64}


def track_id_from_filename(filename):
    """Return 'SO' followed by *filename* minus its first two characters
    and its three-character '.h5' suffix."""
    return 'SO' + str(filename)[2:-3]


def find_track_ids(dataset_dir):
    """Yield one track id for every '.h5' file found below *dataset_dir*."""
    for _root, _dirs, files in os.walk(dataset_dir):
        for name in files:
            if name.endswith(".h5"):
                yield track_id_from_filename(name)


def write_track_list(dataset_dir, csv_path):
    """Write the track ids found below *dataset_dir* to *csv_path*.

    Each id goes on its own row and is also printed, as a progress trace.
    """
    # The csv module needs a text-mode file opened with newline='' so that
    # it controls the line endings itself.
    with open(csv_path, 'w', newline='') as out:
        writer = csv.writer(out, delimiter=',')
        for track in find_track_ids(dataset_dir):
            print(track)
            writer.writerow([track])


def read_track_list(csv_path):
    """Return the rows of *csv_path* as a list of lists of strings."""
    with open(csv_path, 'r', newline='') as f:
        return list(csv.reader(f))


def read_triplet_chunks(location, chunksize):
    """Return an iterator of DataFrames over the triplets file.

    The file is whitespace separated with no header row; columns are named
    'user', 'song' and 'plays'.  Each frame holds at most *chunksize* rows.
    """
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    return pd.read_csv(location, sep=r'\s+', header=None,
                       names=TRIPLET_COLUMNS, chunksize=chunksize)


def store_triplets(store, location, chunksize, key=STORE_KEY):
    """Append every chunk of the triplets file to table *key* of *store*.

    Chunks are stored in long (user, song, plays) form.  Pivoting each chunk
    first gives every chunk a different set of song columns, which cannot be
    appended to a single HDF5 table.

    NOTE(review): running this twice against the same store appends the data
    twice; delete the key first if a fresh table is wanted.
    """
    for chunk in read_triplet_chunks(location, chunksize):
        store.append(key, chunk, min_itemsize=MIN_ITEMSIZE)


def sum_plays(frames):
    """Return the total plays per (user, song) over an iterable of frames.

    Each frame must have 'user', 'song' and 'plays' columns.  The result is
    a DataFrame with a single 'plays' column indexed by (user, song).

    Raises:
        ValueError: if *frames* yields nothing.
    """
    total = None
    for frame in frames:
        part = frame.groupby(['user', 'song'])[['plays']].sum()
        if total is None:
            total = part
        else:
            # fill_value=0 keeps pairs that appear in only one operand.
            total = total.add(part, fill_value=0)
    if total is None:
        raise ValueError('no chunks to aggregate')
    return total


def to_sparse_matrix(totals):
    """Return a sparse user x song play-count matrix built from *totals*.

    Pairs with no plays are NaN, which is also the sparse fill value.
    DataFrame.to_sparse() no longer exists in pandas; casting to a
    SparseDtype is its replacement.

    NOTE(review): unstack() builds a dense matrix before the cast, so this
    needs memory proportional to users x songs.
    """
    matrix = totals['plays'].unstack('song')
    return matrix.astype(pd.SparseDtype('float64', np.nan))


if __name__ == '__main__':
    # List of h5 files (audio streams)
    write_track_list(AUDIO_DIR, TRACK_LIST_CSV)
    your_list = read_track_list(TRACK_LIST_CSV)

    location = r'~/dataset/train_triplets.txt'
    chunksize = 4

    store = pd.HDFStore(STORE_PATH)
    try:
        store_triplets(store, location, chunksize)
        df = store[STORE_KEY]
        res = sum_plays(store.select(STORE_KEY, chunksize=500))
    finally:
        # Always release the HDF5 file handle, even if a step above fails.
        store.close()

    sdf = to_sparse_matrix(res)