Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
changeset 7:4de098e10bbb
Read list of files and trying to chunk dataset
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Sun, 12 Jul 2015 23:56:25 +0100 |
parents | 41e14a539dd3 |
children | 28f15e232028 |
files | Code/read_taste_profile.py |
diffstat | 1 files changed, 40 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/Code/read_taste_profile.py Sat Jul 11 21:51:15 2015 +0100
+++ b/Code/read_taste_profile.py Sun Jul 12 23:56:25 2015 +0100
@@ -1,12 +1,43 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jul 11 15:34:52 2015
-
-@author: Paulo
-"""
-
+import os
+import csv
 import pandas as pd
 import numpy as np
 
-df = pd.read_csv("C:/Users/Paulo/Documents/Queen Mary/MSc Project/Dataset/train_triplets.txt")
-df.head()
\ No newline at end of file
+# List of h5 files (audio streams)
+with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
+    writer = csv.writer(out, delimiter=',')
+    for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
+        for file in files:
+            if file.endswith(".h5"):
+                #print(os.path.join(root, file))
+                track = ''.join(['SO',str(file)[2:-3]])
+                print(track)
+                writer.writerow([track])
+
+with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'rb') as f:
+    reader = csv.reader(f)
+    your_list = list(reader)
+
+
+store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
+location = r'~/dataset/train_triplets.txt'
+chunksize = 4
+for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
+    #frame = pd.Dataframe()
+    #frame = chunk.query('song == your_list')
+    frame = chunk.pivot(index='user', columns='song', values='plays')
+    store.append('df', frame)
+
+
+
+
+df = store['df']
+chunks = (df.groupby().sum() for df in store.select('df', chunksize=500))
+res = next(chunks) # will raise if there are no chunks!
+for c in chunks:
+    res = res.add(c, fill_value=0)
+
+
+sdf = df.to_sparse()
+
+