view Code/read_taste_profile.py @ 7:4de098e10bbb

Read the list of files and try to chunk the dataset
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Sun, 12 Jul 2015 23:56:25 +0100
parents 566c90064778
children 28f15e232028
line wrap: on
line source
import os
import csv
import pandas as pd
import numpy as np

# Walk the cal10k tree and record one ID per HDF5 (audio stream) file in a CSV.
# NOTE(review): 'wb' is the mode the Python 2 csv module expects; under
# Python 3 csv.writer needs a text-mode file -- confirm the interpreter.
with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as csv_out:
    id_writer = csv.writer(csv_out, delimiter=',')
    for _folder, _subdirs, names in os.walk("/homes/pchilguano/dataset/cal10k"):
        for h5_name in (n for n in names if n.endswith(".h5")):
            # 'SO' replaces the first two characters; [:-3] strips '.h5'.
            song_id = 'SO' + str(h5_name)[2:-3]
            print(song_id)
            id_writer.writerow([song_id])
				
# Load the ID catalogue back into memory; every CSV row becomes one list of
# its fields, so your_list is a list of lists.
# NOTE(review): 'rb' matches the Python 2 csv module; Python 3 would need a
# text-mode file here -- confirm the interpreter.
with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'rb') as csv_in:
    your_list = list(csv.reader(csv_in))


# Copy the (user, song, plays) triplets into an HDF5 table chunk by chunk,
# then fold the table back into total play counts per (user, song) pair and
# a sparse user x song matrix.
#
# Names left at module level:
#   store -- the HDFStore handle (closed once the aggregation is done)
#   df    -- the full long-form triplet table read back from the store
#   res   -- Series of summed plays indexed by (user, song)
#   sdf   -- sparse user x song frame built from res
location = r'~/dataset/train_triplets.txt'
# Rows parsed per read_csv chunk.
# NOTE(review): 4 looks like a debugging value; every chunk costs one
# HDFStore.append call, so a real run wants a far larger number.
chunksize = 4

store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
try:
    # Drop the table left by an earlier run: appending to it again would
    # count every triplet twice.
    if 'df' in store:
        store.remove('df')

    # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
    for chunk in pd.read_csv(location, sep=r'\s+', header=None,
                             names=['user', 'song', 'plays'],
                             chunksize=chunksize):
        # Store the rows in long form. A pivoted chunk has one column per
        # song it happens to contain, so consecutive chunks would not share
        # a table layout and HDFStore.append would reject them.
        # NOTE(review): string column widths are fixed by the first append;
        # pass min_itemsize if user/song IDs can vary in length.
        # TODO(review): the original carried a commented-out filter on
        # your_list; presumably only catalogued songs should be kept --
        # confirm before adding it here.
        store.append('df', chunk)

    # NOTE(review): this reads the whole table into memory.
    df = store['df']

    # Sum plays per (user, song) within each slice of the table, then merge
    # the partial sums; pairs missing from one side count as 0.
    chunks = (part.groupby(['user', 'song'])['plays'].sum()
              for part in store.select('df', chunksize=500))
    res = next(chunks)  # raises StopIteration if the table is empty
    for c in chunks:
        res = res.add(c, fill_value=0)
finally:
    # Release the HDF5 file handle even if reading or aggregating fails.
    store.close()

# Spread the totals into a user x song frame (NaN where a user never played
# a song) and keep only the observed cells.
# NOTE(review): unstack materialises the dense frame before it is made sparse.
matrix = res.unstack('song')
if hasattr(matrix, 'to_sparse'):
    # pandas < 1.0 still provides DataFrame.to_sparse.
    sdf = matrix.to_sparse()
else:
    sdf = matrix.astype(pd.SparseDtype('float', np.nan))