# HG changeset patch
# User Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
# Date 1436741785 -3600
# Node ID 4de098e10bbb3a1c7d02a21ebf8f3f089be46537
# Parent  41e14a539dd35a670128fc3b239dc1bdf303ffab
Read list of files and try to chunk dataset

diff -r 41e14a539dd3 -r 4de098e10bbb Code/read_taste_profile.py
--- a/Code/read_taste_profile.py	Sat Jul 11 21:51:15 2015 +0100
+++ b/Code/read_taste_profile.py	Sun Jul 12 23:56:25 2015 +0100
@@ -1,12 +1,43 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jul 11 15:34:52 2015
-
-@author: Paulo
-"""
-
+import os
+import csv
 import pandas as pd
 import numpy as np
 
-df = pd.read_csv("C:/Users/Paulo/Documents/Queen Mary/MSc Project/Dataset/train_triplets.txt")
-df.head()
\ No newline at end of file
+# List of h5 files (audio streams): write one 'SO...' track id per .h5 file under cal10k to cal10kHDF5.csv
+with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:  # NOTE(review): binary mode is the Python 2 csv idiom; Python 3's csv.writer needs a text-mode file -- confirm interpreter
+	writer = csv.writer(out, delimiter=',')
+	for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
+		for file in files:
+			if file.endswith(".h5"):
+				# Track id = 'SO' + file name minus its first two characters and the ".h5" suffix
+				track = ''.join(['SO',str(file)[2:-3]])
+				print(track)
+				writer.writerow([track])
+				
+with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'rb') as f:  # read the track-id CSV back in
+    reader = csv.reader(f)
+    your_list = list(reader)
+# csv.reader yields one list per row, so your_list is a list of lists of strings
+
+store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')  # NOTE(review): no matching store.close() in this script
+location = r'~/dataset/train_triplets.txt'
+chunksize = 4  # number of input rows handed to each loop iteration
+for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
+	# Reshape the chunk's (user, song, plays) rows into a user-by-song table of 'plays' values
+	# Unused earlier attempt at filtering by your_list: chunk.query('song == your_list')
+	frame = chunk.pivot(index='user', columns='song', values='plays')
+	store.append('df', frame)
+# NOTE(review): every chunk pivots to its own song columns -- confirm that store.append
+# accepts frames whose columns differ from those already stored under 'df'.
+	
+	
+df = store['df']  # read everything stored under 'df' back into one DataFrame
+chunks = (df.groupby().sum() for df in store.select('df', chunksize=500))  # NOTE(review): groupby() is given no key (pandas expects `by` or `level`) -- confirm the intended grouping
+res = next(chunks)  # will raise if there are no chunks!
+for c in chunks:
+    res = res.add(c, fill_value=0)
+# res now holds the element-wise total of all per-chunk sums (labels missing on one side count as 0)
+
+sdf = df.to_sparse()  # NOTE(review): DataFrame.to_sparse was removed in pandas 1.0 -- confirm the pinned pandas version
+
+