# HG changeset patch
# User Paulo Chiliguano
# Date 1436741785 -3600
# Node ID 4de098e10bbb3a1c7d02a21ebf8f3f089be46537
# Parent 41e14a539dd35a670128fc3b239dc1bdf303ffab
Read list of files and try to chunk the dataset
diff -r 41e14a539dd3 -r 4de098e10bbb Code/read_taste_profile.py
--- a/Code/read_taste_profile.py Sat Jul 11 21:51:15 2015 +0100
+++ b/Code/read_taste_profile.py Sun Jul 12 23:56:25 2015 +0100
@@ -1,12 +1,43 @@
-# -*- coding: utf-8 -*-
-"""
-Created on Sat Jul 11 15:34:52 2015
-
-@author: Paulo
-"""
-
+import os
+import csv
import pandas as pd
import numpy as np
-df = pd.read_csv("C:/Users/Paulo/Documents/Queen Mary/MSc Project/Dataset/train_triplets.txt")
-df.head()
\ No newline at end of file
+# List of h5 files (audio streams)
+with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
+ writer = csv.writer(out, delimiter=',')
+ for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
+ for file in files:
+ if file.endswith(".h5"):
+ #print(os.path.join(root, file))
+ track = ''.join(['SO',str(file)[2:-3]])
+ print(track)
+ writer.writerow([track])
+
+with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'rb') as f:
+ reader = csv.reader(f)
+ your_list = list(reader)
+
+
+store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
+location = r'~/dataset/train_triplets.txt'
+chunksize = 4
+for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
+ #frame = pd.DataFrame()
+ #frame = chunk.query('song == your_list')
+ frame = chunk.pivot(index='user', columns='song', values='plays')
+ store.append('df', frame)
+
+
+
+
+df = store['df']
+chunks = (df.groupby().sum() for df in store.select('df', chunksize=500))
+res = next(chunks) # will raise if there are no chunks!
+for c in chunks:
+ res = res.add(c, fill_value=0)
+
+
+sdf = df.to_sparse()
+
+