diff pyspark/csvParser.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyspark/csvParser.py	Sat Feb 20 18:14:24 2016 +0100
@@ -0,0 +1,113 @@
+# Part of DML (Digital Music Laboratory)
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+# -*- coding: utf-8 -*-
+__author__="Daniel Wolff"
+
+import codecs
+import warnings
+import numpy
+import csv
+from n3Parser import uri2path
+
+
+def get_array_from_csv(input_f_file):
+    """Read a csv file into a list of rows.
+
+    The first column (assumed to hold the time stamp) is converted to
+    float; all remaining columns are left as strings.  Example layouts:
+
+    * silvet pitch output: ['time', 'duration', 'pitch', 'velocity', 'label']
+    * qm_vamp_key_standard output: ['time', 'keynr', 'label']
+    * qm_vamp_key_standard_tonic output: ['time', 'keynr', 'label']
+
+    The result can be traversed directly, e.g. for silvet pitch output::
+
+        for time, duration, pitch, velocity, label in get_array_from_csv(f):
+            ...
+
+    Args:
+        input_f_file: location of the csv file; it is passed through
+            ``uri2path`` before being opened.
+
+    Returns:
+        A list containing one ``[float, str, ...]`` list per accepted row.
+
+    Raises:
+        ValueError: if the first field of an accepted row cannot be
+            converted to float.
+
+    Rows that are empty or have fewer fields than the first non-empty row
+    are skipped; a single warning reports how many were dropped.
+    """
+    output = []
+    badcount = 0
+
+    # number of columns, taken from the first non-empty row
+    ncols = 0
+
+    # NOTE(review): binary mode matches Python 2 csv usage; under Python 3
+    # the csv module needs a text-mode file opened with newline=''.
+    with open(uri2path(input_f_file), 'rb') as csvfile:
+        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
+        for row in contents:
+            # csv.reader yields [] for a blank line.  Without this guard a
+            # blank line before the first data row left ncols at 0 and
+            # row[0] below raised IndexError.
+            if not row:
+                badcount += 1
+                continue
+
+            if ncols == 0:
+                ncols = len(row)
+
+            if len(row) >= ncols:
+                # we assume format time , ...
+                output.append([float(row[0])] + row[1:])
+            else:
+                badcount += 1
+
+    if badcount > 0:
+        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")
+
+    return output
+
+
+
+
+
+def get_dict_from_csv(input_f_file, columtype=None):
+    """Read a csv file into a list of dictionaries, one per row.
+
+    The dictionary keys are the names given in ``columtype``.  The value of
+    the first column (time) is converted to float, all other values are
+    left as strings.  Typical calls:
+
+    * silvet pitch output::
+
+        get_dict_from_csv(input_f_file,
+                          columtype=['time', 'duration', 'pitch', 'velocity', 'label'])
+
+    * qm_vamp_key_standard and qm_vamp_key_standard_tonic output::
+
+        get_dict_from_csv(input_f_file, columtype=['time', 'keynr', 'label'])
+
+    Args:
+        input_f_file: location of the csv file; it is passed through
+            ``uri2path`` before being opened.
+        columtype: sequence of column names.  Defaults to ``['time']``.
+            If it has fewer names than the file has columns, the missing
+            ones are generated as ``'data<N>'`` (N being the 1-based column
+            number) and a warning is issued.  If it has more names than
+            columns, the surplus names are ignored with a warning.  The
+            sequence passed in is never modified.
+
+    Returns:
+        A list of dicts mapping column name to value for each accepted row.
+
+    Raises:
+        ValueError: if the first field of an accepted row cannot be
+            converted to float.
+
+    Rows that are empty or whose field count differs from the first
+    non-empty row are skipped; a single warning reports how many were
+    dropped.
+    """
+    # Work on a private copy: the previous mutable default list was extended
+    # in place, so column names leaked from one call into the next (and
+    # into the caller's own list).
+    if columtype is None:
+        names = ['time']
+    else:
+        names = list(columtype)
+
+    output = []
+    badcount = 0
+
+    # number of columns, taken from the first non-empty row
+    ncols = 0
+
+    # NOTE(review): binary mode matches Python 2 csv usage; under Python 3
+    # the csv module needs a text-mode file opened with newline=''.
+    with open(uri2path(input_f_file), 'rb') as csvfile:
+        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
+        for row in contents:
+            # csv.reader yields [] for a blank line; never let such a row
+            # define the column count or reach row[0].
+            if not row:
+                badcount += 1
+                continue
+
+            # initialise the column names on the first usable row
+            if ncols == 0:
+                ncols = len(row)
+
+                # reconcile the number of descriptors with the column count
+                ncoldescr = len(names)
+                if ncoldescr < ncols:
+                    warnings.warn("Column types missing")
+                    names.extend(['data' + str(i)
+                                  for i in range(ncoldescr + 1, ncols + 1)])
+                elif ncoldescr > ncols:
+                    # previously this case raised IndexError on row[i]
+                    warnings.warn("More column types than columns, "
+                                  "ignoring surplus column types")
+                    del names[ncols:]
+
+            if len(row) == ncols:
+                # parse the csv data into a dict
+                rowdict = dict()
+                for i, col in enumerate(names):
+                    # first value (time) is transformed to float
+                    if i == 0:
+                        rowdict[col] = float(row[i])
+                    else:
+                        rowdict[col] = row[i]
+
+                # append dictionary to output
+                output.append(rowdict)
+            else:
+                badcount += 1
+
+    if badcount > 0:
+        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")
+
+    return output