dml-open-backendtools: pyspark/csvParser.py comparison

comparison pyspark/csvParser.py @ 0:e34cf1b6fe09 tip

commit

author	Daniel Wolff
date	Sat, 20 Feb 2016 18:14:24 +0100
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:e34cf1b6fe09
+# Part of DML (Digital Music Laboratory)
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+# -*- coding: utf-8 -*-
+__author__="Daniel Wolff"
+import codecs
+import warnings
+import numpy
+import csv
+from n3Parser import uri2path
def get_array_from_csv(input_f_file):
    """Read a csv file into a table (a list of rows).

    The first column, containing the time, is converted to float; every
    other column is left as a string.  Data formats are for example:

    * silvet pitch output:
      ['time', 'duration', 'pitch', 'velocity', 'label']
    * qm_vamp_key_standard output: ['time', 'keynr', 'label']
    * qm_vamp_key_standard_tonic output: ['time', 'keynr', 'label']

    The result can be traversed by unpacking, e.g.
    ``for time, duration, pitch, velocity, label in table``.

    The expected number of columns is taken from the first non-empty row.
    Rows with fewer columns, including blank lines, are skipped and
    reported together in a single warning.

    :param input_f_file: location of the csv file; it is passed through
        ``uri2path`` before being opened.
    :returns: list of rows, each of the form ``[float, str, str, ...]``.
    :raises ValueError: if the first field of an accepted row cannot be
        converted to float.
    """
    output = []
    badcount = 0
    # column count of the first non-empty row; 0 means "not seen yet"
    ncols = 0
    # NOTE(review): binary mode follows the Python 2 csv convention.  On
    # Python 3 csv.reader needs a text-mode file opened with newline='' --
    # confirm the target interpreter before changing this.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # A blank line has no time field: count it as a bad entry
                # instead of failing on row[0] when it is the first line.
                badcount += 1
                continue
            if ncols == 0:
                ncols = len(row)
            if len(row) >= ncols:
                # we assume format time, ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1
    if badcount > 0:
        warnings.warn(
            "Incomplete csv file, ignoring " + str(badcount) + " entries")
    return output
def get_dict_from_csv(input_f_file, columtype=['time']):
    """Convert csv input to a list of dictionaries keyed by column name.

    Each accepted row becomes one dictionary whose keys are the entries of
    ``columtype``.  The first value (time) is converted to float; all other
    values are left as strings.  Example calls:

    * silvet pitch output::

          get_dict_from_csv(input_f_file, columtype=['time', 'duration',
                                                     'pitch', 'velocity',
                                                     'label'])

    * qm_vamp_key_standard and qm_vamp_key_standard_tonic output::

          get_dict_from_csv(input_f_file,
                            columtype=['time', 'keynr', 'label'])

    The expected number of columns is taken from the first non-empty row.
    If ``columtype`` names fewer columns than that, the missing names are
    filled in as ``'data<N>'`` (N being the 1-based column number) and a
    warning is issued; if it names more, the surplus names are ignored with
    a warning.  Rows whose column count differs, including blank lines, are
    skipped and reported together in a single warning.

    :param input_f_file: location of the csv file; it is passed through
        ``uri2path`` before being opened.
    :param columtype: names for the csv columns, in column order.  The
        sequence is never modified.
    :returns: list with one ``{column name: value}`` dict per accepted row.
    :raises ValueError: if the first field of an accepted row cannot be
        converted to float.
    """
    output = []
    badcount = 0
    # column count of the first non-empty row; 0 means "not seen yet"
    ncols = 0
    # Work on a private copy: extending the argument in place would leak
    # the generated names into the shared default list and into the
    # caller's own list.
    names = list(columtype)
    # NOTE(review): binary mode follows the Python 2 csv convention.  On
    # Python 3 csv.reader needs a text-mode file opened with newline='' --
    # confirm the target interpreter before changing this.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # A blank line has no time field: count it as a bad entry
                # instead of failing on row[0] when it is the first line.
                badcount += 1
                continue
            # initialise the column names on the first non-empty row
            if ncols == 0:
                ncols = len(row)
                # get number of descriptors, and append if left empty
                ncoldescr = len(names)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    names.extend(['data' + str(i)
                                  for i in range(ncoldescr + 1, ncols + 1)])
                elif ncoldescr > ncols:
                    # indexing the row with the surplus names would raise
                    # IndexError, so drop them
                    warnings.warn(
                        "Too many column types, ignoring the surplus")
                    del names[ncols:]
            if len(row) == ncols:
                # first value (time) is transformed to float
                values = [float(row[0])] + row[1:]
                output.append(dict(zip(names, values)))
            else:
                badcount += 1
    if badcount > 0:
        warnings.warn(
            "Incomplete csv file, ignoring " + str(badcount) + " entries")
    return output

Mercurial > hg > dml-open-backendtools

comparison pyspark/csvParser.py @ 0:e34cf1b6fe09 tip