dml-open-backendtools: pyspark/csvParser.py annotate

annotate pyspark/csvParser.py @ 0:e34cf1b6fe09 tip

commit

author	Daniel Wolff
date	Sat, 20 Feb 2016 18:14:24 +0100
parents
children

rev	line source
Daniel@0	1 # Part of DML (Digital Music Laboratory)
Daniel@0	2 #
Daniel@0	3 # This program is free software; you can redistribute it and/or
Daniel@0	4 # modify it under the terms of the GNU General Public License
Daniel@0	5 # as published by the Free Software Foundation; either version 2
Daniel@0	6 # of the License, or (at your option) any later version.
Daniel@0	7 #
Daniel@0	8 # This program is distributed in the hope that it will be useful,
Daniel@0	9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0	10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0	11 # GNU General Public License for more details.
Daniel@0	12 #
Daniel@0	13 # You should have received a copy of the GNU General Public
Daniel@0	14 # License along with this library; if not, write to the Free Software
Daniel@0	15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0	16
Daniel@0	17 # -- coding: utf-8 --
Daniel@0	18 __author__="Daniel Wolff"
Daniel@0	19
Daniel@0	20 import codecs
Daniel@0	21 import warnings
Daniel@0	22 import numpy
Daniel@0	23 import csv
Daniel@0	24 from n3Parser import uri2path
Daniel@0	25
Daniel@0	26
Daniel@0	27 # reads csv file into a table,
Daniel@0	28 # the first column, containing "time", is converted to float; the rest is left as strings
Daniel@0	29 # data formats are for example:
Daniel@0	30 # for silvet pitch output:['time','duration','pitch','velocity','label']
Daniel@0	31 # for qm_vamp_key_standard output: ['time','keynr','label']
Daniel@0	32 # for qm_vamp_key_standard_tonic output: ['time','keynr','label']
Daniel@0	33 #
Daniel@0	34 # the data can be conveniently traversed, e.g. for silvet pitch output:
Daniel@0	35 # for time, duration, pitch, velocity, label in get_array_from_csv(input_f_file): ...
def get_array_from_csv(input_f_file):
    """Read a csv file into a list of rows.

    The first field of every accepted row is converted to float (it is
    assumed to hold the time); the remaining fields are kept exactly as
    the csv reader delivers them.

    The number of fields in the first non-empty row sets the expected
    column count. Rows with fewer fields, including blank lines, are
    dropped and counted; a single warning reports the total after the
    whole file has been read.

    Args:
        input_f_file: location of the csv file; it is passed through
            uri2path() before being opened.

    Returns:
        A list with one list per accepted row: [float(time), field, ...].

    Raises:
        ValueError: if the first field of an accepted row cannot be
            converted to float.
    """
    rows = []
    skipped = 0

    # number of fields expected per row, taken from the first non-empty row
    expected = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # under Python 3 csv.reader needs a text-mode file opened with
    # newline='' - confirm the target interpreter before changing this.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        for fields in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if not fields:
                # A blank line has no time value to convert. Without this
                # guard a leading blank line would leave the column count
                # at 0 and then fail on fields[0].
                skipped += 1
                continue

            if expected == 0:
                expected = len(fields)

            if len(fields) < expected:
                skipped += 1
                continue

            # we assume format time , ...
            rows.append([float(fields[0])] + fields[1:])

    if skipped > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(skipped) + " entries")

    return rows
Daniel@0	59
Daniel@0	60
Daniel@0	61
Daniel@0	62
Daniel@0	63
Daniel@0	64 # converts csv input to dictionary with entities named as in "columtype".
Daniel@0	65 #
Daniel@0	66 # first value (time) is assumed to be float
Daniel@0	67 # for silvet pitch output call
Daniel@0	68 # get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
Daniel@0	69 # for qm_vamp_key_standard output call
Daniel@0	70 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
Daniel@0	71 # for qm_vamp_key_standard_tonic output call
Daniel@0	72 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
def get_dict_from_csv(input_f_file, columtype = ['time']):
    """Read a csv file into a list of dictionaries, one per row.

    Each accepted row becomes a dict that maps the column names given in
    `columtype` to the row's fields. The value of the first column
    (assumed to be the time) is converted to float; all other values are
    kept exactly as the csv reader delivers them.

    The number of fields in the first non-empty row sets the expected
    column count. If fewer names than columns are supplied, a warning is
    issued and the missing names are generated as 'data<k>', where k is
    the 1-based column position. Surplus names are ignored. Rows whose
    field count differs from the expected one, including blank lines,
    are dropped and counted; a single warning reports the total after
    the whole file has been read.

    Args:
        input_f_file: location of the csv file; it is passed through
            uri2path() before being opened.
        columtype: column names, first one for the time column. The list
            is only read, never modified.

    Returns:
        A list of dicts, one per accepted row.

    Raises:
        ValueError: if the first field of an accepted row cannot be
            converted to float.
    """
    # Work on a private copy: names may be appended below, and neither the
    # shared default list nor a list owned by the caller must grow as a
    # side effect of this call.
    names = list(columtype)

    records = []
    skipped = 0

    # number of fields expected per row, taken from the first non-empty row
    expected = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # under Python 3 csv.reader needs a text-mode file opened with
    # newline='' - confirm the target interpreter before changing this.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        for fields in csv.reader(csvfile, delimiter=',', quotechar='"'):
            if not fields:
                # A blank line has no time value to convert. Without this
                # guard a leading blank line would leave the column count
                # at 0 and then fail on fields[0].
                skipped += 1
                continue

            if expected == 0:
                expected = len(fields)

                # fill in generic names for columns without a description
                described = len(names)
                if described < expected:
                    warnings.warn("Column types missing")
                    names.extend('data' + str(k)
                                 for k in range(described + 1, expected + 1))

            if len(fields) != expected:
                skipped += 1
                continue

            # first value (time) is transformed to float; zip() stops at the
            # shorter sequence, so surplus names cannot index past the row
            record = {names[0]: float(fields[0])}
            record.update(zip(names[1:], fields[1:]))
            records.append(record)

    if skipped > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(skipped) + " entries")

    return records

Mercurial > hg > dml-open-backendtools

annotate pyspark/csvParser.py @ 0:e34cf1b6fe09 tip