Mercurial > hg > dml-open-backendtools
diff pyspark/csvParser.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line diff
# Extracted from the changeset diff:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/pyspark/csvParser.py Sat Feb 20 18:14:24 2016 +0100
#   @@ -0,0 +1,113 @@

# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__="Daniel Wolff"

import codecs
import warnings
import numpy
import csv
from n3Parser import uri2path


def get_array_from_csv(input_f_file):
    """Read a csv file into a table (list of rows).

    The first column of every row is converted to float (it is assumed to
    hold the "time" value); all remaining columns are left as strings.

    Example column layouts:
      silvet pitch output:               ['time','duration','pitch','velocity','label']
      qm_vamp_key_standard output:       ['time','keynr','label']
      qm_vamp_key_standard_tonic output: ['time','keynr','label']

    The result can be traversed by unpacking each row, e.g.
      for time, duration, pitch, velocity, label in output: ...

    The number of columns of the first non-empty row is taken as the
    reference; rows with fewer columns are skipped and reported through a
    single warning after reading. Rows with MORE columns are kept.

    input_f_file -- file location, passed through uri2path() before opening.

    Returns a list of lists: [float, str, str, ...] per accepted row.

    Raises ValueError if the first field of an accepted row cannot be
    converted to float, and IndexError if the very first row is empty.
    """
    output = []
    badcount = 0

    # number of columns of the reference (first) row; 0 means "not yet known"
    ncols = 0

    # NOTE(review): 'rb' is the mode the Python 2 csv module expects; under
    # Python 3 csv.reader needs a text-mode file (newline='') -- confirm the
    # target interpreter before changing.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time , ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output


def get_dict_from_csv(input_f_file, columtype=None):
    """Read a csv file into a list of dictionaries, one per row.

    The dictionary keys are the names given in "columtype", in column order.
    The first value (time) is converted to float, all others stay strings.

    Typical calls:
      silvet pitch output:
        get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
      qm_vamp_key_standard output:
        get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
      qm_vamp_key_standard_tonic output:
        get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])

    input_f_file -- file location, passed through uri2path() before opening.
    columtype    -- sequence of column names; defaults to ['time']. If it
                    names fewer columns than the file has, a warning is
                    issued and the missing names are generated as 'dataN',
                    N being the 1-based column index. The sequence passed in
                    is never modified.

    The number of columns of the first non-empty row is taken as the
    reference; rows whose length differs are skipped and reported through a
    single warning after reading.

    Returns a list of dicts.

    Raises ValueError if the first field of an accepted row cannot be
    converted to float, and IndexError if "columtype" names more columns
    than the file contains (or if the very first row is empty).
    """
    # Work on a private copy: the names are extended below, and extending
    # the argument itself would leak the generated 'dataN' entries into the
    # caller's list -- or, with a list literal as default value, into every
    # later call of this function.
    if columtype is None:
        colnames = ['time']
    else:
        colnames = list(columtype)

    output = []
    badcount = 0

    # number of columns of the reference (first) row; 0 means "not yet known"
    ncols = 0

    # NOTE(review): 'rb' is the mode the Python 2 csv module expects; under
    # Python 3 csv.reader needs a text-mode file (newline='') -- confirm the
    # target interpreter before changing.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:

            # initialise the column names from the first row
            if ncols == 0:
                ncols = len(row)

                # get number of descriptors, and append generic names for
                # the columns that were left without one
                ncoldescr = len(colnames)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    colnames.extend(['data' + str(i) for i in range(ncoldescr + 1, ncols + 1)])

            if len(row) == ncols:
                # parse the csv data into dict
                rowdict = dict()
                for i, col in enumerate(colnames):
                    # first value (time) is transformed to float
                    if i == 0:
                        rowdict[col] = float(row[i])
                    else:
                        rowdict[col] = row[i]

                # append dictionary to output
                output.append(rowdict)

            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output