Mercurial > hg > dml-open-backendtools
view pyspark/csvParser.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__="Daniel Wolff"

import codecs
import csv
import sys
import warnings

import numpy

from n3Parser import uri2path


def get_array_from_csv(input_f_file):
    """Read a csv file into a table (list of rows).

    The first column, containing "time", is converted to float; the rest
    is left as strings.  Data formats are for example:

    * silvet pitch output:
      ``['time', 'duration', 'pitch', 'velocity', 'label']``
    * qm_vamp_key_standard output: ``['time', 'keynr', 'label']``
    * qm_vamp_key_standard_tonic output: ``['time', 'keynr', 'label']``

    The result can be nicely traversed, e.g. for silvet pitch output::

        for time, duration, pitch, velocity, label in get_array_from_csv(f):
            ...

    The column count of the first non-empty row is taken as the reference.
    Rows with fewer columns (including blank lines) are skipped, and a
    single warning reporting how many were skipped is issued at the end.
    Rows with more columns than the reference are kept in full.

    :param input_f_file: location of the csv file; it is passed through
        ``uri2path`` (from ``n3Parser``) before being opened.
    :returns: list of rows, each of the form ``[float, str, str, ...]``.
    :raises ValueError: if the first column of an accepted row cannot be
        converted to float (e.g. a textual header line).
    """
    output = []
    badcount = 0

    # reference number of columns, taken from the first non-empty row
    ncols = 0

    path = uri2path(input_f_file)

    # The csv module expects a binary file object on Python 2 but a text
    # file opened with newline='' on Python 3; handing it a file opened
    # with 'rb' on Python 3 raises "iterator should return strings, not
    # bytes".
    if sys.version_info[0] < 3:
        csvfile = open(path, 'rb')
    else:
        csvfile = open(path, 'r', newline='')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # A blank line yields an empty row.  Count it as a bad
                # entry instead of letting a blank *first* line fall
                # through to row[0] and raise IndexError.
                badcount += 1
                continue

            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time , ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output


# converts csv input to dictionary with entities named as in "columtype".
#
# first value (time) is assumed to be float
# for silvet pitch output call
#   get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
# for qm_vamp_key_standard output call
#   get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
# for qm_vamp_key_standard_tonic output call
#   get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
def get_dict_from_csv(input_f_file, columtype=None):
    """Read a csv file into a list of dictionaries, one per row.

    Each row becomes a dict whose keys are the names given in
    ``columtype``.  The value of the first column (time) is converted to
    float, all other values are left as strings.

    If ``columtype`` names fewer columns than the first non-empty row
    contains, a "Column types missing" warning is issued and the missing
    names are generated as ``'data<k>'``, where ``k`` is the 1-based
    column position (e.g. ``'data2'``, ``'data3'``).

    Only rows whose column count equals that of the first non-empty row
    are converted.  All other rows (including blank lines) are skipped,
    and a single warning reporting how many were skipped is issued at the
    end.

    :param input_f_file: location of the csv file; it is passed through
        ``uri2path`` before being opened.
    :param columtype: sequence of column names; defaults to ``['time']``.
        The sequence is copied and never modified.
    :returns: list of dicts mapping column name to value.
    :raises ValueError: if the first column of an accepted row cannot be
        converted to float.
    :raises IndexError: if ``columtype`` names more columns than the file
        contains.
    """
    # imported locally so that this function does not depend on the
    # module-level import block providing ``sys``
    import sys

    # Work on a private copy: the names are extended below, and doing that
    # on the argument itself would modify the caller's list and, with a
    # list as default value, leak the generated names into later calls.
    if columtype is None:
        columns = ['time']
    else:
        columns = list(columtype)

    output = []
    badcount = 0

    # reference number of columns, taken from the first non-empty row
    ncols = 0

    path = uri2path(input_f_file)

    # The csv module expects a binary file object on Python 2 but a text
    # file opened with newline='' on Python 3; handing it a file opened
    # with 'rb' on Python 3 raises "iterator should return strings, not
    # bytes".
    if sys.version_info[0] < 3:
        csvfile = open(path, 'rb')
    else:
        csvfile = open(path, 'r', newline='')

    with csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # A blank line yields an empty row.  Count it as a bad
                # entry instead of letting a blank *first* line fall
                # through to row[0] and raise IndexError.
                badcount += 1
                continue

            # initialise the column names from the first non-empty row
            if ncols == 0:
                ncols = len(row)

                # get number of descriptors, and append if left empty
                ncoldescr = len(columns)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    columns.extend(['data' + str(i) for i in range(ncoldescr + 1, ncols + 1)])

            if len(row) == ncols:
                # parse the csv data into dict
                rowdict = dict()
                for i, col in enumerate(columns):
                    # first value (time) is transformed to float
                    rowdict[col] = float(row[i]) if i == 0 else row[i]

                # append dictionary to output
                output.append(rowdict)
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output