annotate pyspark/csvParser.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
rev   line source
Daniel@0 1 # Part of DML (Digital Music Laboratory)
Daniel@0 2 #
Daniel@0 3 # This program is free software; you can redistribute it and/or
Daniel@0 4 # modify it under the terms of the GNU General Public License
Daniel@0 5 # as published by the Free Software Foundation; either version 2
Daniel@0 6 # of the License, or (at your option) any later version.
Daniel@0 7 #
Daniel@0 8 # This program is distributed in the hope that it will be useful,
Daniel@0 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
Daniel@0 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Daniel@0 11 # GNU General Public License for more details.
Daniel@0 12 #
Daniel@0 13 # You should have received a copy of the GNU General Public
Daniel@0 14 # License along with this library; if not, write to the Free Software
Daniel@0 15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Daniel@0 16
Daniel@0 17 # -*- coding: utf-8 -*-
Daniel@0 18 __author__="Daniel Wolff"
Daniel@0 19
Daniel@0 20 import codecs
Daniel@0 21 import warnings
Daniel@0 22 import numpy
Daniel@0 23 import csv
Daniel@0 24 from n3Parser import uri2path
Daniel@0 25
Daniel@0 26
Daniel@0 27 # reads csv file into a table,
Daniel@0 28 # the first column, containing "time", is converted to float; the rest is left as strings
Daniel@0 29 # data formats are for example:
Daniel@0 30 # for silvet pitch output:['time','duration','pitch','velocity','label']
Daniel@0 31 # for qm_vamp_key_standard output: ['time','keynr','label']
Daniel@0 32 # for qm_vamp_key_standard_tonic output: ['time','keynr','label']
Daniel@0 33 #
Daniel@0 34 # data can be nicely traversed:
Daniel@0 35 # for time, duration, pitch, velocity, label in get_array_from_csv(input_f_file):
def get_array_from_csv(input_f_file):
    """Read a csv file into a list of rows.

    The first field of every row (the "time" column) is converted to
    float; all remaining fields are kept as the strings produced by the
    csv reader.

    The expected column count is taken from the first non-empty row.
    Rows with fewer fields than that (including blank lines) are skipped
    and counted, and a single warning reports how many were dropped.
    Rows with more fields than expected are kept in full.

    input_f_file -- location of the csv file; it is passed through
                    uri2path() before being opened
    returns      -- list of lists of the form [float, str, str, ...]
    raises       -- ValueError if the first field of an accepted row
                    cannot be converted to float
    """
    output = []
    badcount = 0

    # number of columns, fixed by the first non-empty row
    ncols = 0

    # NOTE(review): 'rb' matches the Python 2 csv module; Python 3's
    # csv.reader needs a text-mode file - confirm the target interpreter.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # csv.reader yields [] for a blank line; there is no time
                # field to convert, so treat it as an incomplete entry
                # instead of failing on row[0]
                badcount += 1
                continue

            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time , ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
Daniel@0 59
Daniel@0 60
Daniel@0 61
Daniel@0 62
Daniel@0 63
Daniel@0 64 # converts csv input to a list of dictionaries (one per row) with entries named as in "columtype".
Daniel@0 65 #
Daniel@0 66 # first value (time) is assumed to be float
Daniel@0 67 # for silvet pitch output call:
Daniel@0 68 # get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
Daniel@0 69 # for qm_vamp_key_standard output call:
Daniel@0 70 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
Daniel@0 71 # for qm_vamp_key_standard_tonic output call:
Daniel@0 72 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
def get_dict_from_csv(input_f_file, columtype = ['time']):
    """Read a csv file into a list of dictionaries, one per row.

    Each accepted row becomes a dict that maps the column names in
    "columtype" to the row's fields. The first field (the "time" column)
    is converted to float; all other fields are kept as strings.

    The expected column count is taken from the first non-empty row. If
    "columtype" names fewer columns than that, a warning is issued and
    the missing names are generated as 'data<k>', where k is the 1-based
    column position. Rows whose field count differs from the expected
    count (including blank lines) are skipped and counted, and a single
    warning reports how many were dropped.

    input_f_file -- location of the csv file; it is passed through
                    uri2path() before being opened
    columtype    -- list of column names; it is not modified
    returns      -- list of dicts keyed by column name
    raises       -- ValueError if the first field of an accepted row
                    cannot be converted to float;
                    IndexError if "columtype" lists more names than the
                    file has columns
    """
    # Work on a private copy: extending "columtype" itself would alter the
    # shared default list (leaking generated names into later calls) as
    # well as any list handed in by the caller.
    colnames = list(columtype)

    output = []
    badcount = 0

    # number of columns, fixed by the first non-empty row
    ncols = 0

    # NOTE(review): 'rb' matches the Python 2 csv module; Python 3's
    # csv.reader needs a text-mode file - confirm the target interpreter.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # csv.reader yields [] for a blank line; there is no time
                # field to convert, so treat it as an incomplete entry
                # instead of failing on row[0]
                badcount += 1
                continue

            # initialise the column count and names once
            if ncols == 0:
                ncols = len(row)

                # get number of descriptors, and append if left empty
                ncoldescr = len(colnames)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    colnames.extend(['data' + str(i) for i in range(ncoldescr + 1, ncols + 1)])

            if len(row) == ncols:
                # parse the csv data into dict
                rowdict = dict()
                for i, col in enumerate(colnames):
                    # first value (time) is transformed to float
                    if i == 0:
                        rowdict[col] = float(row[i])
                    else:
                        rowdict[col] = row[i]

                # append dictionary to output
                output.append(rowdict)
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output