comparison pyspark/csvParser.py @ 0:e34cf1b6fe09 tip

commit
author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e34cf1b6fe09
1 # Part of DML (Digital Music Laboratory)
2 #
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public
14 # License along with this library; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
17 # -*- coding: utf-8 -*-
18 __author__="Daniel Wolff"
19
20 import codecs
21 import warnings
22 import numpy
23 import csv
24 from n3Parser import uri2path
25
26
27 # reads a csv file into a table (a list of rows);
28 # the first column, containing "time", is converted to float, the rest is left as strings
29 # data formats are for example:
30 # for silvet pitch output: ['time','duration','pitch','velocity','label']
31 # for qm_vamp_key_standard output: ['time','keynr','label']
32 # for qm_vamp_key_standard_tonic output: ['time','keynr','label']
33 #
34 # data can be nicely traversed, e.g. for silvet pitch output:
35 # for time, duration, pitch, velocity, label in get_array_from_csv(input_f_file):
def get_array_from_csv(input_f_file):
    """Read a CSV file into a list of rows.

    The first field of every row is converted to ``float``; all remaining
    fields are kept as the strings produced by ``csv.reader``.

    The column count is fixed by the first non-empty row. Rows with fewer
    fields than that (including blank lines) are skipped and counted, and a
    single warning reporting the number of skipped rows is issued at the end.
    Rows with more fields are kept in full.

    Args:
        input_f_file: location of the CSV file; it is passed through
            ``uri2path`` before being opened.

    Returns:
        A list of lists of the form ``[float, str, str, ...]``.

    Raises:
        ValueError: if the first field of an accepted row cannot be parsed
            as a float (e.g. a textual header line).
    """
    output = []
    badcount = 0

    # number of columns, taken from the first non-empty row
    ncols = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # Python 3's csv.reader needs a text-mode file (opened with newline='').
    # Confirm the target interpreter before changing this.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # A blank line yields an empty row. Without this guard a
                # leading blank line would reach row[0] and raise IndexError
                # while ncols is still 0; count it as incomplete instead,
                # exactly like blank lines further down the file.
                badcount += 1
                continue

            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time , ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
59
60
61
62
63
64 # converts csv input to a list of dictionaries (one per row) with entries named as in "columtype".
65 #
66 # first value (time) is assumed to be float
67 # for silvet pitch output call
68 # get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
69 # for qm_vamp_key_standard output call
70 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
71 # for qm_vamp_key_standard_tonic output call
72 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label'])
def get_dict_from_csv(input_f_file, columtype=None):
    """Read a CSV file into a list of per-row dictionaries.

    Each accepted row becomes a dict mapping the column names to the row's
    fields. The first field is converted to ``float``; all other fields are
    kept as the strings produced by ``csv.reader``.

    The column count is fixed by the first non-empty row. If fewer names
    than columns are supplied, a warning is issued and the missing names
    are generated as ``'data<k>'``, where ``k`` is the 1-based column
    position. Rows whose field count differs from the column count
    (including blank lines) are skipped and counted, and a single warning
    reporting the number of skipped rows is issued at the end.

    Args:
        input_f_file: location of the CSV file; it is passed through
            ``uri2path`` before being opened.
        columtype: column names in file order. Defaults to ``['time']``.
            The passed list is never modified.

    Returns:
        A list of dicts, one per accepted row.

    Raises:
        ValueError: if the first field of an accepted row cannot be parsed
            as a float (e.g. a textual header line).
        IndexError: if more column names are supplied than the file has
            columns.
    """
    # Work on a private copy: the original used a mutable default list and
    # extended it in place, so generated names leaked into later calls (and
    # into the caller's own list), breaking reads of narrower files.
    names = ['time'] if columtype is None else list(columtype)

    output = []
    badcount = 0

    # number of columns, taken from the first non-empty row
    ncols = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # Python 3's csv.reader needs a text-mode file (opened with newline='').
    # Confirm the target interpreter before changing this.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if not row:
                # A blank line yields an empty row. Without this guard a
                # leading blank line would match ncols == 0 and then fail on
                # row[0]; count it as incomplete instead, exactly like blank
                # lines further down the file.
                badcount += 1
                continue

            # initialise the column count and complete the column names
            if ncols == 0:
                ncols = len(row)

                # get number of descriptors, and append if left empty
                ncoldescr = len(names)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    names.extend(
                        'data' + str(i) for i in range(ncoldescr + 1, ncols + 1)
                    )

            if len(row) == ncols:
                # parse the csv data into dict;
                # first value (time) is transformed to float
                rowdict = {}
                for i, col in enumerate(names):
                    rowdict[col] = float(row[i]) if i == 0 else row[i]

                # append dictionary to output
                output.append(rowdict)
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output