Mercurial > hg > dml-open-backendtools
comparison pyspark/csvParser.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e34cf1b6fe09 |
---|---|
1 # Part of DML (Digital Music Laboratory) | |
2 # | |
3 # This program is free software; you can redistribute it and/or | |
4 # modify it under the terms of the GNU General Public License | |
5 # as published by the Free Software Foundation; either version 2 | |
6 # of the License, or (at your option) any later version. | |
7 # | |
8 # This program is distributed in the hope that it will be useful, | |
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
11 # GNU General Public License for more details. | |
12 # | |
13 # You should have received a copy of the GNU General Public | |
14 # License along with this library; if not, write to the Free Software | |
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
16 | |
17 # -*- coding: utf-8 -*- | |
18 __author__="Daniel Wolff" | |
19 | |
20 import codecs | |
21 import warnings | |
22 import numpy | |
23 import csv | |
24 from n3Parser import uri2path | |
25 | |
26 | |
27 # reads a csv file into a table (list of rows); | |
28 # the first column, containing "time", is converted to float, the rest is left as strings | |
29 # data formats are for example: | |
30 # for silvet pitch output:['time','duration','pitch','velocity','label'] | |
31 # for qm_vamp_key_standard output: ['time','keynr','label'] | |
32 # for qm_vamp_key_standard_tonic output: ['time','keynr','label'] | |
33 # | |
34 # data can be nicely traversed, e.g. for silvet pitch output: | |
35 # for time, duration, pitch, velocity, label in get_array_from_csv(input_f_file): ... | |
def get_array_from_csv(input_f_file):
    """Read a csv file into a list of rows, converting the first column to float.

    ``input_f_file`` is passed through ``uri2path`` (imported from n3Parser)
    and the result is opened as a comma-separated file with ``"`` as the
    quote character.  The width of the first non-empty row fixes the
    expected number of columns.  Rows with at least that many fields are
    kept as ``[float(row[0])] + row[1:]``, i.e. every field but the first
    stays a string.  Shorter rows, including blank lines, are dropped; if
    any were dropped a single warning reports how many.

    Args:
        input_f_file: location of the csv file, handed to ``uri2path``.

    Returns:
        A list of lists, one per accepted row.

    Raises:
        ValueError: if the first field of an accepted row cannot be
            converted to float.
    """
    output = []
    badcount = 0

    # number of columns, taken from the first non-empty row
    ncols = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # Python 3's csv.reader needs a text-mode file (open(..., newline='')).
    # Confirm the target interpreter before changing the mode.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            # A blank line has no time field.  Count it as a bad entry
            # instead of indexing row[0]; previously a blank line seen
            # while ncols was still 0 raised IndexError.
            if not row:
                badcount += 1
                continue

            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time , ...
                output.append([float(row[0])] + row[1:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
59 | |
60 | |
61 | |
62 | |
63 | |
64 # converts csv input to dictionary with entities named as in "columtype". | |
65 # | |
66 # first value (time) is assumed to be float | |
67 # for silvet pitch output call | |
68 # get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label']) | |
69 # for qm_vamp_key_standard output call | |
70 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label']) | |
71 # for qm_vamp_key_standard_tonic output call | |
72 # get_dict_from_csv(input_f_file, columtype = ['time','keynr','label']) | |
def get_dict_from_csv(input_f_file, columtype=None):
    """Read a csv file into a list of dictionaries keyed by column name.

    ``input_f_file`` is passed through ``uri2path`` (imported from n3Parser)
    and the result is opened as a comma-separated file with ``"`` as the
    quote character.  The width of the first non-empty row fixes the
    expected number of columns.  If fewer names than columns are given,
    a warning is issued and the missing names are generated as
    ``'data<k>'``, where ``k`` is the 1-based column position.  Only rows
    with exactly the expected number of fields are kept; all others,
    including blank lines, are dropped, and if any were dropped a single
    warning reports how many.

    Args:
        input_f_file: location of the csv file, handed to ``uri2path``.
        columtype: sequence of column names; defaults to ``['time']``.
            The sequence passed in is never modified.

    Returns:
        A list of dicts, one per accepted row.  The value stored under the
        first column name is a float, all other values are strings.

    Raises:
        ValueError: if the first field of an accepted row cannot be
            converted to float.
        IndexError: if more column names are supplied than the file has
            columns.
    """
    # Work on a private copy: the list may be extended below, and doing
    # that on a shared default (or on the caller's list) leaked generated
    # names into later calls.
    columtype = ['time'] if columtype is None else list(columtype)

    output = []
    badcount = 0

    # number of columns, taken from the first non-empty row
    ncols = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # Python 3's csv.reader needs a text-mode file (open(..., newline='')).
    # Confirm the target interpreter before changing the mode.
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            # A blank line has no time field.  Count it as a bad entry
            # instead of indexing row[0]; previously a blank line seen
            # while ncols was still 0 raised IndexError.
            if not row:
                badcount += 1
                continue

            # initialise the column count and names from the first row
            if ncols == 0:
                ncols = len(row)

                # get number of descriptors, and append if left empty
                ncoldescr = len(columtype)
                if ncoldescr < ncols:
                    warnings.warn("Column types missing")
                    columtype.extend(
                        ['data' + str(i) for i in range(ncoldescr + 1, ncols + 1)])

            if len(row) == ncols:
                # parse the csv data into dict
                rowdict = dict()
                for i, col in enumerate(columtype):
                    # first value (time) is transformed to float
                    if i == 0:
                        rowdict[col] = float(row[i])
                    else:
                        rowdict[col] = row[i]

                # append dictionary to output
                output.append(rowdict)
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output