# -*- coding: utf-8 -*-
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

__author__ = "Daniel Wolff, hargreaves"

# This script derives standard statistics for the tuning frequency,
# in particular:
# - average (weighted by note duration)
# - standard deviation
# It also builds a duration-weighted histogram over the 390-490 Hz range.

from rdflib import Graph, Namespace, BNode, RDF, Literal
import codecs
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# compute statistics per clip (True) or over the pooled collection (False)?
perfilestats = True

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # add the base output node
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and standard deviation
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add the histogram, one blank node per bin
    for i in range(len(statistics["histogram"]["count"])):
        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i + 1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph
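
# For reference, the triples added above take roughly this shape in Turtle
# (a sketch read off the code, with placeholder values):
#
#   <transform> dml:input <file.n3> ;
#               dml:output [
#                   a dml:TuningFrequencyStatistics ;
#                   dml:sample_count 2 ;
#                   dml:mean 440.1 ;
#                   dml:std_dev 1.3 ;
#                   dml:bin [ dml:bin_number 1 ;
#                             dml:bin_value 0.05 ;
#                             dml:bin_name 390.0 ]
#                   # ... one dml:bin node per histogram bin
#               ] .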

# Parse the input feature files (n3 or csv) and compute the
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)

    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from the feature file
        data = file_to_table(input_f_file)

        # keep only those rows which refer to the note A;
        # returns time, duration, frequency
        data = filter_norm_A(data)

        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data, 2)
            dur = string2numpy(data, 1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights=dur)
            hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

            # remember per-clip statistics
            perfile_freq.append(avg)
            perfile_hist.append(hist["count"])

            # remember the histogram index
            if len(hist_index) == 0:
                hist_index = hist["index"]

        else:
            # this version just pools everything per collection;
            # recordings are not treated as separate entities
            all_data.extend(data)

    if perfilestats:
        # statistics over the per-clip means
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        # average the per-clip histograms and return the average
        # as the collection histogram
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))
        hist = {"count": hist_avg, "index": hist_index}

    else:
        # get frequency and duration columns
        freq = string2numpy(all_data, 2)
        dur = string2numpy(all_data, 1)

        # get basic statistics
        avg, std = numstats(freq, weights=dur)

        # get the histogram weighted by duration
        hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count

# convert one column, specified by datapos, to a numpy array
def string2numpy(data, datapos):

    edata = []
    for row in data:
        edata.append(row[datapos])

    colu = numpy.array(edata, dtype=float)
    return colu

# calculate the histogram
# nbins: number of bins
# lb: lower bound
# ub: upper bound
# weights: optional per-sample weights (e.g. note durations)
def histogram(colu, nbins=100, lb=None, ub=None, weights=None):

    # fall back to the data range if no bounds are given
    if lb is None or ub is None:
        lb = colu.min()
        ub = colu.max()

    # get histogram
    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)

    # normalise for the clip (peak bin = 1), then convert to lists
    count = (count / float(numpy.max(count))).tolist()
    index = index.tolist()

    # return histogram
    return {"count": count, "index": index}
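
# Illustrative sketch (not part of the original pipeline): with nbins=100
# over [390, 490] each bin is 1 Hz wide, so a duration-weighted cluster
# around 440 Hz lands in the bin whose left edge is 440.0.
def _demo_histogram():
    freq = numpy.array([439.5, 440.2, 440.4])
    dur = numpy.array([0.5, 1.0, 1.0])
    h = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)
    print(h["index"][50], h["count"][50])  # 440.0 and 1.0 (the normalised peak)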


# calculates unweighted statistics over the per-clip histograms
def histostats(counts):
    avg = numpy.average(counts, axis=0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis=0)

    # med = numpy.median(counts).tolist()
    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)

# calculates weighted statistics for numerical input
def numstats(colu, weights=None):

    # we always aggregate over the first axis
    # get the weighted average
    avg = numpy.average(colu, axis=0, weights=weights)

    # weighted standard deviation
    # (numpy.std has no weights argument, hence the manual formula)
    std = numpy.sqrt(numpy.average((colu - avg) ** 2, axis=0, weights=weights))

    # med = numpy.median(colu).tolist()
    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)
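
# Illustrative sketch of the weighted statistics above: with note durations
# as weights, long notes pull the mean more than short ones.
def _demo_numstats():
    freq = numpy.array([438.0, 440.0, 443.0])
    dur = numpy.array([1.0, 1.0, 2.0])
    avg, std = numstats(freq, weights=dur)
    print(avg, std)  # avg == 441.0, pulled towards the long 443 Hz note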


# only returns data rows which refer to the note A;
# the frequencies are folded up / down to A4
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume the format time, duration, pitch, integer_pitch, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2 * row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5 * row[2]])

    return Adata
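
# Illustrative sketch of the octave folding above: A3 (around 220 Hz) is
# doubled and A5 (around 880 Hz) halved, so every retained row is
# comparable to A4; other notes are dropped.
def _demo_filter_norm_A():
    rows = [[0.0, 0.5, 220.0, 57, 'A3'],
            [0.5, 0.5, 440.0, 69, 'A4'],
            [1.0, 0.5, 880.0, 81, 'A5'],
            [1.5, 0.5, 261.6, 60, 'C4']]
    print(filter_norm_A(rows))  # all frequencies fold to 440.0; C4 is dropped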


# Read named features into a table of format
# time, feature[0], feature[1], ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
        # data = get_array_from_csv(input_f_file)
        # data = get_dict_from_csv(input_f_file, columtype=['time','duration','pitch','velocity','label'])
    return data


# Read named features into a table of format
# time, feature[0], feature[1], ...
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features
    q = """prefix event: <http://purl.org/NET/c4dm/event.owl#>
           prefix tl: <http://purl.org/NET/c4dm/timeline.owl#>
           prefix af: <http://purl.org/ontology/af/>
           prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
           SELECT ?event ?tl_time ?tl_duration ?feature ?label
           WHERE {
               ?event event:time ?event_time .
               ?event_time tl:beginsAt ?tl_time .
               ?event_time tl:duration ?tl_duration .
               ?event rdfs:label ?label .
               ?event af:feature ?feature .
           }"""

    # query the parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time: the literals look like "PT1.234S" (xsd:duration),
        # so strip the leading "PT" and the trailing "S"
        tl_time = float(row.tl_time[2:-1])

        # parse duration the same way
        tl_duration = float(row.tl_duration[2:-1])

        # parse feature
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume the format time, duration, pitch, velocity, label
    return data
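
# For reference, each note event in the n3 feature file is assumed to look
# roughly like this (a sketch; the exact shape depends on the feature
# extractor, but the timeline literals must carry the "PT...S" form that
# n3_to_table strips above):
#
#   :event_1 event:time [ tl:beginsAt "PT1.234S"^^xsd:duration ;
#                         tl:duration "PT0.500S"^^xsd:duration ] ;
#       af:feature "440.1" ;
#       rdfs:label "A4" .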


# todo: do the same conversion for csv; this should allow using the same
# script with csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the number of columns
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume the format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
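
# Minimal usage sketch (an assumed entry point; the original script defines
# none). The file URIs are hypothetical placeholders and must point to real
# note feature files for this to run.
if __name__ == "__main__":
    from rdflib import URIRef
    input_f_files = [URIRef("file:notes1.n3"), URIRef("file:notes2.n3")]
    stats, n = find_cla_tf_statistics(input_f_files)
    g = Graph()
    g.bind("dml", dml_ns)
    transform = BNode()  # stands in for a real transform resource
    add_tf_statistics_to_graph(stats, g, transform, n, input_f_files)
    print(g.serialize(format="turtle"))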