dml-open-backendtools: pyspark/transforms/tuningFrequencyStatistics.py @ 0:e34cf1b6fe09 (tip)

author: Daniel Wolff
date:   Sat, 20 Feb 2016 18:14:24 +0100
# -*- coding: utf-8 -*-
#
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

__author__ = "Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency,
# in particular:
# average
# standard deviation

from rdflib import Graph, Namespace, BNode, RDF, Literal
import codecs
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# statistics per clip?
perfilestats = 1

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # add base
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and std
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add histogram
    for i in range(len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i+1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph
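
# For orientation, the triples added above have roughly this shape
# (a Turtle-style sketch with illustrative values only):
#
#   <transform> dml:input <input_f_file> ;
#               dml:output _:stats .
#   _:stats a dml:TuningFrequencyStatistics ;
#           dml:sample_count 42 ;
#           dml:mean 440.1 ;
#           dml:std_dev 3.2 ;
#           dml:bin [ dml:bin_number 1 ; dml:bin_value 0.05 ; dml:bin_name 390.0 ] .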

# Parse the input_f_files n3 files, and generate
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)

    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from feature file
        data = file_to_table(input_f_file)

        # filter those rows which have an A
        # returns time, duration, frequency
        data = filter_norm_A(data)

        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data, 2)
            dur = string2numpy(data, 1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights=dur)
            hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

            # remember statistics
            perfile_freq.append(avg)
            perfile_hist.append(hist["count"])

            # remember histogram index
            if len(hist_index) == 0:
                hist_index = hist["index"]

        else:
            # this version just adds everything per collection,
            # recordings are not treated as separate entities
            all_data.extend(data)

    if perfilestats:
        # statistics over the per-clip means
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        # average the per-clip histograms and reuse the common bin edges
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))
        hist = {"count": hist_avg, "index": hist_index}

    else:
        # get frequency and duration columns
        freq = string2numpy(all_data, 2)
        dur = string2numpy(all_data, 1)

        # get basic statistics
        avg, std = numstats(freq, weights=dur)

        # get histogram weighted by duration
        hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count

# convert one column, specified by datapos, to numpy
def string2numpy(data, datapos):

    edata = []
    for row in data:
        edata.append(row[datapos])

    colu = numpy.array(edata, dtype=float)
    return colu

# calculates the histogram
# nbins: number of bins
# lb: lower bound
# ub: upper bound
def histogram(colu, nbins=100, lb=-1, ub=-1, weights=[]):

    # lower/upper bounds defined?
    if lb == -1 or ub == -1:
        lb = colu.min()
        ub = colu.max()

    # an empty weights list means "unweighted"
    if len(weights) == 0:
        weights = None

    # get histogram
    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)

    # normalise for clip (while count is still a numpy array)
    count = (count / float(numpy.max(count))).tolist()
    index = index.tolist()

    # return histogram
    return {"count": count, "index": index}
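
# Example (illustrative values): a duration-weighted histogram of detected
# frequencies between 390 and 490 Hz.
#
#   freq = numpy.array([438.0, 440.5, 441.0])
#   dur = numpy.array([0.5, 1.0, 0.25])
#   h = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)
#   # h["count"]: 100 normalised bin counts, h["index"]: 101 bin edges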

# calculates unweighted statistics for the histograms
def histostats(counts):
    avg = numpy.average(counts, axis=0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis=0)

    #med = numpy.median(colu, weights = weights).tolist()
    # could use https://pypi.python.org/pypi/wquantiles for weighted median

    return (avg, std)

# calculates weighted statistics for numerical input
def numstats(colu, weights=[]):

    # an empty weights list means "unweighted"
    if len(weights) == 0:
        weights = None

    # get average along the first axis
    avg = numpy.average(colu, axis=0, weights=weights)

    # weighted standard deviation
    std = numpy.sqrt(numpy.average((colu-avg)**2, axis=0, weights=weights))
    #std = numpy.std(colu, weights = weights).tolist()

    #med = numpy.median(colu, weights = weights).tolist()
    # could use https://pypi.python.org/pypi/wquantiles for weighted median

    return (avg, std)
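
# Example (illustrative values): duration-weighted mean and standard
# deviation of three detected pitches.
#
#   freq = numpy.array([438.0, 440.0, 442.0])
#   dur = numpy.array([1.0, 2.0, 1.0])
#   avg, std = numstats(freq, weights=dur)
#   # avg == (438*1 + 440*2 + 442*1) / 4 == 440.0
#   # std == sqrt((4*1 + 0*2 + 4*1) / 4) == sqrt(2)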

# only returns rows which refer to the note A
# the frequencies are folded up / down to A4
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume format time, duration, pitch, integer_pitch, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2*row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5*row[2]])

    return Adata
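
# Example: given rows in (time, duration, frequency, integer_pitch, label)
# form,
#
#   filter_norm_A([[0.0, 1.0, 220.5, 57, 'A3'],
#                  [1.0, 0.5, 882.0, 81, 'A5']])
#
# returns [[0.0, 1.0, 441.0], [1.0, 0.5, 441.0]]: both detections are
# folded into the A4 octave (nominally 440 Hz).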

# Read named features into table of format
# time, feature[0], feature[1] ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
        #data = get_array_from_csv(input_f_file)
        #data = get_dict_from_csv(input_f_file,columtype = ['time','duration','pitch','velocity','label'])
    else:
        raise ValueError("Unsupported feature file format: " + input_f_file)
    return data

# Read named features into table of format
# time, feature[0], feature[1] ...
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features
    q = """prefix dml: <http://dml.org/dml/cla#>
        SELECT ?event ?tl_time ?tl_duration ?feature ?label
        WHERE {
            ?event event:time ?event_time .
            ?event_time tl:beginsAt ?tl_time .
            ?event_time tl:duration ?tl_duration .
            ?event rdfs:label ?label .
            ?event af:feature ?feature .
        }"""

    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time
        tl_time_str_len = len(row.tl_time)
        tl_time = float(row.tl_time[2:tl_time_str_len-1])

        # parse duration
        tl_dur_str_len = len(row.tl_duration)
        tl_duration = float(row.tl_duration[2:tl_dur_str_len-1])
        # parse feature
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume format time, duration, pitch, velocity, label
    return data
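
# Note on the literal slicing above: assuming the timeline values are
# xsd:duration-style strings such as "PT1.5S", len("PT1.5S") == 6, so
# row.tl_time[2:5] == "1.5" and float("1.5") yields the onset in seconds.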

# TODO: do the same conversion for csv, so that the same script can be used with csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the number of columns
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
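
# Minimal usage sketch, assuming feature files in the format expected by
# file_to_table; the paths, graph and transform node below are placeholders.
if __name__ == "__main__":
    input_f_files = ["example_clip1.n3", "example_clip2.n3"]  # hypothetical paths

    statistics, sample_count = find_cla_tf_statistics(input_f_files)

    g = Graph()
    transform = BNode()  # stand-in for the transform resource
    g = add_tf_statistics_to_graph(statistics, g, transform, sample_count,
                                   [Literal(f) for f in input_f_files])
    print(g.serialize(format="turtle"))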