pyspark/transforms/tuningFrequencyStatistics.py @ 0:e34cf1b6fe09 (tip)

author Daniel Wolff
date   Sat, 20 Feb 2016 18:14:24 +0100
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__ = "Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency,
# in particular:
# average
# standard deviation

from rdflib import Graph, Namespace, BNode, RDF, Literal
import codecs
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# compute statistics per clip (1) rather than over the whole collection (0)
perfilestats = 1

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # link the output node to the transform and its input files
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and standard deviation
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add histogram: one blank node per bin with its number, value and name
    for i in range(0, len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i + 1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph

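# Hedged usage sketch (hypothetical URIs and values): builds a small graph
# so the triple layout produced above can be inspected. Not part of the
# pipeline; call it manually if needed.
def _example_tf_graph():
    from rdflib import URIRef
    stats = {"mean": 440.1, "std-dev": 1.2,
             "histogram": {"count": [0.5, 1.0], "index": [390.0, 391.0, 392.0]}}
    transform = URIRef("http://example.org/transform/1")       # hypothetical
    inputs = [URIRef("http://example.org/features/clip1.n3")]  # hypothetical
    g = add_tf_statistics_to_graph(stats, Graph(), transform, 1, inputs)
    return g.serialize(format="n3")
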
# Parse the input_f_files feature files and derive the
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)

    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from the feature file
        data = file_to_table(input_f_file)

        # keep only rows which refer to an A;
        # returns time, duration, frequency
        data = filter_norm_A(data)

        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data, 2)
            dur = string2numpy(data, 1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights=dur)
            hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

            # remember per-clip statistics
            perfile_freq.append(avg)
            perfile_hist.append(hist["count"])

            # remember the histogram index (bin edges, identical for all clips)
            if len(hist_index) == 0:
                hist_index = hist["index"]

        else:
            # this version just aggregates everything per collection;
            # recordings are not treated as separate entities
            all_data.extend(data)

    if perfilestats:
        # statistics over the per-clip means and histograms
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))
        # report the averaged histogram rather than the last clip's
        hist = {"count": hist_avg, "index": hist_index}

    else:
        # get frequency and duration columns
        freq = string2numpy(all_data, 2)
        dur = string2numpy(all_data, 1)

        # get basic statistics
        avg, std = numstats(freq, weights=dur)

        # get histogram weighted by duration
        hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count

# convert one column, specified by datapos, to a numpy float array
def string2numpy(data, datapos):

    edata = []
    for row in data:
        edata.append(row[datapos])

    colu = numpy.array(edata, dtype=float)
    return colu

# calculates the histogram
# nbins: number of bins
# lb: lower bound
# ub: upper bound
def histogram(colu, nbins=100, lb=-1, ub=-1, weights=None):

    # bounds defined? otherwise take them from the data
    if lb == -1 or ub == -1:
        lb = colu.min()
        ub = colu.max()

    # get histogram; count has nbins entries, index the nbins+1 bin edges
    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)

    # normalise for clip while count is still a numpy array
    count = (count / float(numpy.max(count))).tolist()
    index = index.tolist()

    # return histogram
    return {"count": count, "index": index}

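# Illustrative sketch of the histogram call as used elsewhere in this
# script: 100 bins over the 390-490 Hz window, weighted by note duration.
def _example_histogram():
    freq = numpy.array([439.0, 440.0, 441.0])
    dur = numpy.array([0.5, 2.0, 0.5])
    h = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)
    # h["count"] holds 100 normalised counts (max is 1.0),
    # h["index"] the 101 bin edges from 390.0 to 490.0
    return h
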
# calculates unweighted statistics for the histograms
def histostats(counts):
    # unweighted mean over all clips
    avg = numpy.average(counts, axis=0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis=0)

    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)

# calculates weighted statistics for numerical input
def numstats(colu, weights=None):

    # weighted average over the first axis
    avg = numpy.average(colu, axis=0, weights=weights)

    # weighted standard deviation; numpy.std takes no weights,
    # hence the explicit formula
    std = numpy.sqrt(numpy.average((colu - avg)**2, axis=0, weights=weights))

    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)

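# Worked sketch with illustrative numbers: durations act as weights, so
#   mean = (1*439 + 3*441) / (1+3)                         = 440.5
#   std  = sqrt((1*(439-440.5)**2 + 3*(441-440.5)**2) / 4) ~= 0.866
def _example_numstats():
    return numstats(numpy.array([439.0, 441.0]),
                    weights=numpy.array([1.0, 3.0]))
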
# only returns data rows which refer to the note A;
# the frequencies are folded up / down to A4
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume format time, duration, pitch, integer_pitch, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2 * row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5 * row[2]])

    return Adata

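# Sketch of the octave folding on hypothetical rows: an A3 at 220.0 Hz is
# doubled to 440.0 Hz, an A5 at 880.0 Hz is halved, A4 passes through,
# and non-A rows are dropped.
def _example_filter_norm_A():
    rows = [[0.0, 0.5, 220.0, 57, "A3"],
            [0.5, 0.5, 440.0, 69, "A4"],
            [1.0, 0.5, 880.0, 81, "A5"],
            [1.5, 0.5, 261.6, 60, "C4"]]
    return filter_norm_A(rows)  # -> three [time, duration, 440.0] rows
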
# Read named features into a table of format
# time, feature[0], feature[1], ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
        #data = get_array_from_csv(input_f_file)
        #data = get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
    return data

# Read named features into a table of format
# time, feature[0], feature[1], ...
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features; the event:, tl:, af: and rdfs: prefixes
    # are expected to be bound in the parsed graph's namespaces
    q = """prefix dml: <http://dml.org/dml/cla#>
        SELECT ?event ?tl_time ?tl_duration ?feature ?label
        WHERE {
            ?event event:time ?event_time .
            ?event_time tl:beginsAt ?tl_time .
            ?event_time tl:duration ?tl_duration .
            ?event rdfs:label ?label .
            ?event af:feature ?feature .
        }"""

    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time from a literal of the form "PT<seconds>S"
        tl_time_str_len = len(row.tl_time)
        tl_time = float(row.tl_time[2:tl_time_str_len - 1])

        # parse duration the same way
        tl_dur_str_len = len(row.tl_duration)
        tl_duration = float(row.tl_duration[2:tl_dur_str_len - 1])

        # parse feature: a space-separated list of numbers, plus the label
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume format time, duration, pitch, velocity, label
    return data

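# Hedged check of the literal slicing above, assuming timeline values of
# the shape "PT<seconds>S" (e.g. "PT1.5S"):
def _example_time_literal():
    lit = "PT1.5S"                     # hypothetical tl:beginsAt literal
    return float(lit[2:len(lit) - 1])  # strips "PT" and "S" -> 1.5
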
# the same conversion for csv, so the same script works with csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the number of columns (taken from the first row)
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
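
# End-to-end sketch (hypothetical file URI): parse one feature file,
# derive the statistics, and attach them to a fresh RDF graph.
def _example_pipeline():
    from rdflib import URIRef
    inputs = ["file:///data/features/clip1.n3"]  # hypothetical
    stats, n = find_cla_tf_statistics(inputs)
    transform = URIRef("http://example.org/transform/tf-stats")  # hypothetical
    return add_tf_statistics_to_graph(stats, Graph(), transform, n,
                                      [URIRef(f) for f in inputs])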