# -*- coding: utf-8 -*-
# pyspark/transforms/tuningFrequencyStatistics.py
# rev 0:e34cf1b6fe09 (tip), Daniel Wolff, Sat, 20 Feb 2016 18:14:24 +0100
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

__author__ = "Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency,
# in particular:
# * (duration-weighted) average
# * (duration-weighted) standard deviation
# * a histogram of tuning frequencies

from rdflib import Graph, Namespace, BNode, RDF, Literal
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# compute statistics per clip (file) first and aggregate over clips,
# instead of pooling all rows of the collection
perfilestats = True

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # add base node and link it to the transform and its inputs
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and standard deviation
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add histogram, one blank node per bin
    for i in range(len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i + 1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph
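
# For illustration (a sketch, not emitted verbatim): the triples added
# above amount to something like the following Turtle, with the transform
# node and blank-node identifiers depending on the caller:
#
#   ?transform dml:output _:stats ; dml:input <file.n3> .
#   _:stats a dml:TuningFrequencyStatistics ;
#       dml:sample_count 2 ; dml:mean 440.1 ; dml:std_dev 0.8 ;
#       dml:bin [ dml:bin_number 1 ; dml:bin_value 0.2 ; dml:bin_name 390.0 ] .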

# Parse the input_f_files (n3 or csv feature files) and derive
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)

    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from feature file
        data = file_to_table(input_f_file)

        # filter those rows which refer to the note A
        # returns time, duration, frequency (folded to A4)
        data = filter_norm_A(data)

        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data, 2)
            dur = string2numpy(data, 1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights=dur)
            hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

            # remember per-clip statistics
            perfile_freq.append(avg)
            perfile_hist.append(hist["count"])

            # remember histogram index (identical for all clips,
            # as the bin bounds are fixed)
            if len(hist_index) == 0:
                hist_index = hist["index"]

        else:
            # this version just pools everything per collection,
            # recordings are not treated as separate entities
            all_data.extend(data)

    if perfilestats:
        # aggregate the per-clip statistics
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))

        # report the bin-wise average of the per-clip histograms
        hist = {"count": hist_avg, "index": hist_index}

    else:
        # get frequency and duration columns
        freq = string2numpy(all_data, 2)
        dur = string2numpy(all_data, 1)

        # get basic statistics
        avg, std = numstats(freq, weights=dur)

        # get histogram weighted by duration
        hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count
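
# Example usage (hypothetical file names):
#
#   stats, n = find_cla_tf_statistics(["clip1.n3", "clip2.csv"])
#   print stats["mean"], stats["std-dev"]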

# convert one column, specified by datapos, to a numpy array
def string2numpy(data, datapos):
    return numpy.array([row[datapos] for row in data], dtype=float)

# calculates the histogram
# nbins: number of bins
# lb: lower bound (taken from the data if not given)
# ub: upper bound (taken from the data if not given)
def histogram(colu, nbins=100, lb=None, ub=None, weights=None):

    # bounds defined?
    if lb is None or ub is None:
        lb = colu.min()
        ub = colu.max()

    # get histogram, weighted if weights are given
    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)

    # normalise for clip (guard against an all-zero histogram)
    maxcount = count.max()
    if maxcount > 0:
        count = count / float(maxcount)

    # return histogram as plain lists
    return {"count": count.tolist(), "index": index.tolist()}
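
# For example, with the parameters used above (nbins=100, lb=390, ub=490)
# each bin spans 1 Hz, so a clip tuned close to 440 Hz concentrates its
# duration-weighted mass in the bin whose index entry reads 440.0.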


# calculates unweighted statistics over the rows of counts
# (mean and standard deviation per column)
def histostats(counts):
    avg = numpy.average(counts, axis=0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis=0).tolist()

    return (avg, std)

# calculates weighted statistics for numerical input
def numstats(colu, weights=None):

    # we always average over the first dimension
    # get weighted average
    avg = numpy.average(colu, axis=0, weights=weights)

    # weighted standard deviation
    std = numpy.sqrt(numpy.average((colu - avg) ** 2, axis=0, weights=weights))
    # std = numpy.std(colu, weights = weights).tolist()

    # med = numpy.median(colu, weights = weights).tolist()
    # could use https://pypi.python.org/pypi/wquantiles for weighted median

    return (avg, std)
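
# Worked example: numstats(numpy.array([439., 441.]), weights=numpy.array([3., 1.]))
# yields avg = (3*439 + 441)/4 = 439.5 and
# std = sqrt((3*0.25 + 2.25)/4) = sqrt(0.75) ~ 0.866.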


# only returns data rows which refer to the note A
# the frequencies are folded up / down to A4 (one octave = factor 2)
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume format time, duration, pitch, integer_pitch, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2 * row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5 * row[2]])

    return Adata
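
# e.g. an A3 detected at 220.5 Hz contributes 441.0 Hz and an A5 at
# 881.0 Hz contributes 440.5 Hz, while A4 rows pass through unchanged.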


# Read named features into table of format
# time, feature[0], feature[1], ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
        # data = get_array_from_csv(input_f_file)
        # data = get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
    else:
        raise ValueError("Unsupported feature file format: " + input_f_file)
    return data

# Read named features into table of format
# time, duration, feature[0], feature[1], ..., label
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features; the event:, tl:, rdfs: and af: prefixes
    # are expected to be bound in the parsed graph
    q = """prefix dml: <http://dml.org/dml/cla#>
           SELECT ?event ?tl_time ?tl_duration ?feature ?label
           WHERE {
               ?event event:time ?event_time .
               ?event_time tl:beginsAt ?tl_time .
               ?event_time tl:duration ?tl_duration .
               ?event rdfs:label ?label .
               ?event af:feature ?feature .
           }"""

    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time: values look like "PT12.34S", so strip the
        # leading "PT" and the trailing "S"
        tl_time = float(row.tl_time[2:len(row.tl_time) - 1])

        # parse duration the same way
        tl_duration = float(row.tl_duration[2:len(row.tl_duration) - 1])

        # parse feature: space-separated numbers, followed by the label
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume format time, duration, pitch, velocity, label
    return data
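
# For reference, the query above matches feature files shaped roughly
# like this (a sketch; prefixes and identifiers depend on the extractor):
#
#   :event_1 event:time [ tl:beginsAt "PT1.50S" ; tl:duration "PT0.25S" ] ;
#            af:feature "440.1 69" ;
#            rdfs:label "A4" .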

# csv counterpart of n3_to_table: reads a csv feature file into the
# same table format, so the same script works for csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the expected number of columns,
    # taken from the first row
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
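

# A minimal command-line sketch (an assumption, not part of the original
# pipeline): compute statistics for the feature files given as arguments.
if __name__ == "__main__":
    import sys
    stats, n = find_cla_tf_statistics(sys.argv[1:])
    print "mean:", stats["mean"], "std-dev:", stats["std-dev"], "files:", n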