# -*- coding: utf-8 -*-
# pyspark/transforms/tuningFrequencyStatistics.py
# rev 0:e34cf1b6fe09 (tip), Daniel Wolff, Sat, 20 Feb 2016 18:14:24 +0100
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

__author__ = "Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency,
# in particular:
# * (duration-weighted) average
# * (duration-weighted) standard deviation
# * a histogram of tuning frequencies

from rdflib import Graph, Namespace, BNode, RDF, Literal
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# compute statistics per clip (file) first and aggregate over clips,
# instead of pooling all rows of the collection
perfilestats = True

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # add base node and link it to the transform and its inputs
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and standard deviation
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add histogram, one blank node per bin
    for i in range(len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i + 1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph
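
# For illustration (a sketch, not emitted verbatim): the triples added
# above amount to something like the following Turtle, with the transform
# node and blank-node identifiers depending on the caller:
#
#   ?transform dml:output _:stats ; dml:input <file.n3> .
#   _:stats a dml:TuningFrequencyStatistics ;
#       dml:sample_count 2 ; dml:mean 440.1 ; dml:std_dev 0.8 ;
#       dml:bin [ dml:bin_number 1 ; dml:bin_value 0.2 ; dml:bin_name 390.0 ] .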

# Parse the input_f_files (n3 or csv feature files) and derive
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)

    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from feature file
        data = file_to_table(input_f_file)

        # filter those rows which refer to the note A
        # returns time, duration, frequency (folded to A4)
        data = filter_norm_A(data)

        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data, 2)
            dur = string2numpy(data, 1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights=dur)
            hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

            # remember per-clip statistics
            perfile_freq.append(avg)
            perfile_hist.append(hist["count"])

            # remember histogram index (identical for all clips,
            # as the bin bounds are fixed)
            if len(hist_index) == 0:
                hist_index = hist["index"]

        else:
            # this version just pools everything per collection,
            # recordings are not treated as separate entities
            all_data.extend(data)

    if perfilestats:
        # aggregate the per-clip statistics
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))

        # report the bin-wise average of the per-clip histograms
        hist = {"count": hist_avg, "index": hist_index}

    else:
        # get frequency and duration columns
        freq = string2numpy(all_data, 2)
        dur = string2numpy(all_data, 1)

        # get basic statistics
        avg, std = numstats(freq, weights=dur)

        # get histogram weighted by duration
        hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count
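
# Example usage (hypothetical file names):
#
#   stats, n = find_cla_tf_statistics(["clip1.n3", "clip2.csv"])
#   print stats["mean"], stats["std-dev"]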

# convert one column, specified by datapos, to a numpy array
def string2numpy(data, datapos):
    return numpy.array([row[datapos] for row in data], dtype=float)

# calculates the histogram
# nbins: number of bins
# lb: lower bound (taken from the data if not given)
# ub: upper bound (taken from the data if not given)
def histogram(colu, nbins=100, lb=None, ub=None, weights=None):

    # bounds defined?
    if lb is None or ub is None:
        lb = colu.min()
        ub = colu.max()

    # get histogram, weighted if weights are given
    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)

    # normalise for clip (guard against an all-zero histogram)
    maxcount = count.max()
    if maxcount > 0:
        count = count / float(maxcount)

    # return histogram as plain lists
    return {"count": count.tolist(), "index": index.tolist()}
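
# For example, with the parameters used above (nbins=100, lb=390, ub=490)
# each bin spans 1 Hz, so a clip tuned close to 440 Hz concentrates its
# duration-weighted mass in the bin whose index entry reads 440.0.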


# calculates unweighted statistics over the rows of counts
# (mean and standard deviation per column)
def histostats(counts):
    avg = numpy.average(counts, axis=0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis=0).tolist()

    return (avg, std)

# calculates weighted statistics for numerical input
def numstats(colu, weights=None):

    # we always average over the first dimension
    # get weighted average
    avg = numpy.average(colu, axis=0, weights=weights)

    # weighted standard deviation
    std = numpy.sqrt(numpy.average((colu - avg) ** 2, axis=0, weights=weights))
    # std = numpy.std(colu, weights = weights).tolist()

    # med = numpy.median(colu, weights = weights).tolist()
    # could use https://pypi.python.org/pypi/wquantiles for weighted median

    return (avg, std)
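
# Worked example: numstats(numpy.array([439., 441.]), weights=numpy.array([3., 1.]))
# yields avg = (3*439 + 441)/4 = 439.5 and
# std = sqrt((3*0.25 + 2.25)/4) = sqrt(0.75) ~ 0.866.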


# only returns data rows which refer to the note A
# the frequencies are folded up / down to A4 (one octave = factor 2)
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume format time, duration, pitch, integer_pitch, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2 * row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5 * row[2]])

    return Adata
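
# e.g. an A3 detected at 220.5 Hz contributes 441.0 Hz and an A5 at
# 881.0 Hz contributes 440.5 Hz, while A4 rows pass through unchanged.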


# Read named features into table of format
# time, feature[0], feature[1], ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
        # data = get_array_from_csv(input_f_file)
        # data = get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
    else:
        raise ValueError("Unsupported feature file format: " + input_f_file)
    return data

# Read named features into table of format
# time, duration, feature[0], feature[1], ..., label
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features; the event:, tl:, rdfs: and af: prefixes
    # are expected to be bound in the parsed graph
    q = """prefix dml: <http://dml.org/dml/cla#>
           SELECT ?event ?tl_time ?tl_duration ?feature ?label
           WHERE {
               ?event event:time ?event_time .
               ?event_time tl:beginsAt ?tl_time .
               ?event_time tl:duration ?tl_duration .
               ?event rdfs:label ?label .
               ?event af:feature ?feature .
           }"""

    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time: values look like "PT12.34S", so strip the
        # leading "PT" and the trailing "S"
        tl_time = float(row.tl_time[2:len(row.tl_time) - 1])

        # parse duration the same way
        tl_duration = float(row.tl_duration[2:len(row.tl_duration) - 1])

        # parse feature: space-separated numbers, followed by the label
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume format time, duration, pitch, velocity, label
    return data
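
# For reference, the query above matches feature files shaped roughly
# like this (a sketch; prefixes and identifiers depend on the extractor):
#
#   :event_1 event:time [ tl:beginsAt "PT1.50S" ; tl:duration "PT0.25S" ] ;
#            af:feature "440.1 69" ;
#            rdfs:label "A4" .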

# csv counterpart of n3_to_table: reads a csv feature file into the
# same table format, so the same script works for csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the expected number of columns,
    # taken from the first row
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
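

# A minimal command-line sketch (an assumption, not part of the original
# pipeline): compute statistics for the feature files given as arguments.
if __name__ == "__main__":
    import sys
    stats, n = find_cla_tf_statistics(sys.argv[1:])
    print "mean:", stats["mean"], "std-dev:", stats["std-dev"], "files:", n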