Mercurial > hg > dml-open-backendtools
view pyspark/transforms/tuningFrequencyStatistics.py @ 0:e34cf1b6fe09 tip
commit
author | Daniel Wolff |
---|---|
date | Sat, 20 Feb 2016 18:14:24 +0100 |
parents | |
children |
line wrap: on
line source
# Part of DML (Digital Music Laboratory) # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # -*- coding: utf-8 -*- __author__="Daniel Wolff, hargreaves" # this script derives standard statistics for tuning frequency, # in particular: # average # standard deviation from rdflib import Graph, Namespace, BNode, RDF, Literal import codecs import warnings import numpy import csv from n3Parser import get_rdf_graph_from_n3, uri2path # from csvParser import get_dict_from_csv, get_array_from_csv # statistics per clip ? perfilestats = 1 # dml namespace dml_ns = Namespace("http://dml.org/dml/cla#") # Add triples representing a 'key histogram' result to # an RDF graph def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files): # add base output_bnode = BNode() output_rdf_graph.add((transform, dml_ns.output, output_bnode)) for input_f_file in input_f_files: output_rdf_graph.add((transform, dml_ns.input, input_f_file)) output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics)) output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count))) # add mean and std output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"]))) output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"]))) # add histogram for i in range(0,len(statistics["histogram"]["count"])): bin_bnode = BNode() output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode)) output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i+1))) output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i]))) output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i]))) return output_rdf_graph # Parse the input_f_files n3 files, and generate # a key histogram def find_cla_tf_statistics(input_f_files): sample_count = len(input_f_files) all_data = [] perfile_freq = [] perfile_hist = [] hist_index =[] for input_f_file in input_f_files: # get all data from feature file data = file_to_table(input_f_file) # filter those rows which have an A # returns duration, frequency data = filter_norm_A(data) if perfilestats: # get frequency and duration columns freq = string2numpy(data,2) dur = string2numpy(data,1) # get mean values per clip now, # then statistics over clips later avg, std = numstats(freq, weights = dur) hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur) # remember statistics perfile_freq.append(avg) perfile_hist.append(hist["count"]) # remember histogram index if len(hist_index) == 0: hist_index = hist["index"] else: # this version just adds everything per collection, # recordings are not treated as seperate entities all_data.extend(data) if perfilestats: avg, std = histostats(numpy.array(perfile_freq,dtype=float)) hist_avg, hist_std = histostats(numpy.array(perfile_hist,dtype=float)) else: # get frequency and duration columns freq = string2numpy(all_data,2) dur = string2numpy(all_data,1) # get basic statistics avg, std = numstats(freq, weights = dur) # get histogram weighted by duration hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur) return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count#(key_hist, num_f_files) # convert one column, specified by datapos, to numpy def string2numpy(data,datapos): edata = [] for row in data: edata.append(row[datapos]) colu = numpy.array(edata,dtype=float) return colu #calculates the histogram # nbins: number of bins # lb: lower bound # ub: upper bound def histogram(colu, nbins = 100, lb=-1, ub=-1, weights = []): # lower bounds defined? if lb == -1 or ub == -1: lb = colu.min() ub = colu.max() # get histogram count,index = numpy.histogram(colu,bins=nbins,range = [lb, ub],weights = weights) count = count.tolist() index = index.tolist() # normalise for clip count = count / numpy.max(count) # return histogram return {"count":count, "index":index} # calculates unweighted statistics for the histograms def histostats(counts): avg = numpy.average(counts, axis = 0).tolist() #weighted standard deviation std = numpy.std(counts, axis =0) #med = numpy.median(colu, weights = weights).tolist() # could use https://pypi.python.org/pypi/wquantiles for weighted median return (avg,std) #calculates weighted statistics for numerical input def numstats(colu, weights = []): # we want to always use the last dimension # get average avg = numpy.average(colu, axis = 0 ,weights = weights) #weighted standard deviation std = numpy.sqrt(numpy.average((colu-avg)**2, axis = 0, weights=weights)) #std = numpy.std(colu, weights = weights).tolist() #med = numpy.median(colu, weights = weights).tolist() # could use https://pypi.python.org/pypi/wquantiles for weighted median return (avg,std) # only returns data columns which refer to the note A # the frequencies are folded up / down to A4 # returns time, duration, frequency def filter_norm_A(data): Adata = [] for row in data: # we assume format time , duration , pitch, ingeger_pitch, label if 'A3' in row[4]: Adata.append(row[:2] + [2*row[2]]) elif 'A4' in row[4]: Adata.append(row[:3]) elif 'A5' in row[4]: Adata.append(row[:2] + [0.5*row[2]]) return Adata # Read named features into table of format # time, feature[0], feature[1} ... def file_to_table(input_f_file): if input_f_file.endswith('.n3'): data = n3_to_table(input_f_file) elif input_f_file.endswith('.csv'): data = csv_to_table(input_f_file) #data = get_array_from_csv(input_f_file) #data = get_dict_from_csv(input_f_file,columtype = ['time','duration','pitch','velocity','label']) return data # Read named features into table of format # time, feature[0], feature[1} ... def n3_to_table(input_f_file): # read feature file feature_graph = get_rdf_graph_from_n3(input_f_file) # we construct a generic search string that gets all # necessary features q = """prefix dml: <http://dml.org/dml/cla#> SELECT ?event ?tl_time ?tl_duration ?feature ?label WHERE { ?event event:time ?event_time . ?event_time tl:beginsAt ?tl_time . ?event_time tl:duration ?tl_duration . ?event rdfs:label ?label . ?event af:feature ?feature . }""" # query parsed file qres = feature_graph.query(q) data = [] for row in qres: # parse time tl_time_str_len = len(row.tl_time) tl_time = float(row.tl_time[2:tl_time_str_len-1]) # parse duration tl_dur_str_len = len(row.tl_duration) tl_duration = row.tl_duration[2:tl_dur_str_len-1] # parse feature data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ') ] + [row.label]) #data = numpy.array(data, dtype=float) # print data # we assume format time , duration , pitch, velocity, label return data #int(last_key) # todo: do the same conversion for csv, should allow to use the same script with csv def csv_to_table(input_f_file): output = [] badcount = 0 # keep track of column names ncols = 0 with open(uri2path(input_f_file), 'rb') as csvfile: contents = csv.reader(csvfile, delimiter=',', quotechar='"') for row in contents: if ncols == 0: ncols = len(row) if len(row) >= ncols: # we assume format time , duration , pitch, velocity, label output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:]) else: badcount += 1 if badcount > 0: warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries") return output