diff pyspark/transforms/tuningFrequencyStatistics.py @ 0:e34cf1b6fe09 tip
commit   | 0:e34cf1b6fe09 (tip)
author   | Daniel Wolff
date     | Sat, 20 Feb 2016 18:14:24 +0100
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyspark/transforms/tuningFrequencyStatistics.py	Sat Feb 20 18:14:24 2016 +0100
@@ -0,0 +1,280 @@
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__ = "Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency,
# in particular:
#   * duration-weighted average
#   * duration-weighted standard deviation
#   * a histogram of the observed frequencies (390-490 Hz)

from rdflib import Graph, Namespace, BNode, RDF, Literal
import codecs
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# compute statistics per clip (1) or over the pooled collection (0)
perfilestats = 1

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning-frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # add base
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and std
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add histogram
    for i in range(0, len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i + 1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph
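# For orientation, the triples emitted above describe roughly the
# following shape (an illustrative Turtle sketch with made-up values,
# not output produced by this module):
#
#   <transform> dml:input <clip1.n3> ;
#               dml:output [
#                   a dml:TuningFrequencyStatistics ;
#                   dml:sample_count 2 ;
#                   dml:mean 440.1 ;
#                   dml:std_dev 1.3 ;
#                   dml:bin [ dml:bin_number 1 ;
#                             dml:bin_value 0.05 ;
#                             dml:bin_name 390.0 ] ,
#                           ...
#               ] .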
hist["index"] + + else: + # this version just adds everything per collection, + # recordings are not treated as seperate entities + all_data.extend(data) + + + if perfilestats: + avg, std = histostats(numpy.array(perfile_freq,dtype=float)) + hist_avg, hist_std = histostats(numpy.array(perfile_hist,dtype=float)) + + else: + # get frequency and duration columns + freq = string2numpy(all_data,2) + dur = string2numpy(all_data,1) + + # get basic statistics + avg, std = numstats(freq, weights = dur) + + # get histogram weighted by duration + hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur) + + return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count#(key_hist, num_f_files) + +# convert one column, specified by datapos, to numpy +def string2numpy(data,datapos): + + edata = [] + for row in data: + edata.append(row[datapos]) + + colu = numpy.array(edata,dtype=float) + return colu + +#calculates the histogram +# nbins: number of bins +# lb: lower bound +# ub: upper bound +def histogram(colu, nbins = 100, lb=-1, ub=-1, weights = []): + + # lower bounds defined? + if lb == -1 or ub == -1: + lb = colu.min() + ub = colu.max() + + # get histogram + count,index = numpy.histogram(colu,bins=nbins,range = [lb, ub],weights = weights) + count = count.tolist() + index = index.tolist() + + # normalise for clip + count = count / numpy.max(count) + + # return histogram + return {"count":count, "index":index} + + +# calculates unweighted statistics for the histograms +def histostats(counts): + avg = numpy.average(counts, axis = 0).tolist() + + #weighted standard deviation + std = numpy.std(counts, axis =0) + + #med = numpy.median(colu, weights = weights).tolist() + # could use https://pypi.python.org/pypi/wquantiles for weighted median + + return (avg,std) + +#calculates weighted statistics for numerical input +def numstats(colu, weights = []): + + # we want to always use the last dimension + # get average + avg = numpy.average(colu, axis = 0 ,weights = weights) + + #weighted standard deviation + std = numpy.sqrt(numpy.average((colu-avg)**2, axis = 0, weights=weights)) + #std = numpy.std(colu, weights = weights).tolist() + + #med = numpy.median(colu, weights = weights).tolist() + # could use https://pypi.python.org/pypi/wquantiles for weighted median + + return (avg,std) + + +# only returns data columns which refer to the note A +# the frequencies are folded up / down to A4 +# returns time, duration, frequency +def filter_norm_A(data): + Adata = [] + for row in data: + # we assume format time , duration , pitch, ingeger_pitch, label + if 'A3' in row[4]: + Adata.append(row[:2] + [2*row[2]]) + elif 'A4' in row[4]: + Adata.append(row[:3]) + elif 'A5' in row[4]: + Adata.append(row[:2] + [0.5*row[2]]) + + return Adata + + +# Read named features into table of format +# time, feature[0], feature[1} ... +def file_to_table(input_f_file): + if input_f_file.endswith('.n3'): + data = n3_to_table(input_f_file) + elif input_f_file.endswith('.csv'): + data = csv_to_table(input_f_file) + #data = get_array_from_csv(input_f_file) + #data = get_dict_from_csv(input_f_file,columtype = ['time','duration','pitch','velocity','label']) + return data + + +# Read named features into table of format +# time, feature[0], feature[1} ... 
# Read features from an n3 file into a table of format
# time, duration, feature[0], ..., label
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features; prefixes are declared explicitly so the
    # query does not depend on bindings inherited from the parsed file
    q = """prefix dml: <http://dml.org/dml/cla#>
           prefix event: <http://purl.org/NET/c4dm/event.owl#>
           prefix tl: <http://purl.org/NET/c4dm/timeline.owl#>
           prefix af: <http://purl.org/ontology/af/>
           prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
           SELECT ?event ?tl_time ?tl_duration ?feature ?label
           WHERE {
               ?event event:time ?event_time .
               ?event_time tl:beginsAt ?tl_time .
               ?event_time tl:duration ?tl_duration .
               ?event rdfs:label ?label .
               ?event af:feature ?feature .
           }"""

    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time, e.g. "PT1.5S" -> 1.5
        tl_time = float(row.tl_time[2:len(row.tl_time) - 1])

        # parse duration
        tl_duration = float(row.tl_duration[2:len(row.tl_duration) - 1])

        # parse feature
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume format: time, duration, pitch, velocity, label
    return data

# the same conversion for csv, so the same script works with csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the expected number of columns
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format: time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
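# Minimal usage sketch (not part of the original transform; the feature
# file path and the use of a blank node as the transform identifier are
# hypothetical placeholders): derive statistics for a set of n3 feature
# files and attach the result to a fresh RDF graph.
if __name__ == '__main__':
    from rdflib import URIRef

    # hypothetical input feature file
    input_f_files = [URIRef("file:///data/features/clip1.n3")]

    statistics, sample_count = find_cla_tf_statistics(input_f_files)

    g = Graph()
    transform = BNode()  # or a URIRef identifying this transform run
    add_tf_statistics_to_graph(statistics, g, transform, sample_count, input_f_files)
    print g.serialize(format='n3')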