pyspark/transforms/tuningFrequencyStatistics.py @ 0:e34cf1b6fe09 (tip)

author Daniel Wolff
date   Sat, 20 Feb 2016 18:14:24 +0100
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__ = "Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency,
# in particular:
# average
# standard deviation

from rdflib import Graph, Namespace, BNode, RDF, Literal
import codecs
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path
# from csvParser import get_dict_from_csv, get_array_from_csv

# compute statistics per clip (1) rather than over the whole collection (0)
perfilestats = 1

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):

    # link the output node to the transform and its input files
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and standard deviation
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))

    # add histogram: one blank node per bin with its number, value and name
    for i in range(0, len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i + 1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph

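# Hedged usage sketch (hypothetical URIs and values): builds a small graph
# so the triple layout produced above can be inspected. Not part of the
# pipeline; call it manually if needed.
def _example_tf_graph():
    from rdflib import URIRef
    stats = {"mean": 440.1, "std-dev": 1.2,
             "histogram": {"count": [0.5, 1.0], "index": [390.0, 391.0, 392.0]}}
    transform = URIRef("http://example.org/transform/1")       # hypothetical
    inputs = [URIRef("http://example.org/features/clip1.n3")]  # hypothetical
    g = add_tf_statistics_to_graph(stats, Graph(), transform, 1, inputs)
    return g.serialize(format="n3")
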
# Parse the input_f_files feature files and derive the
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)

    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from the feature file
        data = file_to_table(input_f_file)

        # keep only rows which refer to an A;
        # returns time, duration, frequency
        data = filter_norm_A(data)

        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data, 2)
            dur = string2numpy(data, 1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights=dur)
            hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

            # remember per-clip statistics
            perfile_freq.append(avg)
            perfile_hist.append(hist["count"])

            # remember the histogram index (bin edges, identical for all clips)
            if len(hist_index) == 0:
                hist_index = hist["index"]

        else:
            # this version just aggregates everything per collection;
            # recordings are not treated as separate entities
            all_data.extend(data)

    if perfilestats:
        # statistics over the per-clip means and histograms
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))
        # report the averaged histogram rather than the last clip's
        hist = {"count": hist_avg, "index": hist_index}

    else:
        # get frequency and duration columns
        freq = string2numpy(all_data, 2)
        dur = string2numpy(all_data, 1)

        # get basic statistics
        avg, std = numstats(freq, weights=dur)

        # get histogram weighted by duration
        hist = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)

    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count

# convert one column, specified by datapos, to a numpy float array
def string2numpy(data, datapos):

    edata = []
    for row in data:
        edata.append(row[datapos])

    colu = numpy.array(edata, dtype=float)
    return colu

# calculates the histogram
# nbins: number of bins
# lb: lower bound
# ub: upper bound
def histogram(colu, nbins=100, lb=-1, ub=-1, weights=None):

    # bounds defined? otherwise take them from the data
    if lb == -1 or ub == -1:
        lb = colu.min()
        ub = colu.max()

    # get histogram; count has nbins entries, index the nbins+1 bin edges
    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)

    # normalise for clip while count is still a numpy array
    count = (count / float(numpy.max(count))).tolist()
    index = index.tolist()

    # return histogram
    return {"count": count, "index": index}

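# Illustrative sketch of the histogram call as used elsewhere in this
# script: 100 bins over the 390-490 Hz window, weighted by note duration.
def _example_histogram():
    freq = numpy.array([439.0, 440.0, 441.0])
    dur = numpy.array([0.5, 2.0, 0.5])
    h = histogram(freq, nbins=100, lb=390, ub=490, weights=dur)
    # h["count"] holds 100 normalised counts (max is 1.0),
    # h["index"] the 101 bin edges from 390.0 to 490.0
    return h
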
# calculates unweighted statistics for the histograms
def histostats(counts):
    # unweighted mean over all clips
    avg = numpy.average(counts, axis=0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis=0)

    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)

# calculates weighted statistics for numerical input
def numstats(colu, weights=None):

    # weighted average over the first axis
    avg = numpy.average(colu, axis=0, weights=weights)

    # weighted standard deviation; numpy.std takes no weights,
    # hence the explicit formula
    std = numpy.sqrt(numpy.average((colu - avg)**2, axis=0, weights=weights))

    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)

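# Worked sketch with illustrative numbers: durations act as weights, so
#   mean = (1*439 + 3*441) / (1+3)                         = 440.5
#   std  = sqrt((1*(439-440.5)**2 + 3*(441-440.5)**2) / 4) ~= 0.866
def _example_numstats():
    return numstats(numpy.array([439.0, 441.0]),
                    weights=numpy.array([1.0, 3.0]))
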
# only returns data rows which refer to the note A;
# the frequencies are folded up / down to A4
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume format time, duration, pitch, integer_pitch, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2 * row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5 * row[2]])

    return Adata

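# Sketch of the octave folding on hypothetical rows: an A3 at 220.0 Hz is
# doubled to 440.0 Hz, an A5 at 880.0 Hz is halved, A4 passes through,
# and non-A rows are dropped.
def _example_filter_norm_A():
    rows = [[0.0, 0.5, 220.0, 57, "A3"],
            [0.5, 0.5, 440.0, 69, "A4"],
            [1.0, 0.5, 880.0, 81, "A5"],
            [1.5, 0.5, 261.6, 60, "C4"]]
    return filter_norm_A(rows)  # -> three [time, duration, 440.0] rows
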
# Read named features into a table of format
# time, feature[0], feature[1], ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
        #data = get_array_from_csv(input_f_file)
        #data = get_dict_from_csv(input_f_file, columtype = ['time','duration','pitch','velocity','label'])
    return data

# Read named features into a table of format
# time, feature[0], feature[1], ...
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features; the event:, tl:, af: and rdfs: prefixes
    # are expected to be bound in the parsed graph's namespaces
    q = """prefix dml: <http://dml.org/dml/cla#>
        SELECT ?event ?tl_time ?tl_duration ?feature ?label
        WHERE {
            ?event event:time ?event_time .
            ?event_time tl:beginsAt ?tl_time .
            ?event_time tl:duration ?tl_duration .
            ?event rdfs:label ?label .
            ?event af:feature ?feature .
        }"""

    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time from a literal of the form "PT<seconds>S"
        tl_time_str_len = len(row.tl_time)
        tl_time = float(row.tl_time[2:tl_time_str_len - 1])

        # parse duration the same way
        tl_dur_str_len = len(row.tl_duration)
        tl_duration = float(row.tl_duration[2:tl_dur_str_len - 1])

        # parse feature: a space-separated list of numbers, plus the label
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])

    # we assume format time, duration, pitch, velocity, label
    return data

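# Hedged check of the literal slicing above, assuming timeline values of
# the shape "PT<seconds>S" (e.g. "PT1.5S"):
def _example_time_literal():
    lit = "PT1.5S"                     # hypothetical tl:beginsAt literal
    return float(lit[2:len(lit) - 1])  # strips "PT" and "S" -> 1.5
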
# the same conversion for csv, so the same script works with csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the number of columns (taken from the first row)
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1

    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")

    return output
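
# End-to-end sketch (hypothetical file URI): parse one feature file,
# derive the statistics, and attach them to a fresh RDF graph.
def _example_pipeline():
    from rdflib import URIRef
    inputs = ["file:///data/features/clip1.n3"]  # hypothetical
    stats, n = find_cla_tf_statistics(inputs)
    transform = URIRef("http://example.org/transform/tf-stats")  # hypothetical
    return add_tf_statistics_to_graph(stats, Graph(), transform, n,
                                      [URIRef(f) for f in inputs])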