diff pyspark/transforms/tuningFrequencyStatistics.py @ 0:e34cf1b6fe09 tip

author Daniel Wolff
date Sat, 20 Feb 2016 18:14:24 +0100
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyspark/transforms/tuningFrequencyStatistics.py	Sat Feb 20 18:14:24 2016 +0100
@@ -0,0 +1,280 @@
+# -*- coding: utf-8 -*-
+# Part of DML (Digital Music Laboratory)
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+__author__ = "Daniel Wolff, hargreaves"
+
+# This script derives standard statistics for tuning frequency,
+# in particular:
+# - mean
+# - standard deviation
+# - a histogram of the observed frequencies
+
+from rdflib import Graph, Namespace, BNode, RDF, Literal
+import codecs
+import warnings
+import numpy
+import csv
+from n3Parser import get_rdf_graph_from_n3, uri2path
+# from csvParser import get_dict_from_csv, get_array_from_csv
+
+# compute statistics per clip (1) rather than over the pooled collection (0)
+perfilestats = 1
+
+# dml namespace
+dml_ns = Namespace("http://dml.org/dml/cla#")
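+
+# Typical usage (sketch, assuming a list of .n3/.csv feature file URIs
+# and an RDF node identifying the transform):
+#   statistics, n = find_cla_tf_statistics(input_f_files)
+#   g = add_tf_statistics_to_graph(statistics, Graph(), transform, n, input_f_files)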
+
+# Add triples representing a tuning frequency statistics result to
+# an RDF graph
+def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):
+    
+    # add base
+    output_bnode = BNode()
+    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
+    for input_f_file in input_f_files:
+        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
+    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
+    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))
+
+    # add mean and std
+    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
+    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))
+    
+    # add histogram; "index" holds the bin edges returned by
+    # numpy.histogram, so entry i is the lower edge of bin i
+    for i in range(len(statistics["histogram"]["count"])):
+
+        bin_bnode = BNode()
+        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
+        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i+1)))
+        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
+        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))
+
+    return output_rdf_graph
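+
+# The graph then contains triples along these lines (sketch with
+# illustrative values):
+#   <transform> dml:input <file.n3> ; dml:output _:out .
+#   _:out a dml:TuningFrequencyStatistics ;
+#       dml:sample_count 42 ; dml:mean 441.2 ; dml:std_dev 3.4 ;
+#       dml:bin [ dml:bin_number 1 ; dml:bin_value 0.7 ; dml:bin_name 390.0 ] .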
+
+# Parse the input_f_files feature files (n3 or csv) and derive
+# tuning frequency statistics
+def find_cla_tf_statistics(input_f_files):
+
+    sample_count = len(input_f_files)
+    
+    all_data = []
+    perfile_freq = []
+    perfile_hist = []
+    hist_index = []
+    for input_f_file in input_f_files:
+
+        # get all data from feature file
+        data = file_to_table(input_f_file)
+
+        # keep only those rows which refer to the note A
+        # returns time, duration, frequency
+        data = filter_norm_A(data)
+        
+        if perfilestats:
+            # get frequency and duration columns
+            freq = string2numpy(data,2)
+            dur = string2numpy(data,1)
+            # get mean values per clip now,
+            # then statistics over clips later
+            avg, std = numstats(freq, weights = dur)
+            hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur)
+            
+            # remember statistics
+            perfile_freq.append(avg) 
+            perfile_hist.append(hist["count"])
+            
+            # remember histogram index
+            if len(hist_index) == 0:
+                hist_index = hist["index"]
+            
+        else:
+            # this version just pools everything per collection;
+            # recordings are not treated as separate entities
+            all_data.extend(data)
+            
+
+    if perfilestats:
+        # statistics over the per-clip means and histograms
+        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
+        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))
+        # the returned histogram is the average of the per-clip
+        # histograms over the shared bin index
+        hist = {"count": hist_avg, "index": hist_index}
+
+    else:
+        # get frequency and duration columns
+        freq = string2numpy(all_data,2)
+        dur = string2numpy(all_data,1)
+
+        # get basic statistics
+        avg, std = numstats(freq, weights = dur)
+
+        # get histogram weighted by duration
+        hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur)
+        
+    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count#(key_hist, num_f_files)
+
+# convert one column, specified by datapos, to a numpy float array
+def string2numpy(data, datapos):
+
+    edata = []
+    for row in data:
+        edata.append(row[datapos])
+
+    colu = numpy.array(edata, dtype=float)
+    return colu
+
+# calculates the histogram
+# nbins: number of bins
+# lb: lower bound
+# ub: upper bound
+def histogram(colu, nbins=100, lb=None, ub=None, weights=None):
+
+    # bounds defined? otherwise use the range of the data
+    if lb is None or ub is None:
+        lb = colu.min()
+        ub = colu.max()
+
+    # get histogram; index holds the nbins+1 bin edges
+    count, index = numpy.histogram(colu, bins=nbins, range=[lb, ub], weights=weights)
+
+    # normalise for clip (largest bin becomes 1.0); the float cast
+    # avoids integer division for unweighted counts
+    count = (count / float(numpy.max(count))).tolist()
+    index = index.tolist()
+
+    # return histogram
+    return {"count": count, "index": index}
+
+
+# calculates unweighted statistics over per-clip values
+# (rows = clips); returns the element-wise mean and standard deviation
+def histostats(counts):
+    avg = numpy.average(counts, axis=0).tolist()
+
+    # unweighted standard deviation
+    std = numpy.std(counts, axis=0).tolist()
+
+    # could use https://pypi.python.org/pypi/wquantiles for a weighted median
+
+    return (avg, std)
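+
+# e.g. histostats(numpy.array([[1., 0.], [0., 1.]]))
+# returns ([0.5, 0.5], [0.5, 0.5])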
+    
+# calculates weighted statistics for numerical input;
+# weights typically holds the note durations
+def numstats(colu, weights=None):
+
+    # weighted average over the first dimension
+    avg = numpy.average(colu, axis=0, weights=weights)
+
+    # weighted standard deviation: numpy.std takes no weights, so
+    # derive it from the weighted average of the squared deviations
+    std = numpy.sqrt(numpy.average((colu - avg)**2, axis=0, weights=weights))
+
+    # could use https://pypi.python.org/pypi/wquantiles for a weighted median
+
+    return (avg, std)
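+
+# Worked example: numstats(numpy.array([440.0, 442.0]),
+# weights=numpy.array([1.0, 3.0])) gives a weighted mean of
+# (440 + 3*442)/4 = 441.5 and a weighted std of
+# sqrt((2.25 + 3*0.25)/4) = sqrt(0.75) ~= 0.866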
+
+
+# only returns data rows which refer to the note A;
+# frequencies are folded up / down by one octave to A4
+# returns time, duration, frequency
+def filter_norm_A(data):
+    Adata = []
+    for row in data:
+        # we assume format time, duration, pitch, integer_pitch, label
+        if 'A3' in row[4]:
+            Adata.append(row[:2] + [2*row[2]])
+        elif 'A4' in row[4]:
+            Adata.append(row[:3])
+        elif 'A5' in row[4]:
+            Adata.append(row[:2] + [0.5*row[2]])
+            
+    return Adata
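+
+# e.g. the row [0.0, 0.5, 220.0, 57, 'A3'] is folded up an octave and
+# contributes [0.0, 0.5, 440.0] to the output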
+    
+    
+# Read named features into table of format
+# time, feature[0], feature[1], ...
+def file_to_table(input_f_file):
+    if input_f_file.endswith('.n3'):
+        data = n3_to_table(input_f_file)
+    elif input_f_file.endswith('.csv'):
+        data = csv_to_table(input_f_file)
+    else:
+        raise ValueError("unsupported feature file format: " + input_f_file)
+    return data
+
+
+# Read named features into table of format
+# time, feature[0], feature[1], ...
+def n3_to_table(input_f_file):
+
+    # read feature file
+    feature_graph = get_rdf_graph_from_n3(input_f_file)
+
+    # we construct a generic search string that gets all
+    # necessary features; the prefixes are those of the Event,
+    # Timeline and Audio Features ontologies used in the feature files
+
+    q = """prefix event: <http://purl.org/NET/c4dm/event.owl#>
+            prefix tl:    <http://purl.org/NET/c4dm/timeline.owl#>
+            prefix af:    <http://purl.org/ontology/af/>
+            prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
+            SELECT ?event ?tl_time ?tl_duration ?feature ?label
+            WHERE {
+                ?event event:time ?event_time .
+                ?event_time tl:beginsAt ?tl_time .
+                ?event_time tl:duration ?tl_duration .
+                ?event rdfs:label ?label .
+                ?event af:feature ?feature .
+            }"""
+    
+    # query parsed file
+    qres = feature_graph.query(q)
+    data = []
+    for row in qres:
+        # parse time: strip the leading "PT" and trailing "S" from the
+        # xsd duration literal, e.g. "PT1.5S" -> 1.5
+        tl_time = float(row.tl_time[2:len(row.tl_time)-1])
+
+        # parse duration the same way (kept as a string here,
+        # string2numpy converts it later)
+        tl_duration = row.tl_duration[2:len(row.tl_duration)-1]
+
+        # parse feature: space-separated numbers, followed by the label
+        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])
+
+    # we assume format time, duration, pitch, velocity, label
+    return data
+
+# the same conversion for csv, so the same script works on csv input
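+# Expected csv layout (assumption, mirroring the n3 table):
+#   0.0,0.25,440.0,69,"A4"
+#   0.25,0.5,442.1,69,"A4"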
+def csv_to_table(input_f_file):
+
+    output = []
+    badcount = 0
+
+    # keep track of the expected number of columns
+    ncols = 0
+    with open(uri2path(input_f_file), 'rb') as csvfile:
+        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
+        for row in contents:
+            if ncols == 0:
+                ncols = len(row)
+
+            if len(row) >= ncols:
+                # we assume format time, duration, pitch, velocity, label
+                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
+            else:
+                badcount += 1
+
+    if badcount > 0:
+        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")
+
+    return output
\ No newline at end of file