# -*- coding: utf-8 -*-
# Part of DML (Digital Music Laboratory)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

__author__="Daniel Wolff, hargreaves"

# this script derives standard statistics for tuning frequency, 
# in particular:
# average
# standard deviation

from rdflib import Graph, Namespace, BNode, RDF, Literal
import warnings
import numpy
import csv
from n3Parser import get_rdf_graph_from_n3, uri2path

# compute statistics per clip (True) or over the whole collection (False)?
perfilestats = True

# dml namespace
dml_ns = Namespace("http://dml.org/dml/cla#")

# Add triples representing a tuning frequency statistics result to
# an RDF graph
def add_tf_statistics_to_graph(statistics, output_rdf_graph, transform, sample_count, input_f_files):
    
    # add base
    output_bnode = BNode()
    output_rdf_graph.add((transform, dml_ns.output, output_bnode))
    for input_f_file in input_f_files:
        output_rdf_graph.add((transform, dml_ns.input, input_f_file))
    output_rdf_graph.add((output_bnode, RDF.type, dml_ns.TuningFrequencyStatistics))
    output_rdf_graph.add((output_bnode, dml_ns.sample_count, Literal(sample_count)))

    # add mean and std
    output_rdf_graph.add((output_bnode, dml_ns.mean, Literal(statistics["mean"])))
    output_rdf_graph.add((output_bnode, dml_ns.std_dev, Literal(statistics["std-dev"])))
    
    # add histogram
    for i in range(len(statistics["histogram"]["count"])):

        bin_bnode = BNode()
        output_rdf_graph.add((output_bnode, dml_ns.bin, bin_bnode))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_number, Literal(i+1)))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_value, Literal(statistics["histogram"]["count"][i])))
        output_rdf_graph.add((bin_bnode, dml_ns.bin_name, Literal(statistics["histogram"]["index"][i])))

    return output_rdf_graph
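
# A minimal usage sketch for the function above (names are illustrative,
# not part of the pipeline):
#
#   g = Graph()
#   transform = BNode()
#   stats, n = find_cla_tf_statistics(input_files)
#   g = add_tf_statistics_to_graph(stats, g, transform, n, input_files)
#   print(g.serialize(format="turtle"))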

# Parse the input_f_files feature files and compute
# tuning frequency statistics
def find_cla_tf_statistics(input_f_files):

    sample_count = len(input_f_files)
    
    all_data = []
    perfile_freq = []
    perfile_hist = []
    hist_index = []
    for input_f_file in input_f_files:

        # get all data from feature file
        data = file_to_table(input_f_file)

        # filter those rows which refer to the note A;
        # returns time, duration, frequency
        data = filter_norm_A(data)
        
        if perfilestats:
            # get frequency and duration columns
            freq = string2numpy(data,2)
            dur = string2numpy(data,1)
            # get mean values per clip now,
            # then statistics over clips later
            avg, std = numstats(freq, weights = dur)
            hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur)
            
            # remember statistics
            perfile_freq.append(avg) 
            perfile_hist.append(hist["count"])
            
            # remember histogram index
            if len(hist_index) == 0:
                hist_index = hist["index"]
            
        else:
            # this version just pools everything across the collection;
            # recordings are not treated as separate entities
            all_data.extend(data)
            

    if perfilestats:
        # statistics over the per-clip mean frequencies
        avg, std = histostats(numpy.array(perfile_freq, dtype=float))
        # bin-wise average histogram over clips (hist_std is currently unused)
        hist_avg, hist_std = histostats(numpy.array(perfile_hist, dtype=float))
        hist = {"count": hist_avg, "index": hist_index}
    
    else:
        # get frequency and duration columns
        freq = string2numpy(all_data,2)
        dur = string2numpy(all_data,1)

        # get basic statistics
        avg, std = numstats(freq, weights = dur)

        # get histogram weighted by duration
        hist = histogram(freq, nbins = 100, lb=390, ub=490, weights = dur)
        
    return {"mean": avg, "std-dev": std, "histogram": hist}, sample_count#(key_hist, num_f_files)

# convert one column, specified by datapos, to a numpy array
def string2numpy(data, datapos):
    return numpy.array([row[datapos] for row in data], dtype=float)

# calculates the normalised histogram
# nbins: number of bins
# lb: lower bound (defaults to the data minimum)
# ub: upper bound (defaults to the data maximum)
def histogram(colu, nbins = 100, lb = None, ub = None, weights = None):

    # bounds defined? otherwise use the data range
    if lb is None or ub is None:
        lb = colu.min()
        ub = colu.max()

    # get histogram
    count, index = numpy.histogram(colu, bins = nbins, range = [lb, ub], weights = weights)

    # normalise for clip before converting to lists
    # (assumes at least one non-zero count)
    count = (count / float(numpy.max(count))).tolist()
    index = index.tolist()

    # return histogram; index holds the nbins + 1 bin edges
    return {"count": count, "index": index}


# calculates unweighted statistics over per-clip values
def histostats(counts):
    # bin-wise (or scalar) mean over clips
    avg = numpy.average(counts, axis = 0).tolist()

    # unweighted standard deviation
    std = numpy.std(counts, axis = 0).tolist()

    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)
    
# calculates weighted statistics for numerical input
def numstats(colu, weights = None):

    # weighted average over the first axis
    avg = numpy.average(colu, axis = 0, weights = weights)

    # weighted standard deviation
    std = numpy.sqrt(numpy.average((colu - avg)**2, axis = 0, weights = weights))

    # numpy.median does not support weights;
    # could use https://pypi.python.org/pypi/wquantiles for a weighted median

    return (avg, std)
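
# Worked example (illustrative values):
#   numstats(numpy.array([440.0, 442.0]), weights=numpy.array([3.0, 1.0]))
# gives
#   avg = (3*440 + 1*442) / 4             = 440.5
#   std = sqrt((3*0.5**2 + 1*1.5**2) / 4) = sqrt(0.75) ~ 0.866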


# only returns data rows which refer to the note A;
# the frequencies are folded up / down to A4
# returns time, duration, frequency
def filter_norm_A(data):
    Adata = []
    for row in data:
        # we assume format time, duration, pitch, velocity, label
        if 'A3' in row[4]:
            Adata.append(row[:2] + [2*row[2]])
        elif 'A4' in row[4]:
            Adata.append(row[:3])
        elif 'A5' in row[4]:
            Adata.append(row[:2] + [0.5*row[2]])
            
    return Adata
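
# For example, a row [12.0, 0.5, 221.0, 64, 'A3'] is folded up an octave
# to [12.0, 0.5, 442.0]; rows whose label mentions no A3/A4/A5 are dropped.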
    
    
# Read named features into table of format
# time, feature[0], feature[1] ...
def file_to_table(input_f_file):
    if input_f_file.endswith('.n3'):
        data = n3_to_table(input_f_file)
    elif input_f_file.endswith('.csv'):
        data = csv_to_table(input_f_file)
    else:
        raise ValueError("unsupported feature file format: " + input_f_file)
    return data


# Read named features into table of format
# time, feature[0], feature[1] ...
def n3_to_table(input_f_file):

    # read feature file
    feature_graph = get_rdf_graph_from_n3(input_f_file)

    # we construct a generic search string that gets all
    # necessary features; the prefixes follow the c4dm event and
    # timeline ontologies and the audio features ontology
    # (rdflib also inherits prefix bindings from the parsed graph)

    q = """prefix dml:   <http://dml.org/dml/cla#>
            prefix event: <http://purl.org/NET/c4dm/event.owl#>
            prefix tl:    <http://purl.org/NET/c4dm/timeline.owl#>
            prefix af:    <http://purl.org/ontology/af/>
            prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
            SELECT ?event ?tl_time ?tl_duration ?feature ?label
            WHERE {
                ?event event:time ?event_time .
                ?event_time tl:beginsAt ?tl_time .
                ?event_time tl:duration ?tl_duration .
                ?event rdfs:label ?label .
                ?event af:feature ?feature .
            }"""
    
    # query parsed file
    qres = feature_graph.query(q)
    data = []
    for row in qres:
        # parse time and duration: strip the "PT...S" wrapper
        # of the xsd duration literals
        tl_time = float(row.tl_time[2:len(row.tl_time) - 1])
        tl_duration = float(row.tl_duration[2:len(row.tl_duration) - 1])

        # parse feature values and append the event label
        data.append([tl_time, tl_duration] + [float(i) for i in row.feature.split(' ')] + [row.label])
        
    # we assume format time, duration, pitch, velocity, label
    return data

# csv version of the same conversion; allows the same script to be used
# with csv input
def csv_to_table(input_f_file):

    output = []
    badcount = 0

    # keep track of the expected number of columns
    ncols = 0
    with open(uri2path(input_f_file), 'rb') as csvfile:
        contents = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in contents:
            if ncols == 0:
                ncols = len(row)

            if len(row) >= ncols:
                # we assume format time, duration, pitch, velocity, label
                output.append([float(row[0]), float(row[1]), float(row[2])] + row[3:])
            else:
                badcount += 1
                
    if badcount > 0:
        warnings.warn("Incomplete csv file, ignoring " + str(badcount) + " entries")
            
    return output
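

# Minimal command-line sketch (an assumption, not part of the original
# pipeline): pass feature file paths as arguments and print the overall
# tuning statistics.
if __name__ == "__main__":
    import sys

    stats, n = find_cla_tf_statistics(sys.argv[1:])
    print("%d file(s): mean %s Hz, std-dev %s" % (n, stats["mean"], stats["std-dev"]))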