view dml-cla/python/tuning_stats_byyear.py @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
parents
children
line wrap: on
line source
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University
 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

# -*- coding: utf-8 -*-
__author__='wolffd'
__date__ ="$11-Mar-2015 12:47:23$"

# this script derives standard statistics for tuning frequency, 
# results are combined by year(range)
# average
# standard deviation

# test JSON:
#{ "module":"tuning_stats_byyear",
#      "function":"per_file",
#      "arguments": [[
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0001773XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "2015-12-14"},
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0002164XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "2015-12-14"},
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0001773XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "2015-12-14"},
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0002164XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "2015-12-14"},
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0001773XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "12.5.1993"},
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0001773XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "2015-12-14"},
#      {"transcription": { "tag": "csv", "value":"D:\\mirg\\Pitch_Analysis\\026A-1CL0001773XX-0100A0_vamp_silvet_silvet_notes.csv"}, "date": "1907"}]]
#}

from aggregate import *
import numpy
import re

# get code for single-collection tuning statistics
from tuning_stats import fold_pitch, numpy_column, transcription_from_csv, \
        transcription_from_n3,tuning_note,stats, weighted_stats

# width of the year bins (in years) used by per_file when grouping
# per-recording means into histogram bins
bin_width = 1

# dispatch table: maps a transcription's tag ('n3' or 'csv') to the
# matching parser imported from tuning_stats; used via decode_tagged
parser_table = { 'n3':transcription_from_n3, 
                 'csv':transcription_from_csv }


# NOTE(review): appears unused anywhere in this file — candidate for removal
datematch = ''
# gets statistics on a per-file basis and returns histograms 
# over the file averages
# inputs[i]['transcription']: tagged csv/n3 note transcription of fine-tuned
#     pitches extracted from one recording (see test JSON above)
# inputs[i]['date']: string containing a 4-digit year for each recording
def per_file(inputs):
    """Compute tuning-frequency statistics per recording, binned by year.

    inputs: iterable of dicts, each with
        'transcription': tagged value ({'tag': 'csv'|'n3', 'value': path})
                         pointing at a note transcription of one recording
        'date': string containing a 4-digit year for the recording

    Returns {'result': {'mean': [...], 'std-dev': [...], 'years': [...]},
             'stats': <for_each accumulation stats>} where the three result
    lists are aligned, one entry per year bin of width bin_width.
    """
    means = []
    years = []

    def accum(item):
        # skip recordings without a parseable 4-digit year
        y = parse_years(item['date'])
        if y > 1000:

            # duration and normalised frequency for all tuning pitches (A3,A4,A5)
            a_notes = [ (note[1],fold_pitch(note[2],note[3])) 
                        for note in decode_tagged(parser_table,item['transcription']) 
                        if tuning_note(note[3]) ]

            if len(a_notes)==0:
                # bug fix: items carry no 'filename' key (see the module's
                # test JSON), so item['filename'] raised KeyError here
                print_status("No notes for "+str(item.get('filename', item.get('transcription'))))
            else:
                # frequency and duration columns
                freq = numpy_column(a_notes,1)
                dur = numpy_column(a_notes,0)

                # mean value per clip now, statistics over clips later
                avg, std = weighted_stats(freq, weights = dur)

                # only append a year once we know the clip contributed data
                years.append(y)
                means.append(avg)
        else:
            # bug fix: same missing-'filename' KeyError as above
            print_status("No year found for "+str(item.get('filename', item.get('date'))))

    # run accum over every input, collecting per-file processing stats
    st=for_each(inputs,accum)

    # robustness: with no dated recordings, numpy.min/max below would raise
    if not years:
        return { 'result': { 'mean': [], 'std-dev': [], 'years': []},
                 'stats' : st }

    years = numpy.array(years)
    # off-by-one fix: range() excludes its stop value, so the original
    # range(min, max, bin_width) dropped the bin containing the latest
    # year (and produced no bins at all when every recording shares one
    # year). Adding bin_width to the stop includes that final bin.
    yearbins = list(range(int(numpy.min(years)), int(numpy.max(years)) + bin_width, bin_width))

    avg = []
    std = []

    # statistics over the clip means that fall into each year bin
    for year in yearbins:
        valid_idx = [i for (i, val) in enumerate(years) if val >= year and val < (year + bin_width)]
        valid_means = [means[i] for i in valid_idx]

        y_avg, y_std = stats(numpy.array(valid_means, dtype=float))
        avg.append(y_avg)
        std.append(y_std)

    return { 'result': { 'mean': avg, 'std-dev': std,'years': yearbins}, 
             'stats' : st }

# extracts the recording year from a free-form date string
# input: string that may contain a 4-digit year (1000-2999)
def parse_years(date):
    """Return the first 4-digit year found in *date*, or -1 if none.

    Matches years 1000-2999 anywhere in the string (e.g. '2015-12-14',
    '12.5.1993', '1907').
    """
    match = re.search(r'[12]\d{3}', date)
    if match is None:
        return -1
    return int(match.group())