view annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py @ 5:b523456082ca tip
Update path to dataset and reflect modified chunk naming convention.
author   | peterf
date     | Mon, 01 Feb 2016 21:35:27 +0000
parents  | f079d2de4aa2
children |
line source
#!/usr/bin/python
#
# evaluate_annotations_random_excerpts.py:
# Analyse preliminary set of annotations; subsequently obtain balanced sample
# of chunks for annotator `warm-up' phase
#
# Script used in preliminary evaluations
#
# Author: Peter Foster
# (c) 2014 Peter Foster
#

from pandas import Series, DataFrame
import pandas.io.parsers
import re
import numpy as np
from collections import defaultdict

AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'

Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
Annotations.columns = ['audiofile', 'chunk', 'annotation']

#Check integrity -- only specific characters allowed
permittedCharacters = 'cmfvns'
assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True))

#Get unique, sorted strings
Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(set(s)))

#Set random seed for bootstrap sampling
np.random.seed(4756)

def bootstrap_statistic(Vector, Statistic, nSamples):
    #Compute statistic across bootstrap samples
    S = [Statistic(Vector, np.random.choice(len(Vector), len(Vector), replace=True)) for i in range(nSamples)]
    return S

#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
Stats = {}
Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
Stats['annotation_strings_bootstrap'] = {}
for s in Annotations['annotation'].unique():
    Statistic = lambda V,I: sum(V[I] == s) / float(len(I))
    #Get sample statistic
    #print('Bootstrapping sample statistic for annotation string ' + s)
    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations)))
    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)

#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}
for c in permittedCharacters:
    Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I))
    #print('Bootstrapping sample statistic for annotation character ' + c)
    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations)))
    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)

print('Sampling distribution of annotation strings')
print(Stats['annotation_strings'])
print('Sampling distribution of annotation characters')
print(Stats['annotation_characters'])

Stats['annotation_strings'].index.name = 'annotation_string'
Stats['annotation_characters'].index.name = 'annotation_character'
Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')

#Write balanced sample of chunks to file, based on annotation strings
#Duration of each chunk in seconds
chunkDuration = 4
sampleRate = 48000
nRandomSamples = 5
AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart'])
for s in Annotations['annotation'].unique():
    R = Annotations[Annotations['annotation'] == s]
    I = np.random.choice(len(R), nRandomSamples)
    R = R.iloc[I]
    R['framestart'] = R['chunk'] * sampleRate * chunkDuration
    R['chunk'] = R['chunk'].apply(str)
    R['framestart'] = R['framestart'].apply(str)
    #R.drop('annotation')
    AnnotationSample = AnnotationSample.append(R, ignore_index=True)
AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
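Note on newer pandas releases (not part of the committed script): the listing above targets Python 2 and a pandas release predating 0.17, and a few calls it relies on have since been deprecated or removed from the library. A minimal sketch of the replacements, reusing the variable names from the listing and assuming Python 3 with pandas >= 2.0:

import pandas as pd

# pandas.read_csv is the public entry point for CSV parsing
Annotations = pd.read_csv(AnnotationsFile, header=None)

# DataFrame.sort() was removed in pandas 0.20; sort_values() is the replacement
Stats['annotation_strings'] = Stats['annotation_strings'].sort_values(
    by=['proportion', 'standarderror'], ascending=[False, False])

# DataFrame.append() was removed in pandas 2.0; concatenate the sampled rows instead
AnnotationSample = pd.concat([AnnotationSample, R], ignore_index=True)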