#!/usr/bin/python
#
# evaluate_annotations_random_excerpts.py:
#    Analyse preliminary set of annotations; subsequently obtain balanced sample
#    of chunks for annotator 'warm-up' phase
#
#    Script used in preliminary evaluations
#
# Author: Peter Foster
# (c) 2014 Peter Foster
#

from pandas import Series, DataFrame
import pandas.io.parsers
import re
import numpy as np
from collections import defaultdict

AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'

#Only these characters may occur in an annotation string
permittedCharacters = 'cmfvns'

#Duration of each chunk in seconds
chunkDuration = 4
#Frames per second (presumably the audio sample rate in Hz -- confirm against the dataset)
sampleRate = 48000
#Number of chunks drawn per distinct annotation string
nRandomSamples = 5
#Number of bootstrap resamples behind each standard error
nBootstrapSamples = 10000


def canonical_annotation(s):
    """Return the distinct characters of annotation string s, in sorted order.

    Sorting matters: joining a bare set yields an arbitrary character order,
    so the same combination of labels could otherwise show up under several
    spellings and be counted as different annotation strings.
    """
    return ''.join(sorted(set(s)))


def check_annotations(AnnotationStrings, Permitted=permittedCharacters):
    """Raise ValueError if any string contains a character outside Permitted.

    An explicit exception is used instead of assert so that the check is not
    stripped when Python runs with -O.
    """
    Forbidden = re.compile('[^' + Permitted + ']')
    Invalid = [s for s in AnnotationStrings if Forbidden.search(s) is not None]
    if Invalid:
        raise ValueError('Annotations contain characters outside %r: %r'
                         % (Permitted, Invalid[:10]))


def bootstrap_statistic(Vector, Statistic, nSamples):
    """Compute Statistic across nSamples bootstrap samples of Vector.

    Each resample is an array of len(Vector) positions drawn with replacement
    from numpy's global random state; Statistic is called as
    Statistic(Vector, positions). Returns the list of nSamples results.
    """
    N = len(Vector)
    return [Statistic(Vector, np.random.choice(N, N, replace=True))
            for _ in range(nSamples)]


def string_proportion(s):
    """Return a statistic giving the fraction of selected entries equal to s."""
    def Statistic(V, I):
        return (V.iloc[I] == s).sum() / float(len(I))
    return Statistic


def character_proportion(c):
    """Return a statistic giving the fraction of selected entries containing c."""
    def Statistic(V, I):
        return V.iloc[I].apply(lambda a: c in a).sum() / float(len(I))
    return Statistic


def proportion_table(Vector, Keys, MakeStatistic, nSamples):
    """Tabulate sample proportion and bootstrapped standard error per key.

    For every key in Keys, MakeStatistic(key) supplies the statistic. The
    proportion is that statistic over all of Vector; the standard error is
    the standard deviation (ddof=1) of the statistic over nSamples bootstrap
    resamples. Returns a DataFrame indexed by key, with columns 'proportion'
    and 'standarderror', sorted by both in descending order.
    """
    Table = {'proportion': {}, 'standarderror': {}}
    Everything = np.arange(len(Vector))
    for k in Keys:
        Statistic = MakeStatistic(k)
        #Sample statistic first (consumes no random numbers), then the bootstrap
        Table['proportion'][k] = Statistic(Vector, Everything)
        Table['standarderror'][k] = np.std(
            bootstrap_statistic(Vector, Statistic, nSamples), ddof=1)
    #sort_values replaces DataFrame.sort, which no longer exists in current pandas
    return DataFrame(Table).sort_values(['proportion', 'standarderror'],
                                        ascending=[False, False])


def balanced_sample(Annotations, nSamples, SampleRate, ChunkDuration):
    """Draw nSamples chunks for every distinct annotation string.

    Rows are drawn with replacement (numpy's default for random.choice), so a
    string with fewer than nSamples rows still contributes nSamples rows and
    the same chunk may appear more than once. 'framestart' is
    chunk * SampleRate * ChunkDuration; 'chunk' and 'framestart' are stored
    as strings. Columns are ordered audiofile, chunk, framestart, annotation.
    """
    Columns = ['audiofile', 'chunk', 'framestart']
    Pieces = []
    for s in Annotations['annotation'].unique():
        R = Annotations[Annotations['annotation'] == s]
        I = np.random.choice(len(R), nSamples)
        #Work on an explicit copy so the column assignments never touch Annotations
        R = R.iloc[I].copy()
        R['framestart'] = R['chunk'] * SampleRate * ChunkDuration
        R['chunk'] = R['chunk'].apply(str)
        R['framestart'] = R['framestart'].apply(str)
        Pieces.append(R)
    if not Pieces:
        #No annotations at all: keep the empty, header-only table
        return DataFrame(columns=Columns)
    #A single concat replaces repeated DataFrame.append (removed in pandas 2.0)
    Sample = pandas.concat(Pieces, ignore_index=True)
    return Sample[Columns + ['annotation']]


def main():
    """Run the analysis: read annotations, write both histograms and the chunk sample.

    Returns the tuple (Annotations, Stats, AnnotationSample).
    """
    Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
    Annotations.columns = ['audiofile', 'chunk', 'annotation']

    #Check integrity -- only specific characters allowed
    check_annotations(Annotations['annotation'])

    #Get unique, sorted strings
    Annotations['annotation'] = Annotations['annotation'].apply(canonical_annotation)

    #Set random seed for bootstrap sampling
    np.random.seed(4756)

    Stats = {}
    #Kept for compatibility; nothing in this script populates it
    Stats['annotation_strings_bootstrap'] = {}

    #Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
    Stats['annotation_strings'] = proportion_table(
        Annotations['annotation'], Annotations['annotation'].unique(),
        string_proportion, nBootstrapSamples)

    #Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
    Stats['annotation_characters'] = proportion_table(
        Annotations['annotation'], permittedCharacters,
        character_proportion, nBootstrapSamples)

    print('Sampling distribution of annotation strings')
    print(Stats['annotation_strings'])
    print('Sampling distribution of annotation characters')
    print(Stats['annotation_characters'])

    Stats['annotation_strings'].index.name = 'annotation_string'
    Stats['annotation_characters'].index.name = 'annotation_character'
    Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
    Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')

    #Write balanced sample of chunks to file, based on annotation strings
    AnnotationSample = balanced_sample(Annotations, nRandomSamples, sampleRate, chunkDuration)
    AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')

    return Annotations, Stats, AnnotationSample


if __name__ == '__main__':
    #Rebind the results at module level so interactive runs can inspect them
    Annotations, Stats, AnnotationSample = main()