chime-home-dataset-annotation-and-baseline-evaluation-code: annotation_scripts/warmup_phase/evaluate_annotations_random

annotate annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py @ 5:b523456082ca tip

Update path to dataset and reflect modified chunk naming convention.

author	peterf
date	Mon, 01 Feb 2016 21:35:27 +0000
parents	f079d2de4aa2
children

rev	line source
peterf@0	1 #!/usr/bin/python
peterf@0	2 #
peterf@1	3 # evaluate_annotations_random_excerpts.py:
peterf@1	4 # Analyse preliminary set of annotations; subsequently obtain balanced sample
peterf@1	5 # of chunks for annotator 'warm-up' phase
peterf@1	6 #
peterf@1	7 # Script used in preliminary evaluations
peterf@0	8 #
peterf@0	9 # Author: Peter Foster
peterf@0	10 # (c) 2014 Peter Foster
peterf@0	11 #
peterf@1	12
peterf@0	13 from pandas import Series, DataFrame
peterf@0	14 import pandas.io.parsers
peterf@0	15 import re
peterf@0	16 import numpy as np
peterf@0	17 from collections import defaultdict
peterf@0	18
# Input CSV of exploratory annotations and the directory that receives all
# derived CSV files.
AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'

# The file has no header row; name its three columns explicitly.
Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
Annotations.columns = ['audiofile', 'chunk', 'annotation']

# Check integrity -- only specific characters allowed.
# Raise explicitly rather than assert, so the check still runs under `-O`.
permittedCharacters = 'cmfvns'
_permittedSet = set(permittedCharacters)
_invalidAnnotations = [label for label in Annotations['annotation']
                       if set(label) - _permittedSet]
if _invalidAnnotations:
    raise ValueError(
        'Annotations contain characters outside %r: %r'
        % (permittedCharacters, sorted(set(_invalidAnnotations))))

# Get unique, sorted strings.
# set() alone has no defined iteration order, so the characters must be
# sorted explicitly; otherwise the same set of labels could be spelled in
# several ways and be counted as distinct annotation strings.
Annotations['annotation'] = Annotations['annotation'].apply(
    lambda label: ''.join(sorted(set(label))))

# Set random seed for bootstrap sampling
np.random.seed(4756)
peterf@0	32
def bootstrap_statistic(Vector, Statistic, nSamples):
    """Evaluate a statistic on repeated bootstrap resamples of a vector.

    For each of the nSamples replicates, len(Vector) indices are drawn
    with replacement from range(len(Vector)) using numpy's global random
    state, and Statistic(Vector, indices) is evaluated.

    Vector    -- the data; only its length is used here, it is otherwise
                 passed through to Statistic untouched
    Statistic -- callable taking (Vector, indices) and returning a value
    nSamples  -- number of bootstrap replicates to compute

    Returns a list holding the nSamples values returned by Statistic.
    """
    size = len(Vector)
    replicates = []
    for _ in range(nSamples):
        resampledIndices = np.random.choice(size, size, replace=True)
        replicates.append(Statistic(Vector, resampledIndices))
    return replicates
peterf@0	37
peterf@0	38 #Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
# Get sampling distribution of annotation strings, in addition to
# bootstrapped standard errors.
Stats = {}
Stats['annotation_strings'] = {'proportion': {}, 'standarderror': {}}
Stats['annotation_strings_bootstrap'] = {}
for s in Annotations['annotation'].unique():
    # Boolean indicator "this row carries annotation string s", computed
    # once; every bootstrap replicate then only indexes it and averages.
    isMatch = (Annotations['annotation'] == s).values
    # Proportion of True entries among the resampled positions I.
    Statistic = lambda V, I: V[I].mean()
    # Sample statistic over the full, un-resampled data.
    Stats['annotation_strings']['proportion'][s] = isMatch.mean()
    # Standard error: sample standard deviation (ddof=1) of the statistic
    # across 10000 bootstrap replicates.
    Stats['annotation_strings']['standarderror'][s] = np.std(
        bootstrap_statistic(isMatch, Statistic, 10000), ddof=1)
Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement. Most frequent strings first, ties broken by standard error.
Stats['annotation_strings'].sort_values(
    ['proportion', 'standarderror'], ascending=[False, False], inplace=True)
peterf@0	50
peterf@0	51 #Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
# Get sampling distribution of annotation characters, in addition to
# bootstrapped standard errors.
Stats['annotation_characters'] = {'proportion': {}, 'standarderror': {}}
for c in permittedCharacters:
    # Boolean indicator "this row's annotation contains character c".
    # Built once per character so the membership test is not repeated for
    # every one of the bootstrap replicates.
    hasCharacter = np.array(
        [c in label for label in Annotations['annotation']], dtype=bool)
    # Proportion of True entries among the resampled positions I.
    Statistic = lambda V, I: V[I].mean()
    # Sample statistic over the full, un-resampled data.
    Stats['annotation_characters']['proportion'][c] = hasCharacter.mean()
    # Standard error: sample standard deviation (ddof=1) of the statistic
    # across 10000 bootstrap replicates.
    Stats['annotation_characters']['standarderror'][c] = np.std(
        bootstrap_statistic(hasCharacter, Statistic, 10000), ddof=1)
Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement. Most frequent characters first, ties broken by standard error.
Stats['annotation_characters'].sort_values(
    ['proportion', 'standarderror'], ascending=[False, False], inplace=True)
peterf@0	60
# Show both tables on stdout first. The index names are only assigned
# afterwards, so they do not appear in the printed tables.
for tableTitle, tableKey in (
        ('Sampling distribution of annotation strings', 'annotation_strings'),
        ('Sampling distribution of annotation characters', 'annotation_characters')):
    print(tableTitle)
    print(Stats[tableKey])

# Label the index of each table; the name becomes the header of the first
# column in the CSV files written below.
for tableKey, indexLabel in (
        ('annotation_strings', 'annotation_string'),
        ('annotation_characters', 'annotation_character')):
    Stats[tableKey].index.name = indexLabel

# Write one histogram CSV per table into OutputDir.
for tableKey, csvName in (
        ('annotation_strings', 'exploratory_labelling_annotation_strings_hist.csv'),
        ('annotation_characters', 'exploratory_labelling_annotation_characters_hist.csv')):
    Stats[tableKey].to_csv(OutputDir + csvName)
peterf@0	70
peterf@0	71 #Write balanced sample of chunks to file, based on annotation strings
peterf@0	72 #Duration of each chunk in seconds
# Write balanced sample of chunks to file, based on annotation strings:
# the same number of rows is drawn for every distinct annotation string.
# Duration of each chunk in seconds
chunkDuration = 4
sampleRate = 48000
nRandomSamples = 5
_sampleColumns = ['audiofile', 'chunk', 'framestart']
_perStringSamples = []
for s in Annotations['annotation'].unique():
    R = Annotations[Annotations['annotation'] == s]
    # NOTE: np.random.choice draws with replacement by default, so the
    # same chunk can be selected more than once for a given string.
    I = np.random.choice(len(R), nRandomSamples)
    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of Annotations.
    R = R.iloc[I].copy()
    # Frame offset of the chunk start; must be computed while 'chunk' is
    # still numeric, i.e. before it is converted to str below.
    R['framestart'] = (R['chunk'] * sampleRate * chunkDuration).apply(str)
    R['chunk'] = R['chunk'].apply(str)
    _perStringSamples.append(R)

# DataFrame.append() was removed in pandas 2.0; collect the per-string
# frames and concatenate them once instead.
if _perStringSamples:
    AnnotationSample = pandas.concat(_perStringSamples, ignore_index=True)
    # Fix the column order explicitly; 'annotation' is kept in the output.
    AnnotationSample = AnnotationSample[_sampleColumns + ['annotation']]
else:
    AnnotationSample = DataFrame(columns=_sampleColumns)
AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')

Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code

annotate annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py @ 5:b523456082ca tip