Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code
changeset 1:f079d2de4aa2
Headers and LICENSE; VERSION
author | peterf |
---|---|
date | Tue, 07 Jul 2015 16:03:52 +0100 |
parents | 75c79305d794 |
children | cb535b80218a |
files | LICENSE VERSION analysis_of_annotations/evaluate_annotations.py annotation_scripts/annotationkit_create_annotation_protocol.py annotation_scripts/annotationkit_play_chunks.py annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py annotation_scripts/warmup_phase/play_random_excerpts.py |
diffstat | 8 files changed, 126 insertions(+), 92 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Tue Jul 07 16:03:52 2015 +0100 @@ -0,0 +1,4 @@ +This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. + +Copyright for annotation data (c) 2015 Queen Mary University of London. +Copyright for audio recordings (c) 2010 University of Sheffield.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VERSION Tue Jul 07 16:03:52 2015 +0100 @@ -0,0 +1,1 @@ +Version 0.9.0
--- a/analysis_of_annotations/evaluate_annotations.py Tue Jul 07 14:42:09 2015 +0100 +++ b/analysis_of_annotations/evaluate_annotations.py Tue Jul 07 16:03:52 2015 +0100 @@ -1,3 +1,13 @@ +#!/usr/bin/python +# +# evaluate_annotations.py: +# Compute descriptive statistics of annotations, including annotator +# agreement +# +# Author: Peter Foster +# (c) 2015 Peter Foster +# + from pandas import Series, DataFrame import glob import pandas.io.parsers
--- a/annotation_scripts/annotationkit_create_annotation_protocol.py Tue Jul 07 14:42:09 2015 +0100 +++ b/annotation_scripts/annotationkit_create_annotation_protocol.py Tue Jul 07 16:03:52 2015 +0100 @@ -1,12 +1,13 @@ #!/usr/bin/python # # annotationkit_create_annotation_protocol.py: -# Prepare annotation protocol from list of audio files -# Used for obtaining CHiME challenge dataset annotations -# Read files from standard input and write csv to standard output +# Prepare annotation protocol CSV from list of audio files +# +# Read list of files from standard input and write CSV to standard output +# See annotationkit_create_annotation_protocol_wrapper.sh for usage example # # Author: Peter Foster -# (c) 2014 Peter Foster +# (c) 2015 Peter Foster # import fileinput
--- a/annotation_scripts/annotationkit_play_chunks.py Tue Jul 07 14:42:09 2015 +0100 +++ b/annotation_scripts/annotationkit_play_chunks.py Tue Jul 07 16:03:52 2015 +0100 @@ -2,11 +2,13 @@ # # annotationkit_play_chunks.py: -# Play excerpts from a list of audio files as defined by a protocol file -# Used for obtaining CHiME challenge dataset annotations +# Play excerpts from a list of audio files as specified by protocol CSV file +# and obtain annotations using text interface +# +# See annotationkit_play_chunks_wrapper.sh for usage example # # Author: Peter Foster -# (c) 2014 Peter Foster +# (c) 2015 Peter Foster # from scikits.audiolab import Sndfile @@ -114,4 +116,4 @@ #Amend and write annotations to csv AnnotatedChunks['annotation'].ix[i] = annotationString AnnotatedChunks.to_csv(OutputFile) - \ No newline at end of file +
--- a/annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py Tue Jul 07 14:42:09 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -#!/usr/bin/python - -# -# evaluate_annotaions_random_excerpts.py:: -# Play random excerpts from a list of audio files -# -# Author: Peter Foster -# (c) 2014 Peter Foster -# -from pandas import Series, DataFrame -import pandas.io.parsers -import re -import numpy as np -from collections import defaultdict - -AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv' -#OutputDir = '/import/c4dm-scratch/peterf/audex/results/exploratory/' -OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/' -Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None) -Annotations.columns = ['audiofile', 'chunk', 'annotation'] -#Check integrity -- only specific characters allowed -permittedCharacters = 'cmfvns' -assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True)) - -#Get unique, sorted strings -Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(set(s))) - -#Set random seed for bootstrap sampling -np.random.seed(4756) - -def bootstrap_statistic(Vector, Statistic, nSamples): - #Compute statistic across bootstrap samples - S = [Statistic(Vector, np.random.choice(len(Vector), len(Vector), replace=True)) for i in range(nSamples)] - return S - -#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors -Stats = {} -Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}} -Stats['annotation_strings_bootstrap'] = {} -for s in Annotations['annotation'].unique(): - Statistic = lambda V,I: sum(V[I] == s) / float(len(I)) - #Get sample statistic - #print('Bootstrapping sample statistic for annotation string ' + s) - Stats['annotation_strings']['proportion'][s] = 
Statistic(Annotations['annotation'], range(len(Annotations))) - Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1) -Stats['annotation_strings'] = DataFrame(Stats['annotation_strings']) -Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True) - -#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors -Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}} -for c in permittedCharacters: - Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I)) - #print('Bootstrapping sample statistic for annotation character ' + c) - Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations))) - Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1) -Stats['annotation_characters'] = DataFrame(Stats['annotation_characters']) -Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True) - -print('Sampling distribution of annotation strings') -print(Stats['annotation_strings']) -print('Sampling distribution of annotation characters') -print(Stats['annotation_characters']) - -Stats['annotation_strings'].index.name = 'annotation_string' -Stats['annotation_characters'].index.name = 'annotation_character' -Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv') -Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv') - -#Write balanced sample of chunks to file, based on annotation strings -#Duration of each chunk in seconds -chunkDuration = 4 -sampleRate = 48000 -nRandomSamples = 5 -AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart']) -for s in Annotations['annotation'].unique(): - R = 
Annotations[Annotations['annotation'] == s] - I = np.random.choice(len(R), nRandomSamples) - R = R.iloc[I] - R['framestart'] = R['chunk'] * sampleRate * chunkDuration - R['chunk'] = R['chunk'].apply(str) - R['framestart'] = R['framestart'].apply(str) - #R.drop('annotation') - AnnotationSample = AnnotationSample.append(R, ignore_index=True) -AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py Tue Jul 07 16:03:52 2015 +0100 @@ -0,0 +1,86 @@ +#!/usr/bin/python +# +# evaluate_annotations_random_excerpts.py: +# Analyse preliminary set of annotations; subsequently obtain balanced sample +# of chunks for annotator `warm-up' phase +# +# Script used in preliminary evaluations +# +# Author: Peter Foster +# (c) 2014 Peter Foster +# + +from pandas import Series, DataFrame +import pandas.io.parsers +import re +import numpy as np +from collections import defaultdict + +AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv' +OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/' +Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None) +Annotations.columns = ['audiofile', 'chunk', 'annotation'] +#Check integrity -- only specific characters allowed +permittedCharacters = 'cmfvns' +assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True)) + +#Get unique, sorted strings +Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(set(s))) + +#Set random seed for bootstrap sampling +np.random.seed(4756) + +def bootstrap_statistic(Vector, Statistic, nSamples): + #Compute statistic across bootstrap samples + S = [Statistic(Vector, np.random.choice(len(Vector), len(Vector), replace=True)) for i in range(nSamples)] + return S + +#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors +Stats = {} +Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}} +Stats['annotation_strings_bootstrap'] = {} +for s in Annotations['annotation'].unique(): + Statistic = lambda V,I: sum(V[I] == s) / float(len(I)) + #Get sample statistic + #print('Bootstrapping sample statistic for annotation string ' + s) + 
Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations))) + Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1) +Stats['annotation_strings'] = DataFrame(Stats['annotation_strings']) +Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True) + +#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors +Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}} +for c in permittedCharacters: + Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I)) + #print('Bootstrapping sample statistic for annotation character ' + c) + Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations))) + Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1) +Stats['annotation_characters'] = DataFrame(Stats['annotation_characters']) +Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True) + +print('Sampling distribution of annotation strings') +print(Stats['annotation_strings']) +print('Sampling distribution of annotation characters') +print(Stats['annotation_characters']) + +Stats['annotation_strings'].index.name = 'annotation_string' +Stats['annotation_characters'].index.name = 'annotation_character' +Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv') +Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv') + +#Write balanced sample of chunks to file, based on annotation strings +#Duration of each chunk in seconds +chunkDuration = 4 +sampleRate = 48000 +nRandomSamples = 5 +AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart']) +for s in 
Annotations['annotation'].unique(): + R = Annotations[Annotations['annotation'] == s] + I = np.random.choice(len(R), nRandomSamples) + R = R.iloc[I] + R['framestart'] = R['chunk'] * sampleRate * chunkDuration + R['chunk'] = R['chunk'].apply(str) + R['framestart'] = R['framestart'].apply(str) + #R.drop('annotation') + AnnotationSample = AnnotationSample.append(R, ignore_index=True) +AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
--- a/annotation_scripts/warmup_phase/play_random_excerpts.py Tue Jul 07 14:42:09 2015 +0100 +++ b/annotation_scripts/warmup_phase/play_random_excerpts.py Tue Jul 07 16:03:52 2015 +0100 @@ -1,3 +1,17 @@ +#!/usr/bin/python +# +# play_random_excerpts.py: +# Play random sample of excerpts from a list of audio files +# +# Script used in preliminary annotation experiment, with aim of obtaining +# balanced sample of chunks for annotator `warm-up' phase +# (see evaluate_annotations_random_excerpts.py) +# +# Author: Peter Foster +# (c) 2014 Peter Foster +# + + #!/usr/bin/python #