Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Tue Jul 07 16:03:52 2015 +0100
@@ -0,0 +1,4 @@
+This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+
+Copyright for annotation data (c) 2015 Queen Mary University of London.
+Copyright for audio recordings (c) 2010 University of Sheffield.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/VERSION	Tue Jul 07 16:03:52 2015 +0100
@@ -0,0 +1,1 @@
+Version 0.9.0
--- a/analysis_of_annotations/evaluate_annotations.py	Tue Jul 07 14:42:09 2015 +0100
+++ b/analysis_of_annotations/evaluate_annotations.py	Tue Jul 07 16:03:52 2015 +0100
@@ -1,3 +1,13 @@
+#!/usr/bin/python
+#
+# evaluate_annotations.py:
+#    Compute descriptive statistics of annotations, including annotator
+#    agreement
+#
+# Author: Peter Foster
+# (c) 2015 Peter Foster
+#
+
 from pandas import Series, DataFrame
 import glob
 import pandas.io.parsers
--- a/annotation_scripts/annotationkit_create_annotation_protocol.py	Tue Jul 07 14:42:09 2015 +0100
+++ b/annotation_scripts/annotationkit_create_annotation_protocol.py	Tue Jul 07 16:03:52 2015 +0100
@@ -1,12 +1,13 @@
 #!/usr/bin/python
 #
 # annotationkit_create_annotation_protocol.py:
-#    Prepare annotation protocol from list of audio files
-#    Used for obtaining CHiME challenge dataset annotations
-#    Read files from standard input and write csv to standard output
+#    Prepare annotation protocol CSV from list of audio files
+#
+#    Read list of files from standard input and write CSV to standard output
+#    See annotationkit_create_annotation_protocol_wrapper.sh for usage example
 #
 # Author: Peter Foster
-# (c) 2014 Peter Foster
+# (c) 2015 Peter Foster
 #

 import fileinput
--- a/annotation_scripts/annotationkit_play_chunks.py	Tue Jul 07 14:42:09 2015 +0100
+++ b/annotation_scripts/annotationkit_play_chunks.py	Tue Jul 07 16:03:52 2015 +0100
@@ -2,11 +2,13 @@

 #
 # annotationkit_play_chunks.py:
-#    Play excerpts from a list of audio files as defined by a protocol file
-#    Used for obtaining CHiME challenge dataset annotations
+#    Play excerpts from a list of audio files as specified by protocol CSV file
+#    and obtain annotations using text interface
+#
+#    See annotationkit_play_chunks_wrapper.sh for usage example
 #
 # Author: Peter Foster
-# (c) 2014 Peter Foster
+# (c) 2015 Peter Foster
 #

 from scikits.audiolab import Sndfile
@@ -114,4 +116,4 @@
     #Amend and write annotations to csv
     AnnotatedChunks['annotation'].ix[i] = annotationString
     AnnotatedChunks.to_csv(OutputFile)
-
\ No newline at end of file
+
--- a/annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py	Tue Jul 07 14:42:09 2015 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,84 +0,0 @@
-#!/usr/bin/python
-
-#
-# evaluate_annotaions_random_excerpts.py::
-#    Play random excerpts from a list of audio files
-#
-# Author: Peter Foster
-# (c) 2014 Peter Foster
-#
-from pandas import Series, DataFrame
-import pandas.io.parsers
-import re
-import numpy as np
-from collections import defaultdict
-
-AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
-#OutputDir = '/import/c4dm-scratch/peterf/audex/results/exploratory/'
-OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
-Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
-Annotations.columns = ['audiofile', 'chunk', 'annotation']
-#Check integrity -- only specific characters allowed
-permittedCharacters = 'cmfvns'
-assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True))
-
-#Get unique, sorted strings
-Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(set(s)))
-
-#Set random seed for bootstrap sampling
-np.random.seed(4756)
-
-def bootstrap_statistic(Vector, Statistic, nSamples):
-    #Compute statistic across bootstrap samples
-    S = [Statistic(Vector, np.random.choice(len(Vector), len(Vector), replace=True)) for i in range(nSamples)]
-    return S
-
-#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
-Stats = {}
-Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
-Stats['annotation_strings_bootstrap'] = {}
-for s in Annotations['annotation'].unique():
-    Statistic = lambda V,I: sum(V[I] == s) / float(len(I))
-    #Get sample statistic
-    #print('Bootstrapping sample statistic for annotation string ' + s)
-    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations)))
-    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
-Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
-Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
-
-#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
-Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}
-for c in permittedCharacters:
-    Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I))
-    #print('Bootstrapping sample statistic for annotation character ' + c)
-    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations)))
-    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
-Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
-Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
-
-print('Sampling distribution of annotation strings')
-print(Stats['annotation_strings'])
-print('Sampling distribution of annotation characters')
-print(Stats['annotation_characters'])
-
-Stats['annotation_strings'].index.name = 'annotation_string'
-Stats['annotation_characters'].index.name = 'annotation_character'
-Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
-Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')
-
-#Write balanced sample of chunks to file, based on annotation strings
-#Duration of each chunk in seconds
-chunkDuration = 4
-sampleRate = 48000
-nRandomSamples = 5
-AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart'])
-for s in Annotations['annotation'].unique():
-    R = Annotations[Annotations['annotation'] == s]
-    I = np.random.choice(len(R), nRandomSamples)
-    R = R.iloc[I]
-    R['framestart'] = R['chunk'] * sampleRate * chunkDuration
-    R['chunk'] = R['chunk'].apply(str)
-    R['framestart'] = R['framestart'].apply(str)
-    #R.drop('annotation')
-    AnnotationSample = AnnotationSample.append(R, ignore_index=True)
-AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py	Tue Jul 07 16:03:52 2015 +0100
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+#
+# evaluate_annotations_random_excerpts.py:
+#    Analyse preliminary set of annotations; subsequently obtain balanced sample
+#    of chunks for annotator `warm-up' phase
+#
+#    Script used in preliminary evaluations
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+from pandas import Series, DataFrame
+import pandas.io.parsers
+import re
+import numpy as np
+from collections import defaultdict
+
+AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
+OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
+Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
+Annotations.columns = ['audiofile', 'chunk', 'annotation']
+#Check integrity -- only specific characters allowed
+permittedCharacters = 'cmfvns'
+assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True))
+
+#Get unique characters of each string (set() order is arbitrary, not sorted)
+Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(set(s)))
+
+#Set random seed for bootstrap sampling
+np.random.seed(4756)
+
+def bootstrap_statistic(Vector, Statistic, nSamples):
+    #Compute statistic across bootstrap samples
+    S = [Statistic(Vector, np.random.choice(len(Vector), len(Vector), replace=True)) for i in range(nSamples)]
+    return S
+
+#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
+Stats = {}
+Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
+Stats['annotation_strings_bootstrap'] = {}
+for s in Annotations['annotation'].unique():
+    Statistic = lambda V,I: sum(V[I] == s) / float(len(I))
+    #Get sample statistic
+    #print('Bootstrapping sample statistic for annotation string ' + s)
+    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations)))
+    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
+Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
+Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
+
+#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
+Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}
+for c in permittedCharacters:
+    Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I))
+    #print('Bootstrapping sample statistic for annotation character ' + c)
+    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations)))
+    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
+Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
+Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
+
+print('Sampling distribution of annotation strings')
+print(Stats['annotation_strings'])
+print('Sampling distribution of annotation characters')
+print(Stats['annotation_characters'])
+
+Stats['annotation_strings'].index.name = 'annotation_string'
+Stats['annotation_characters'].index.name = 'annotation_character'
+Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
+Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')
+
+#Write balanced sample of chunks to file, based on annotation strings
+#Duration of each chunk in seconds
+chunkDuration = 4
+sampleRate = 48000
+nRandomSamples = 5
+AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart'])
+for s in Annotations['annotation'].unique():
+    R = Annotations[Annotations['annotation'] == s]
+    I = np.random.choice(len(R), nRandomSamples)
+    R = R.iloc[I]
+    R['framestart'] = R['chunk'] * sampleRate * chunkDuration
+    R['chunk'] = R['chunk'].apply(str)
+    R['framestart'] = R['framestart'].apply(str)
+    #R.drop('annotation')
+    AnnotationSample = AnnotationSample.append(R, ignore_index=True)
+AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
--- a/annotation_scripts/warmup_phase/play_random_excerpts.py	Tue Jul 07 14:42:09 2015 +0100
+++ b/annotation_scripts/warmup_phase/play_random_excerpts.py	Tue Jul 07 16:03:52 2015 +0100
@@ -1,3 +1,17 @@
+#!/usr/bin/python
+#
+# play_random_excerpts.py:
+#    Play random sample of excerpts from a list of audio files
+#
+#    Script used in preliminary annotation experiment, with aim of obtaining
+#    balanced sample of chunks for annotator `warm-up' phase
+#    (see evaluate_annotations_random_excerpts.py)
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+
 #!/usr/bin/python

 #