Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/analysis_of_annotations/evaluate_annotations.py	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,209 @@
+from pandas import Series, DataFrame
+import glob
+import pandas.io.parsers
+from pylab import *
+import numpy as np
+import re
+
+DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'
+
+#Read in annotations
+Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_raw.csv',header=None))
+Annotations = []
+for chunk in Chunks:
+    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
+Annotations = DataFrame(Annotations)
+
+Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')
+
+#Check label integrity
+Labels = set('cmfvpboSU')
+for a in Annotators:
+    I = Annotations[a].notnull()
+    Annotations[a].ix[I] = Annotations[a][I].apply(lambda s: ''.join(sorted(s)))
+    assert(all(Annotations[a].ix[I].apply(lambda s: len(set(s).difference(Labels)) == 0)))
+
+#Annotator-wise label counts
+CountStats = {'annotatorwise':{}, 'majorityvote':{}}
+for annotator in Annotators:
+    CountStats['annotatorwise'][annotator] = {}
+    for label in Labels:
+        V1 = Annotations[annotator][Annotations[annotator].notnull()]
+        V1 = V1.apply(lambda s: label in s)
+        CountStats['annotatorwise'][annotator][label] = sum(V1)
+CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
+#Rearrange index for plotting histogram
+CountStats['annotatorwise'] = CountStats['annotatorwise'].ix[['c','m','f','v','p','b','o','S','U']]
+CountStats['annotatorwise_coefficient_of_variation'] = (CountStats['annotatorwise'].T.std() / CountStats['annotatorwise'].T.mean())
+CountStats['annotatorwise_coefficient_of_variation'].sort()
+
+#Histogram of label counts
+fig_width_pt = 246.0 # Get this from LaTeX using \showthe\columnwidth
+inches_per_pt = 1.0/72.27 # Convert pt to inch
+golden_mean = (sqrt(5)-1.0)/2.0 # Aesthetic ratio
+fig_width = fig_width_pt*inches_per_pt # width in inches
+fig_height = fig_width*golden_mean # height in inches
+fig_size =  [fig_width,fig_height]
+params = {'backend': 'ps',
+            'axes.labelsize': 8,
+            'text.fontsize': 8,
+            'legend.fontsize': 7.0,
+            'xtick.labelsize': 8,
+            'ytick.labelsize': 8,
+            'text.usetex': False,
+            'figure.figsize': fig_size}
+rcParams.update(params)
+ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
+width = 0.29       # the width of the bars
+fig, ax = plt.subplots()
+rects = []
+colours = ('r', 'y', 'g')
+for annotator, i in zip(Annotators, range(len(Annotators))):
+    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator], width, color=colours[i], align='center'))
+# add text for labels, title and axes ticks
+ax.set_ylabel('Count')
+ax.set_xlabel('Label')
+#ax.set_title('Annotator-wise label histogram')
+ax.set_xticks(ind+width)
+ax.set_xticklabels(CountStats['annotatorwise'].index)
+ax.legend( (rect[0] for rect in rects), ('Annotator 1', 'Annotator 2', 'Annotator 3') )
+#Tweak x-axis limit
+ax.set_xlim(left=-0.5)
+ax.set_ylim(top=3500)
+plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
+def autolabel(r):
+    #Write every bar's height, as an integer, in vertical text above the bar
+    for bar in (bar for group in r for bar in group):
+        barHeight = bar.get_height()
+        ax.text(bar.get_x()+0.180, 100+barHeight, '%d' % int(barHeight), ha='center', va='bottom', rotation='vertical', size=6.0)
+autolabel(rects)
+plt.draw()
+plt.savefig('figures/annotation_histogram.pdf')
+
+#Generalised Jaccard index: |intersection of all sets| / |union of all sets|
+def jaccardIndex(*Sets):
+    """Return the Jaccard index of one or more sets.
+
+    If the union of the sets is empty, the index is defined as 1.
+    """
+    common = Sets[0].intersection(*Sets[1:])
+    combined = Sets[0].union(*Sets[1:])
+    if not combined:
+        return 1
+    return len(common) / float(len(combined))
+
+AgreementStats = {'jaccardindex_pos':{}}
+for l in Labels:
+    V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: l in s))[0])
+    V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: l in s))[0])
+    V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: l in s))[0])
+    AgreementStats['jaccardindex_pos'][l] = {}
+    AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+    AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+#Experiment with combining label classes
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'p' in s))[0])
+l = '{o,p}'
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,b}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'b' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,S}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'S' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+
+AgreementStats['jaccardindex_pos'] = DataFrame(AgreementStats['jaccardindex_pos']).T
+AgreementStats['jaccardindex_pos'] = AgreementStats['jaccardindex_pos'][['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]
+print('Agreement about label presence (unfiltered dataset)')
+print(AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']])
+#Coefficients of variation across pairs of annotators
+A = (AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']][['a1_a2', 'a1_a3', 'a2_a3']].T)
+print('Coefficients of variation')
+print(A.std() / A.mean())
+
+#Read in annotations for refined dataset
+Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_refined.csv',header=None))
+Annotations = []
+for chunk in Chunks:
+    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
+Annotations = DataFrame(Annotations)
+
+AgreementStats = {'jaccardindex_pos':{}}
+for l in Labels:
+    V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: l in s))[0])
+    V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: l in s))[0])
+    V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: l in s))[0])
+
+    AgreementStats['jaccardindex_pos'][l] = {}
+    AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+    AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+#Experiment with combining label classes
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'p' in s))[0])
+l = '{o,p}'
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,b}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'b' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,S}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'S' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+
+AgreementStats['jaccardindex_pos'] = DataFrame(AgreementStats['jaccardindex_pos']).T
+AgreementStats['jaccardindex_pos'] = AgreementStats['jaccardindex_pos'][['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]
+print('Agreement about label presence (refined dataset)')
+print(AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']])
+#Coefficients of variation across pairs of annotators
+A = (AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']][['a1_a2', 'a1_a3', 'a2_a3']].T)
+print('Coefficients of variation')
+print(A.std() / A.mean())
+
+#Label co-occurrence counts for majority vote (entry [a][b]: chunks whose label string contains both a and b)
+CountStats['majorityvote'] = {}
+for label in 'cmfvpboS':
+    CountStats['majorityvote'][label] = {}
+    for comparisonLabel in 'cmfvpboS':
+        V1 = Annotations['majorityvote'].apply(lambda s: label in s and comparisonLabel in s)
+        CountStats['majorityvote'][label][comparisonLabel] = sum(V1)
+CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
+print('Label co-occurrences')
+#Print explicitly: a bare expression statement produces no output when run as a script
+print(CountStats['majorityvote'].loc[['c','m','f','v','p','b','o','S'],['c','m','f','v','p','b','o','S']])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_create_annotation_protocol.py	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+#
+# annotationkit_create_annotation_protocol.py:
+#    Prepare annotation protocol from list of audio files
+#    Used for obtaining CHiME challenge dataset annotations
+#    Read files from standard input and write csv to standard output
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+import fileinput
+from scikits.audiolab import Sndfile
+from pandas import DataFrame
+import numpy as np
+import sys
+
+#Maximum number of chunks to sample from each file
+nChunksPerFile = np.inf
+#Duration of each chunk in seconds
+chunkDuration = 4
+#Expected sample rate
+sampleRate = 48000
+
+AudioChunks = []
+np.random.seed(4756)
+
+for audioFile in fileinput.input():
+    audioFile = audioFile.strip()
+    sf = Sndfile(audioFile, "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+
+    nChunksInFile = int(sf.nframes / (sf.samplerate * chunkDuration))
+    #Sample without replacement random chunks from file
+    sampledChunks = np.random.choice(nChunksInFile, min(nChunksInFile,nChunksPerFile), replace=False)
+
+    for chunk in sampledChunks:
+        frameStart = chunk * chunkDuration * sf.samplerate
+        AudioChunks.append((audioFile, chunk, frameStart))
+
+    sf.close()
+
+
+#Create DataFrame
+AudioChunks = DataFrame(AudioChunks)
+AudioChunks.columns = ['audiofile', 'chunk', 'framestart']
+
+sys.stderr.write("Processed " + str(len(AudioChunks)) + " chunks in total, corresponding to " + str(len(AudioChunks) * chunkDuration / float(60)) + " minutes of audio.\n")
+
+#Write to CSV (stdout)
+AudioChunks.to_csv(sys.stdout)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_create_annotation_protocol_wrapper.sh	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,2 @@
+#!/bin/sh
+find /import/c4dm-datasets/chime/noise_background/PCCdata48kHz/train/background/ | grep wav$ | shuf | python annotationkit_create_annotation_protocol.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_play_chunks.py	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+
+#
+# annotationkit_play_chunks.py:
+#    Play excerpts from a list of audio files as defined by a protocol file
+#    Used for obtaining CHiME challenge dataset annotations
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+from scikits.audiolab import Sndfile
+from scikits.audiolab import play
+from pandas import DataFrame
+import argparse
+import os
+import sys
+from threading import Thread
+
+#Duration of each chunk in seconds
+chunkDuration = 4
+#Expected sample rate
+sampleRate = 48000
+
+#Annotations containing characters outside this set will not validate
+PermittedAnnotationCharacters = set('cmfvpbosU')
+#These characters may only appear by themselves (not combined with any other label)
+LoneAnnotationCharacters = set('sU')
+LabelMap = "\
+c:   child speech\n\
+m:   adult male speech\n\
+f:   adult female speech\n\
+v:   video Game/TV\n\
+p:   percussive sounds, e.g. crash, bang, knock, footsteps\n\
+b:   broadband noise, e.g. household appliances\n\
+o:   other identifiable sounds\n\
+s:   silence / background noise only\n\
+U:   flag chunk (unidentifiable sounds, not sure how to label)\
+"
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--protocolfile', help="Path to annotation protocol file")
+parser.add_argument('--annotationfile', help="Path to annotation file")
+args = vars(parser.parse_args())
+
+#InputFile = '/import/c4dm-scratch/peterf/audex/results/exploratory/annotation_protocol.csv'
+InputFile = args['protocolfile']
+#OutputFile = '/import/c4dm-scratch/peterf/audex/results/exploratory/annotations.csv'
+OutputFile = args['annotationfile']
+
+assert(os.path.isfile(InputFile))
+
+AudioChunks = DataFrame.from_csv(InputFile)
+
+if not(os.path.isfile(OutputFile)):
+    #Initialise annotation file
+    AnnotatedChunks = DataFrame(index=AudioChunks.index, columns=['annotation'])
+    AnnotatedChunks.to_csv(OutputFile)
+
+AnnotatedChunks = DataFrame.from_csv(OutputFile)
+
+#Check index integrity
+assert(all(AnnotatedChunks.index  == AudioChunks.index))
+
+#Audio playback: only chunks whose annotation is still missing are presented
+for i in AnnotatedChunks.index[AnnotatedChunks['annotation'].isnull()]:
+    sf = Sndfile(AudioChunks['audiofile'].ix[i], "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+    sf.seek(AudioChunks['framestart'].ix[i], mode='r')
+    frames = sf.read_frames(chunkDuration * sf.samplerate)
+    sf.close()
+
+    annotationString = ""
+    isValidated = False
+    while not(isValidated):
+        print("Starting playback of chunk  " + str(i+1) + " of " + str(len(AudioChunks)))
+
+        #Play chunk in background thread; this allows annotation to begin immediately
+        #We set stdout to devnull, to suppress any output originating from play() function
+        F = open(os.devnull, 'w')
+        old_stdout = sys.stdout
+        sys.stdout = F
+        #try/finally guarantees stdout is restored even if the prompt is interrupted
+        try:
+            myThread = Thread(target=play, args=(frames.T, sampleRate))
+            myThread.start()
+
+            old_stdout.write("Enter annotation string, or simply hit return to replay chunk. > ")
+            annotationString = raw_input()
+            annotationString = annotationString.strip()
+
+            #Wait for playback thread to finish; we don't allow simultaneous playback
+            myThread.join()
+        finally:
+            #Recover old stdout, so we can use print(), and close the devnull handle
+            #(otherwise one file handle is leaked per playback)
+            sys.stdout = old_stdout
+            F.close()
+
+        #Validate annotation string
+        if PermittedAnnotationCharacters.issuperset(annotationString):
+            if any([c in LoneAnnotationCharacters for c in annotationString]) and len(annotationString) > 1:
+                print("Invalid annotation string. Characters in " + str(LoneAnnotationCharacters) + " may only appear in isolation.")
+            elif len(annotationString) > 0:
+                isValidated = True
+        else:
+            if annotationString not in {'?', 'help'}:
+                print("Invalid annotation string.")
+            print("Valid characters are: ")
+            print("------------------------------")
+            print(LabelMap)
+            print("------------------------------")
+
+    #Amend and write annotations to csv
+    AnnotatedChunks['annotation'].ix[i] = annotationString
+    AnnotatedChunks.to_csv(OutputFile)
+
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_play_chunks_wrapper.sh	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,3 @@
+#!/bin/sh
+# Run the chunk annotation tool on the protocol and annotation files stored under annotator_1_ana
+python annotationkit_play_chunks.py --protocolfile /import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/annotator_1_ana/annotation_protocol.csv --annotationfile /import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/annotator_1_ana/annotations.csv
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,84 @@
+#!/usr/bin/python
+
+#
+# evaluate_annotaions_random_excerpts.py:
+#    Compute label statistics (with bootstrapped standard errors) for the exploratory annotations and write a balanced sample of chunks
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+from pandas import Series, DataFrame
+import pandas.io.parsers
+import re
+import numpy as np
+from collections import defaultdict
+
+AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
+#OutputDir = '/import/c4dm-scratch/peterf/audex/results/exploratory/'
+OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
+Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
+Annotations.columns = ['audiofile', 'chunk', 'annotation']
+#Check integrity -- only specific characters allowed
+permittedCharacters = 'cmfvns'
+assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True))
+
+#Get unique, sorted strings (sorted() is required: set iteration order is arbitrary, so equal label sets could otherwise yield different strings)
+Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(sorted(set(s))))
+
+#Set random seed for bootstrap sampling
+np.random.seed(4756)
+
+def bootstrap_statistic(Vector, Statistic, nSamples):
+    #Evaluate Statistic on nSamples index resamples, each of size len(Vector) and drawn with replacement
+    nItems = len(Vector)
+    return [Statistic(Vector, np.random.choice(nItems, nItems, replace=True)) for _ in range(nSamples)]
+
+#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
+Stats = {}
+Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
+Stats['annotation_strings_bootstrap'] = {}
+for s in Annotations['annotation'].unique():
+    Statistic = lambda V,I: sum(V[I] == s) / float(len(I))
+    #Get sample statistic
+    #print('Bootstrapping sample statistic for annotation string ' + s)
+    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations)))
+    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
+Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
+Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
+
+#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
+Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}
+for c in permittedCharacters:
+    Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I))
+    #print('Bootstrapping sample statistic for annotation character ' + c)
+    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations)))
+    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
+Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
+Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
+
+print('Sampling distribution of annotation strings')
+print(Stats['annotation_strings'])
+print('Sampling distribution of annotation characters')
+print(Stats['annotation_characters'])
+
+Stats['annotation_strings'].index.name = 'annotation_string'
+Stats['annotation_characters'].index.name = 'annotation_character'
+Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
+Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')
+
+#Write balanced sample of chunks to file, based on annotation strings
+#Duration of each chunk in seconds
+chunkDuration = 4
+sampleRate = 48000
+nRandomSamples = 5
+AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart'])
+for s in Annotations['annotation'].unique():
+    R = Annotations[Annotations['annotation'] == s]
+    I = np.random.choice(len(R), nRandomSamples)
+    R = R.iloc[I]
+    R['framestart'] = R['chunk'] * sampleRate * chunkDuration
+    R['chunk'] = R['chunk'].apply(str)
+    R['framestart'] = R['framestart'].apply(str)
+    #R.drop('annotation')
+    AnnotationSample = AnnotationSample.append(R, ignore_index=True)
+AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/warmup_phase/play_random_excerpts.py	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+
+#
+# play_random_excerpts.py:
+#    Play random excerpts from a list of audio files
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+import glob
+from scikits.audiolab import Sndfile
+from scikits.audiolab import play
+from pandas import DataFrame
+import numpy as np
+
+AudioPath = '/import/c4dm-datasets/chime/noise_background/PCCdata48kHz/train/background/'
+OutputFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
+FileList = glob.glob(AudioPath + '*.wav')
+#Number of chunks to sample from each file
+nChunksPerFile = 16
+#Duration of each chunk in seconds
+chunkDuration = 4
+sampleRate = 48000
+
+AudioChunks = []
+np.random.seed(4756)
+
+for audioFile in FileList:
+    sf = Sndfile(audioFile, "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+
+    nChunksInFile = int(sf.nframes / (sf.samplerate * chunkDuration))
+    #Sample random chunks from file
+    sampledChunks = np.random.choice(nChunksInFile, min(nChunksInFile,nChunksPerFile), replace=False)
+
+    for chunk in sampledChunks:
+        frameStart = chunk * chunkDuration * sf.samplerate
+        AudioChunks.append((audioFile, chunk, frameStart))
+
+    sf.close()
+
+
+#Create DataFrame
+AudioChunks = DataFrame(AudioChunks)
+AudioChunks.columns = ['audiofile', 'chunk', 'framestart']
+
+print("Sampled " + str(len(AudioChunks)) + " in total, corresponding to " + str(len(AudioChunks) * chunkDuration / float(60)) + " minutes of audio.")
+
+
+#Audio playback
+nSampledChunks = len(AudioChunks)
+for c in range(nSampledChunks):
+    sf = Sndfile(AudioChunks['audiofile'][c], "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+    sf.seek(AudioChunks['framestart'][c], mode='r')
+    frames = sf.read_frames(chunkDuration * sf.samplerate)
+    sf.close()
+
+    #Replay the chunk until a non-empty annotation string has been entered
+    S = ""
+    while not S:
+        print("Starting playback of chunk  " + str(c + 1) + " of " + str(nSampledChunks))
+        play(frames.T, sampleRate)
+        S = raw_input("Enter annotation string, or simply hit return to replay chunk. > ")
+
+    #Append the annotation to the output file straight away
+    with open(OutputFile, 'a') as f:
+        f.write(AudioChunks['audiofile'][c] + "," + str(AudioChunks['chunk'][c]) + "," + S + "\n")