chime-home-dataset-annotation-and-baseline-evaluation-code
changeset 0:75c79305d794
Scripts for obtaining and analysing annotations
| author | peterf |
|---|---|
| date | Tue, 07 Jul 2015 14:42:09 +0100 |
| parents | |
| children | f079d2de4aa2 |
| files | analysis_of_annotations/evaluate_annotations.py annotation_scripts/annotationkit_create_annotation_protocol.py annotation_scripts/annotationkit_create_annotation_protocol_wrapper.sh annotation_scripts/annotationkit_play_chunks.py annotation_scripts/annotationkit_play_chunks_wrapper.sh annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py annotation_scripts/warmup_phase/play_random_excerpts.py |
| diffstat | 7 files changed, 535 insertions(+), 0 deletions(-) |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/analysis_of_annotations/evaluate_annotations.py Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,209 @@
+from pandas import Series, DataFrame
+import glob
+import pandas.io.parsers
+from pylab import *
+import numpy as np
+import re
+
+DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'
+
+#Read in annotations
+Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_raw.csv',header=None))
+Annotations = []
+for chunk in Chunks:
+    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
+Annotations = DataFrame(Annotations)
+
+Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')
+
+#Check label integrity
+Labels = set('cmfvpboSU')
+for a in Annotators:
+    I = Annotations[a].notnull()
+    Annotations[a].ix[I] = Annotations[a][I].apply(lambda s: ''.join(sorted(s)))
+    assert(all(Annotations[a].ix[I].apply(lambda s: len(set(s).difference(Labels)) == 0)))
+
+#Annotator-wise label counts
+CountStats = {'annotatorwise':{}, 'majorityvote':{}}
+for annotator in Annotators:
+    CountStats['annotatorwise'][annotator] = {}
+    for label in Labels:
+        V1 = Annotations[annotator][Annotations[annotator].notnull()]
+        V1 = V1.apply(lambda s: label in s)
+        CountStats['annotatorwise'][annotator][label] = sum(V1)
+CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
+#Rearrange index for plotting histogram
+CountStats['annotatorwise'] = CountStats['annotatorwise'].ix[['c','m','f','v','p','b','o','S','U']]
+CountStats['annotatorwise_coefficient_of_variation'] = (CountStats['annotatorwise'].T.std() / CountStats['annotatorwise'].T.mean())
+CountStats['annotatorwise_coefficient_of_variation'].sort()
+
+#Histogram of label counts
+fig_width_pt = 246.0  # Get this from LaTeX using \showthe\columnwidth
+inches_per_pt = 1.0/72.27  # Convert pt to inch
+golden_mean = (sqrt(5)-1.0)/2.0  # Aesthetic ratio
+fig_width = fig_width_pt*inches_per_pt  # width in inches
+fig_height = fig_width*golden_mean  # height in inches
+fig_size = [fig_width,fig_height]
+params = {'backend': 'ps',
+          'axes.labelsize': 8,
+          'text.fontsize': 8,
+          'legend.fontsize': 7.0,
+          'xtick.labelsize': 8,
+          'ytick.labelsize': 8,
+          'text.usetex': False,
+          'figure.figsize': fig_size}
+rcParams.update(params)
+ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
+width = 0.29  # the width of the bars
+fig, ax = plt.subplots()
+rects = []
+colours = ('r', 'y', 'g')
+for annotator, i in zip(Annotators, range(len(Annotators))):
+    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator], width, color=colours[i], align='center'))
+# add text for labels, title and axes ticks
+ax.set_ylabel('Count')
+ax.set_xlabel('Label')
+#ax.set_title('Annotator-wise label histogram')
+ax.set_xticks(ind+width)
+ax.set_xticklabels(CountStats['annotatorwise'].index)
+ax.legend( (rect[0] for rect in rects), ('Annotator 1', 'Annotator 2', 'Annotator 3') )
+#Tweak x-axis limit
+ax.set_xlim(left=-0.5)
+ax.set_ylim(top=3500)
+plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
+def autolabel(r):
+    for rects in r:
+        for rect in rects:
+            height = rect.get_height()
+            ax.text(rect.get_x()+0.180,100+height,'%d'%int(height),ha='center',va='bottom',rotation='vertical',size=6.0)
+autolabel(rects)
+plt.draw()
+plt.savefig('figures/annotation_histogram.pdf')
+
+#Generalised Jaccard index
+def jaccardIndex(*Sets):
+    SetN = Sets[0]
+    SetD = Sets[0]
+    for S in Sets:
+        SetN = SetN.intersection(S)
+        SetD = SetD.union(S)
+    if len(SetD) == 0:
+        return 1
+    else:
+        return len(SetN) / float(len(SetD))
+
+AgreementStats = {'jaccardindex_pos':{}}
+for l in Labels:
+    V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: l in s))[0])
+    V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: l in s))[0])
+    V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: l in s))[0])
+    AgreementStats['jaccardindex_pos'][l] = {}
+    AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+    AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+#Experiment with combining label classes
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'p' in s))[0])
+l = '{o,p}'
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,b}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'b' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,S}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'S' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+
+AgreementStats['jaccardindex_pos'] = DataFrame(AgreementStats['jaccardindex_pos']).T
+AgreementStats['jaccardindex_pos'] = AgreementStats['jaccardindex_pos'][['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]
+print('Agreement about label presence (unfiltered dataset)')
+print(AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']])
+#Coefficients of variation across pairs of annotators
+A = (AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']][['a1_a2', 'a1_a3', 'a2_a3']].T)
+print('Coefficients of variation')
+print(A.std() / A.mean())
+
+#Read in annotations for refined dataset
+Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_refined.csv',header=None))
+Annotations = []
+for chunk in Chunks:
+    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
+Annotations = DataFrame(Annotations)
+
+AgreementStats = {'jaccardindex_pos':{}}
+for l in Labels:
+    V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: l in s))[0])
+    V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: l in s))[0])
+    V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: l in s))[0])
+
+    AgreementStats['jaccardindex_pos'][l] = {}
+    AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+    AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+    AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+#Experiment with combining label classes
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'p' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'p' in s))[0])
+l = '{o,p}'
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,b}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'b' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'b' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+l = '{o,S}'
+V1 = set(np.where(Annotations['annotation_a1'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V2 = set(np.where(Annotations['annotation_a2'].apply(lambda s: 'o' in s or 'S' in s))[0])
+V3 = set(np.where(Annotations['annotation_a3'].apply(lambda s: 'o' in s or 'S' in s))[0])
+AgreementStats['jaccardindex_pos'][l] = {}
+AgreementStats['jaccardindex_pos'][l]['a1_a2_a3'] = jaccardIndex(V1, V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a2'] = jaccardIndex(V1, V2)
+AgreementStats['jaccardindex_pos'][l]['a2_a3'] = jaccardIndex(V2, V3)
+AgreementStats['jaccardindex_pos'][l]['a1_a3'] = jaccardIndex(V1, V3)
+
+AgreementStats['jaccardindex_pos'] = DataFrame(AgreementStats['jaccardindex_pos']).T
+AgreementStats['jaccardindex_pos'] = AgreementStats['jaccardindex_pos'][['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]
+print('Agreement about label presence (refined dataset)')
+print(AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']])
+#Coefficients of variation across pairs of annotators
+A = (AgreementStats['jaccardindex_pos'].ix[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']][['a1_a2', 'a1_a3', 'a2_a3']].T)
+print('Coefficients of variation')
+print(A.std() / A.mean())
+
+#Label count stats for majority vote
+CountStats['majorityvote'] = {}
+for label in 'cmfvpboS':
+    CountStats['majorityvote'][label] = {}
+    for comparisonLabel in 'cmfvpboS':
+        V1 = Annotations['majorityvote']
+        V1 = V1.apply(lambda s: label in s and comparisonLabel in s)
+        CountStats['majorityvote'][label][comparisonLabel] = sum(V1)
+CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
+print('Label co-occurrences')
+CountStats['majorityvote'].loc[['c','m','f','v','p','b','o','S'],['c','m','f','v','p','b','o','S']]
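
A note on evaluate_annotations.py above: it targets the pandas API of 2015 (Series.from_csv, .ix indexing, in-place sort()), so it needs a similarly old pandas release to run. The agreement measure itself is plain set arithmetic; below is a minimal, self-contained sketch of the generalised Jaccard index the script computes, using made-up chunk-index sets rather than dataset values.

    # Generalised Jaccard index: |intersection of all sets| / |union of all sets|,
    # defined as 1 when the union is empty (as in evaluate_annotations.py).
    def jaccard_index(*sets):
        union = set().union(*sets)
        if not union:
            return 1.0
        intersection = set(sets[0]).intersection(*sets[1:])
        return len(intersection) / float(len(union))

    # Hypothetical example: indices of chunks labelled 'c' by three annotators.
    a1 = {0, 1, 2, 5}
    a2 = {0, 2, 5, 7}
    a3 = {0, 2, 6}
    print(jaccard_index(a1, a2))      # pairwise agreement: 3/5 = 0.6
    print(jaccard_index(a1, a2, a3))  # three-way agreement: 2/6 = 0.33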
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_create_annotation_protocol.py Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+#
+# annotationkit_create_annotation_protocol.py:
+# Prepare annotation protocol from list of audio files
+# Used for obtaining CHiME challenge dataset annotations
+# Read files from standard input and write csv to standard output
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+import fileinput
+from scikits.audiolab import Sndfile
+from pandas import DataFrame
+import numpy as np
+import sys
+
+#Maximum number of chunks to sample from each file
+nChunksPerFile = np.inf
+#Duration of each chunk in seconds
+chunkDuration = 4
+#Expected sample rate
+sampleRate = 48000
+
+AudioChunks = []
+np.random.seed(4756)
+
+for audioFile in fileinput.input():
+    audioFile = audioFile.strip()
+    sf = Sndfile(audioFile, "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+
+    nChunksInFile = int(sf.nframes / (sf.samplerate * chunkDuration))
+    #Sample without replacement random chunks from file
+    sampledChunks = np.random.choice(nChunksInFile, min(nChunksInFile,nChunksPerFile), replace=False)
+
+    for chunk in sampledChunks:
+        frameStart = chunk * chunkDuration * sf.samplerate
+        AudioChunks.append((audioFile, chunk, frameStart))
+
+    sf.close()
+
+
+#Create DataFrame
+AudioChunks = DataFrame(AudioChunks)
+AudioChunks.columns = ['audiofile', 'chunk', 'framestart']
+
+sys.stderr.write("Processed " + str(len(AudioChunks)) + " chunks in total, corresponding to " + str(len(AudioChunks) * chunkDuration / float(60)) + " minutes of audio.\n")
+
+#Write to CSV (stdout)
+AudioChunks.to_csv(sys.stdout)
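
annotationkit_create_annotation_protocol.py depends on scikits.audiolab, which is Python 2 only and no longer maintained. For orientation, here is a rough sketch of the same protocol-generation step using the soundfile package instead; the library swap is an assumption of this note, not part of the changeset.

    # Sketch only: read audio paths from stdin, split each file into 4-second chunks,
    # and write an annotation protocol CSV to stdout (soundfile instead of scikits.audiolab).
    import sys
    import fileinput
    import numpy as np
    import pandas as pd
    import soundfile  # pip install soundfile

    CHUNK_DURATION = 4   # seconds per chunk
    SAMPLE_RATE = 48000  # expected sample rate

    np.random.seed(4756)
    chunks = []
    for line in fileinput.input():
        path = line.strip()
        info = soundfile.info(path)
        if info.samplerate != SAMPLE_RATE:
            raise ValueError("wanted sample rate %g - got %g" % (SAMPLE_RATE, info.samplerate))
        n_chunks = int(info.frames // (info.samplerate * CHUNK_DURATION))
        for chunk in np.random.permutation(n_chunks):  # all chunks, in random order
            chunks.append((path, int(chunk), int(chunk) * CHUNK_DURATION * info.samplerate))

    pd.DataFrame(chunks, columns=['audiofile', 'chunk', 'framestart']).to_csv(sys.stdout)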
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_create_annotation_protocol_wrapper.sh Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,2 @@
+#!/bin/sh
+find /import/c4dm-datasets/chime/noise_background/PCCdata48kHz/train/background/ | grep wav$ | shuf | python annotationkit_create_annotation_protocol.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_play_chunks.py Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+
+#
+# annotationkit_play_chunks.py:
+# Play excerpts from a list of audio files as defined by a protocol file
+# Used for obtaining CHiME challenge dataset annotations
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+from scikits.audiolab import Sndfile
+from scikits.audiolab import play
+from pandas import DataFrame
+import argparse
+import os
+import sys
+from threading import Thread
+
+#Duration of each chunk in seconds
+chunkDuration = 4
+#Expected sample rate
+sampleRate = 48000
+
+#Annotations containing characters outside this set will not validate
+PermittedAnnotationCharacters = set('cmfvpbosU')
+#These characters may only appear by themself
+LoneAnnotationCharacters = set('sU')
+LabelMap = "\
+c: child speech\n\
+m: adult male speech\n\
+f: adult female speech\n\
+v: video Game/TV\n\
+p: percussive sounds, e.g. crash, bang, knock, footsteps\n\
+b: broadband noise, e.g. household appliances\n\
+o: other identifiable sounds\n\
+s: silence / background noise only\n\
+U: flag chunk (unidentifiable sounds, not sure how to label)\
+"
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--protocolfile', help="Path to annotation protocol file")
+parser.add_argument('--annotationfile', help="Path to annotation file")
+args = vars(parser.parse_args())
+
+#InputFile = '/import/c4dm-scratch/peterf/audex/results/exploratory/annotation_protocol.csv'
+InputFile = args['protocolfile']
+#OutputFile = '/import/c4dm-scratch/peterf/audex/results/exploratory/annotations.csv'
+OutputFile = args['annotationfile']
+
+assert(os.path.isfile(InputFile))
+
+AudioChunks = DataFrame.from_csv(InputFile)
+
+if not(os.path.isfile(OutputFile)):
+    #Initialise annotation file
+    AnnotatedChunks = DataFrame(index=AudioChunks.index, columns=['annotation'])
+    AnnotatedChunks.to_csv(OutputFile)
+
+AnnotatedChunks = DataFrame.from_csv(OutputFile)
+
+#Check index integrity
+assert(all(AnnotatedChunks.index == AudioChunks.index))
+
+#Audio playback
+for i in AnnotatedChunks.index[AnnotatedChunks['annotation'].isnull()]:
+    sf = Sndfile(AudioChunks['audiofile'].ix[i], "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+    sf.seek(AudioChunks['framestart'].ix[i], mode='r')
+    frames = sf.read_frames(chunkDuration * sf.samplerate)
+    sf.close()
+
+    annotationString = ""
+    isValidated = False
+    while not(isValidated):
+        print("Starting playback of chunk " + str(i+1) + " of " + str(len(AudioChunks)))
+
+        #Play chunk in background thread; this allows annotation to begin immediately
+        #We set stdout to devnull, to suppress any output originating from play() function
+        F = open(os.devnull, 'w')
+        old_stdout = sys.stdout
+        sys.stdout = F
+        myFunction = lambda frames, sampleRate: play(frames, sampleRate)
+        myThread = Thread(target=myFunction, args=(frames.T, sampleRate))
+        myThread.start()

+        #Unthreaded code here
+        #play(frames.T, sampleRate)
+
+        old_stdout.write("Enter annotation string, or simply hit return to replay chunk. > ")
+        annotationString = raw_input()
+        annotationString = annotationString.strip()
+
+        #Wait for playback thread to finish; we don't allow simultaneous playback
+        myThread.join()
+        #Recover old stdout, so we can use print()
+        sys.stdout = old_stdout
+
+
+        #Validate annotation string
+        if PermittedAnnotationCharacters.issuperset(annotationString):
+            if any([c in LoneAnnotationCharacters for c in annotationString]) and len(annotationString) > 1:
+                print("Invalid annotation string. Characters in " + str(LoneAnnotationCharacters) + " may only appear in isolation.")
+            elif len(annotationString) > 0:
+                isValidated = True
+        else:
+            if annotationString not in {'?', 'help'}:
+                print("Invalid annotation string.")
+            print("Valid characters are: ")
+            print("------------------------------")
+            print(LabelMap)
+            print("------------------------------")
+
+    #Amend and write annotations to csv
+    AnnotatedChunks['annotation'].ix[i] = annotationString
+    AnnotatedChunks.to_csv(OutputFile)
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/annotationkit_play_chunks_wrapper.sh Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+python annotationkit_play_chunks.py --protocolfile /import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/annotator_1_ana/annotation_protocol.csv --annotationfile /import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/annotator_1_ana/annotations.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,84 @@
+#!/usr/bin/python
+
+#
+# evaluate_annotaions_random_excerpts.py:
+# Evaluate annotations obtained for random excerpts from a list of audio files
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+from pandas import Series, DataFrame
+import pandas.io.parsers
+import re
+import numpy as np
+from collections import defaultdict
+
+AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
+#OutputDir = '/import/c4dm-scratch/peterf/audex/results/exploratory/'
+OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
+Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
+Annotations.columns = ['audiofile', 'chunk', 'annotation']
+#Check integrity -- only specific characters allowed
+permittedCharacters = 'cmfvns'
+assert(all(Annotations['annotation'].apply(lambda s: re.search('[^'+permittedCharacters+']', s) == None) == True))
+
+#Get unique, sorted strings
+Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(sorted(set(s))))
+
+#Set random seed for bootstrap sampling
+np.random.seed(4756)
+
+def bootstrap_statistic(Vector, Statistic, nSamples):
+    #Compute statistic across bootstrap samples
+    S = [Statistic(Vector, np.random.choice(len(Vector), len(Vector), replace=True)) for i in range(nSamples)]
+    return S
+
+#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
+Stats = {}
+Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
+Stats['annotation_strings_bootstrap'] = {}
+for s in Annotations['annotation'].unique():
+    Statistic = lambda V,I: sum(V[I] == s) / float(len(I))
+    #Get sample statistic
+    #print('Bootstrapping sample statistic for annotation string ' + s)
+    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations)))
+    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
+Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
+Stats['annotation_strings'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
+
+#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
+Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}
+for c in permittedCharacters:
+    Statistic = lambda V,I: sum(V[I].apply(lambda s: c in s)) / float(len(I))
+    #print('Bootstrapping sample statistic for annotation character ' + c)
+    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations)))
+    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
+Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
+Stats['annotation_characters'].sort(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
+
+print('Sampling distribution of annotation strings')
+print(Stats['annotation_strings'])
+print('Sampling distribution of annotation characters')
+print(Stats['annotation_characters'])
+
+Stats['annotation_strings'].index.name = 'annotation_string'
+Stats['annotation_characters'].index.name = 'annotation_character'
+Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
+Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')
+
+#Write balanced sample of chunks to file, based on annotation strings
+#Duration of each chunk in seconds
+chunkDuration = 4
+sampleRate = 48000
+nRandomSamples = 5
+AnnotationSample = DataFrame(columns=['audiofile','chunk','framestart'])
+for s in Annotations['annotation'].unique():
+    R = Annotations[Annotations['annotation'] == s]
+    I = np.random.choice(len(R), nRandomSamples)
+    R = R.iloc[I]
+    R['framestart'] = R['chunk'] * sampleRate * chunkDuration
+    R['chunk'] = R['chunk'].apply(str)
+    R['framestart'] = R['framestart'].apply(str)
+    #R.drop('annotation')
+    AnnotationSample = AnnotationSample.append(R, ignore_index=True)
+AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')
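
The bootstrap in evaluate_annotaions_random_excerpts.py estimates a standard error by resampling annotation indices with replacement and taking the standard deviation of the recomputed proportions. A self-contained illustration on synthetic labels (the label counts below are invented):

    # Bootstrap standard error of a proportion, as used above, on synthetic data.
    import numpy as np

    np.random.seed(0)
    labels = np.array(['c'] * 30 + ['m'] * 10 + ['s'] * 60)  # hypothetical annotations

    def bootstrap_statistic(vector, statistic, n_samples):
        # Recompute the statistic on n_samples resamples drawn with replacement.
        n = len(vector)
        return [statistic(vector, np.random.choice(n, n, replace=True)) for _ in range(n_samples)]

    proportion_c = lambda v, idx: np.mean(v[idx] == 'c')
    estimate = proportion_c(labels, np.arange(len(labels)))
    stderr = np.std(bootstrap_statistic(labels, proportion_c, 10000), ddof=1)
    print("proportion of 'c': %.2f +/- %.3f" % (estimate, stderr))
    # For a proportion p over n items this is close to sqrt(p*(1-p)/n), about 0.046 here.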
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/annotation_scripts/warmup_phase/play_random_excerpts.py Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+
+#
+# play_random_excerpts.py:
+# Play random excerpts from a list of audio files
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+
+import glob
+from scikits.audiolab import Sndfile
+from scikits.audiolab import play
+from pandas import DataFrame
+import numpy as np
+
+AudioPath = '/import/c4dm-datasets/chime/noise_background/PCCdata48kHz/train/background/'
+OutputFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
+FileList = glob.glob(AudioPath + '*.wav')
+#Number of chunks to sample from each file
+nChunksPerFile = 16
+#Duration of each chunk in seconds
+chunkDuration = 4
+sampleRate = 48000
+
+AudioChunks = []
+np.random.seed(4756)
+
+for audioFile in FileList:
+    sf = Sndfile(audioFile, "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+
+    nChunksInFile = int(sf.nframes / (sf.samplerate * chunkDuration))
+    #Sample random chunks from file
+    sampledChunks = np.random.choice(nChunksInFile, min(nChunksInFile,nChunksPerFile), replace=False)
+
+    for chunk in sampledChunks:
+        frameStart = chunk * chunkDuration * sf.samplerate
+        AudioChunks.append((audioFile, chunk, frameStart))
+
+    sf.close()
+
+
+#Create DataFrame
+AudioChunks = DataFrame(AudioChunks)
+AudioChunks.columns = ['audiofile', 'chunk', 'framestart']
+
+print("Sampled " + str(len(AudioChunks)) + " chunks in total, corresponding to " + str(len(AudioChunks) * chunkDuration / float(60)) + " minutes of audio.")
+
+
+#Audio playback
+for c in range(len(AudioChunks)):
+    sf = Sndfile(AudioChunks['audiofile'][c], "r")
+    if sf.samplerate != sampleRate: raise ValueError("wanted sample rate %g - got %g." % (sampleRate, sf.samplerate))
+    sf.seek(AudioChunks['framestart'][c], mode='r')
+    frames = sf.read_frames(chunkDuration * sf.samplerate)
+    sf.close()
+
+    S = ""
+
+    while len(S) == 0:
+        n = c + 1
+        print("Starting playback of chunk " + str(n) + " of " + str(len(AudioChunks)))
+        play(frames.T, sampleRate);
+        S = raw_input("Enter annotation string, or simply hit return to replay chunk. > ")
+
+    f = open(OutputFile, 'a')
+    f.write(AudioChunks['audiofile'][c] + "," + str(AudioChunks['chunk'][c]) + "," + S + "\n")
+    f.close()