chime-home-dataset-annotation-and-baseline-evaluation-code: annotation_scripts/warmup_phase/evaluate_annotaions_random

comparison annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py @ 0:75c79305d794

Scripts for obtaining and analysing annotations

author	peterf
date	Tue, 07 Jul 2015 14:42:09 +0100
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:75c79305d794
+#!/usr/bin/python
+#
+# evaluate_annotaions_random_excerpts.py::
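+#    Evaluate exploratory annotations: tabulate the proportions of annotation
+#    strings and characters with bootstrapped standard errors, and write a
+#    balanced random sample of annotated chunks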
+#
+# Author: Peter Foster
+# (c) 2014 Peter Foster
+#
+from pandas import Series, DataFrame
+import pandas.io.parsers
+import re
+import numpy as np
+from collections import defaultdict
# Input CSV of exploratory annotations, and the directory that receives the
# CSV files derived from it by this script.
AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
#OutputDir = '/import/c4dm-scratch/peterf/audex/results/exploratory/'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'

# The CSV is read without a header row, so the three columns are named here.
Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
Annotations.columns = ['audiofile', 'chunk', 'annotation']

# Check integrity -- only specific characters allowed.
# An explicit exception is raised (instead of using assert) so that the check
# is still performed when Python runs with -O.
permittedCharacters = 'cmfvns'
_forbiddenCharacter = re.compile('[^' + permittedCharacters + ']')
_isMalformed = Annotations['annotation'].apply(
    lambda s: _forbiddenCharacter.search(s) is not None)
if _isMalformed.any():
    raise ValueError(
        'Annotations contain characters outside "%s": %r'
        % (permittedCharacters,
           sorted(set(Annotations['annotation'][_isMalformed]))))

# Get unique, sorted strings: reduce each annotation to its distinct
# characters in sorted order. Sorting is required because the iteration order
# of a set is arbitrary; without it the same set of labels could be written as
# different strings and be counted as different annotations below.
Annotations['annotation'] = Annotations['annotation'].apply(
    lambda s: ''.join(sorted(set(s))))

# Set random seed for bootstrap sampling
np.random.seed(4756)
def bootstrap_statistic(Vector, Statistic, nSamples, rng=None):
    """Compute a statistic across bootstrap samples of Vector.

    For each of nSamples rounds, len(Vector) indices are drawn uniformly with
    replacement from range(len(Vector)) and handed to Statistic together with
    Vector itself.

    Parameters:
        Vector: sized container holding the observations.
        Statistic: callable Statistic(Vector, Indices) returning the statistic
            of the observations selected by the integer index array Indices.
        nSamples: number of bootstrap rounds.
        rng: optional source of randomness providing a numpy-style
            choice(a, size, replace=...) method, e.g. a
            numpy.random.RandomState. Defaults to the global numpy.random
            state, so results are controlled by numpy.random.seed.

    Returns:
        List with one value of Statistic per bootstrap round.
    """
    if rng is None:
        rng = np.random
    nObservations = len(Vector)
    return [Statistic(Vector, rng.choice(nObservations, nObservations, replace=True))
            for _ in range(nSamples)]
# Get the sampling distribution of annotation strings and of annotation
# characters, in addition to bootstrapped standard errors.

# Number of bootstrap rounds used for every standard error.
nBootstrapSamples = 10000


def _selected_proportion(Flags, I):
    """Return the fraction of True entries among Flags[I].

    Flags is a boolean numpy array and I an integer index array.
    """
    return Flags[I].sum() / float(len(I))


def _proportion_table(LabelledFlags):
    """Tabulate proportion and bootstrapped standard error per label.

    LabelledFlags is an iterable of (label, Flags) pairs, where Flags is a
    boolean numpy array with one entry per annotation row marking the rows
    that carry the label. The indicator array is computed once per label by
    the caller, so each bootstrap round only has to index and sum it.

    Returns a DataFrame indexed by label with the columns 'proportion' and
    'standarderror', sorted by both columns in descending order.
    """
    Table = {'proportion': {}, 'standarderror': {}}
    for label, Flags in LabelledFlags:
        # Sample statistic over all rows
        Table['proportion'][label] = _selected_proportion(Flags, np.arange(len(Flags)))
        # Standard deviation of the statistic across bootstrap resamples
        Table['standarderror'][label] = np.std(
            bootstrap_statistic(Flags, _selected_proportion, nBootstrapSamples), ddof=1)
    # DataFrame.sort no longer exists in current pandas; sort_values is its
    # replacement.
    return DataFrame(Table).sort_values(
        ['proportion', 'standarderror'], ascending=[False, False])


Stats = {}
Stats['annotation_strings_bootstrap'] = {}

# One indicator per distinct annotation string: rows equal to that string.
Stats['annotation_strings'] = _proportion_table(
    (s, (Annotations['annotation'] == s).values)
    for s in Annotations['annotation'].unique())

# One indicator per permitted character: rows whose string contains it.
Stats['annotation_characters'] = _proportion_table(
    (c, np.array([c in a for a in Annotations['annotation']], dtype=bool))
    for c in permittedCharacters)

print('Sampling distribution of annotation strings')
print(Stats['annotation_strings'])
print('Sampling distribution of annotation characters')
print(Stats['annotation_characters'])

# Name the index so that the label column gets a header in the CSV output.
Stats['annotation_strings'].index.name = 'annotation_string'
Stats['annotation_characters'].index.name = 'annotation_character'
Stats['annotation_strings'].to_csv(OutputDir + 'exploratory_labelling_annotation_strings_hist.csv')
Stats['annotation_characters'].to_csv(OutputDir + 'exploratory_labelling_annotation_characters_hist.csv')
# Write balanced sample of chunks to file, based on annotation strings:
# the same number of rows is drawn for every distinct annotation string.
# Duration of each chunk in seconds
chunkDuration = 4
sampleRate = 48000
nRandomSamples = 5
sampleColumns = ['audiofile', 'chunk', 'framestart']

Samples = []
for s in Annotations['annotation'].unique():
    R = Annotations[Annotations['annotation'] == s]
    # NOTE(review): np.random.choice draws with replacement by default, so a
    # chunk can be selected more than once for the same annotation string.
    # This keeps nRandomSamples rows per string even when fewer rows exist --
    # confirm that duplicates are intended.
    I = np.random.choice(len(R), nRandomSamples)
    # Explicit copy, so the column assignments below modify this sample only
    # and never a view of Annotations.
    R = R.iloc[I].copy()
    # Start offset of the chunk: chunk value times samples per chunk.
    R['framestart'] = (R['chunk'] * sampleRate * chunkDuration).apply(str)
    R['chunk'] = R['chunk'].apply(str)
    #R.drop('annotation')
    Samples.append(R)

# DataFrame.append was removed in pandas 2.0; concatenate all samples once.
if Samples:
    AnnotationSample = pandas.concat(Samples, ignore_index=True)
    # Declared columns first, followed by any remaining ones ('annotation').
    otherColumns = [c for c in AnnotationSample.columns if c not in sampleColumns]
    AnnotationSample = AnnotationSample[sampleColumns + otherColumns]
else:
    AnnotationSample = DataFrame(columns=sampleColumns)
AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')

Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code

comparison annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py @ 0:75c79305d794