Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code
comparison annotation_scripts/warmup_phase/evaluate_annotaions_random_excerpts.py @ 0:75c79305d794
Scripts for obtaining and analysing annotations
author | peterf |
---|---|
date | Tue, 07 Jul 2015 14:42:09 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:75c79305d794 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 # | |
4 # evaluate_annotaions_random_excerpts.py:: | |
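5 # Summarise exploratory annotations: proportions of annotation strings and characters with bootstrapped standard errors, plus a balanced random sample of chunks | |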
6 # | |
7 # Author: Peter Foster | |
8 # (c) 2014 Peter Foster | |
9 # | |
10 from pandas import Series, DataFrame | |
11 import pandas.io.parsers | |
12 import re | |
13 import numpy as np | |
14 from collections import defaultdict | |
15 | |
#Input CSV (no header row) and the directory that receives all output files
AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
#OutputDir = '/import/c4dm-scratch/peterf/audex/results/exploratory/'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
Annotations.columns = ['audiofile', 'chunk', 'annotation']

#Check integrity -- only specific characters allowed.
#Raise explicitly rather than assert, so the check survives `python -O`.
permittedCharacters = 'cmfvns'
illegalPattern = re.compile('[^' + permittedCharacters + ']')
hasIllegalCharacter = Annotations['annotation'].apply(lambda s: illegalPattern.search(s) is not None)
if hasIllegalCharacter.any():
    raise ValueError('Annotations contain characters outside "%s" in rows: %s'
                     % (permittedCharacters, list(Annotations.index[hasIllegalCharacter.values])))

#Get unique, sorted strings. Sorting is required: set iteration order is
#arbitrary, so without it the same label set could be spelled in several
#ways and be counted as distinct annotation strings.
Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(sorted(set(s))))

#Set random seed for bootstrap sampling
np.random.seed(4756)
30 | |
def bootstrap_statistic(Vector, Statistic, nSamples):
    """Evaluate Statistic on nSamples bootstrap resamples of Vector.

    Vector    -- sized container of observations; only len(Vector) is used
                 here, the container itself is handed to Statistic untouched.
    Statistic -- callable Statistic(Vector, I), where I is an array of
                 len(Vector) positions drawn uniformly with replacement.
    nSamples  -- number of bootstrap resamples to draw.

    Returns a list with one Statistic value per resample. Draws come from
    NumPy's global random state, so seed it beforehand for reproducibility.
    """
    nObservations = len(Vector)
    S = []
    for _ in range(nSamples):
        #Resample positions (not values) so Statistic decides how to index
        I = np.random.choice(nObservations, nObservations, replace=True)
        S.append(Statistic(Vector, I))
    return S
35 | |
#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
Stats = {}
Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
#Kept from the original script; nothing in this file fills or reads it
Stats['annotation_strings_bootstrap'] = {}

def Statistic(V, I):
    """Return the fraction of True entries of boolean array V at positions I."""
    return np.count_nonzero(V[I]) / float(len(I))

allPositions = np.arange(len(Annotations))
for s in Annotations['annotation'].unique():
    #Indicator of "row carries string s", computed once per string instead
    #of once per bootstrap resample
    isMatch = (Annotations['annotation'] == s).values
    #Get sample statistic
    #print('Bootstrapping sample statistic for annotation string ' + s)
    Stats['annotation_strings']['proportion'][s] = Statistic(isMatch, allPositions)
    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(isMatch, Statistic, 10000), ddof=1)
Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
#DataFrame.sort no longer exists in pandas; sort_values is its replacement
Stats['annotation_strings'].sort_values(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
48 | |
#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}

def Statistic(V, I):
    """Return the fraction of True entries of boolean array V at positions I."""
    return np.count_nonzero(V[I]) / float(len(I))

allPositions = np.arange(len(Annotations))
for c in permittedCharacters:
    #Indicator of "annotation contains character c", computed once per
    #character instead of once per bootstrap resample; c is bound as a
    #default argument so the lambda does not depend on the loop variable
    containsCharacter = Annotations['annotation'].apply(lambda a, c=c: c in a).values
    #print('Bootstrapping sample statistic for annotation character ' + c)
    Stats['annotation_characters']['proportion'][c] = Statistic(containsCharacter, allPositions)
    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(containsCharacter, Statistic, 10000), ddof=1)
Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
#DataFrame.sort no longer exists in pandas; sort_values is its replacement
Stats['annotation_characters'].sort_values(['proportion', 'standarderror'], ascending=[False,False], inplace=True)
58 | |
#Show both tables on stdout; this happens before the index names are set
for tableTitle, tableKey in (
        ('Sampling distribution of annotation strings', 'annotation_strings'),
        ('Sampling distribution of annotation characters', 'annotation_characters')):
    print(tableTitle)
    print(Stats[tableKey])

#Name each index, then write each table to its CSV file in OutputDir
histogramOutputs = (
    ('annotation_strings', 'annotation_string', 'exploratory_labelling_annotation_strings_hist.csv'),
    ('annotation_characters', 'annotation_character', 'exploratory_labelling_annotation_characters_hist.csv'),
)
for tableKey, indexName, fileName in histogramOutputs:
    Stats[tableKey].index.name = indexName
for tableKey, indexName, fileName in histogramOutputs:
    Stats[tableKey].to_csv(OutputDir + fileName)
68 | |
#Write balanced sample of chunks to file, based on annotation strings
#Duration of each chunk in seconds
chunkDuration = 4
sampleRate = 48000
nRandomSamples = 5
sampleColumns = ['audiofile', 'chunk', 'framestart']
sampledFrames = []
for s in Annotations['annotation'].unique():
    R = Annotations[Annotations['annotation'] == s]
    #np.random.choice draws with replacement by default, so a row can be
    #picked more than once and strings with fewer than nRandomSamples rows
    #still yield nRandomSamples entries
    I = np.random.choice(len(R), nRandomSamples)
    #Copy so the column assignments below act on an independent frame
    R = R.iloc[I].copy()
    #First frame of the chunk: chunk number * frames per chunk, stored as text
    R['framestart'] = (R['chunk'] * sampleRate * chunkDuration).apply(str)
    R['chunk'] = R['chunk'].apply(str)
    sampledFrames.append(R)
if sampledFrames:
    #One concat instead of repeated DataFrame.append (removed in pandas 2.0)
    AnnotationSample = pandas.concat(sampledFrames, ignore_index=True)
    #The 'annotation' column stays in the output, as it did in the original
    AnnotationSample = AnnotationSample[sampleColumns + ['annotation']]
else:
    AnnotationSample = DataFrame(columns=sampleColumns)
AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')