peterf@0
|
1 #!/usr/bin/python
|
peterf@0
|
2 #
|
peterf@1
|
3 # evaluate_annotations_random_excerpts.py:
|
peterf@1
|
4 # Analyse preliminary set of annotations; subsequently obtain balanced sample
|
peterf@1
|
5 # of chunks for annotator `warm-up' phase
|
peterf@1
|
6 #
|
peterf@1
|
7 # Script used in preliminary evaluations
|
peterf@0
|
8 #
|
peterf@0
|
9 # Author: Peter Foster
|
peterf@0
|
10 # (c) 2014 Peter Foster
|
peterf@0
|
11 #
|
peterf@1
|
12
|
peterf@0
|
13 from pandas import Series, DataFrame
|
peterf@0
|
14 import pandas.io.parsers
|
peterf@0
|
15 import re
|
peterf@0
|
16 import numpy as np
|
peterf@0
|
17 from collections import defaultdict
|
peterf@0
|
18
|
# Input CSV (read without a header row) and the directory that receives all outputs.
AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
Annotations.columns = ['audiofile', 'chunk', 'annotation']

# Check integrity -- only specific characters allowed
permittedCharacters = 'cmfvns'
_illegalCharacter = re.compile('[^' + permittedCharacters + ']')
_invalidRows = Annotations['annotation'].apply(lambda s: _illegalCharacter.search(s) is not None)
if _invalidRows.any():
    # Raised explicitly (rather than asserted) so the check is not stripped by `python -O`.
    raise ValueError('Annotations contain characters outside "%s": %r'
                     % (permittedCharacters, list(Annotations['annotation'][_invalidRows].unique())))

# Get unique, sorted strings.
# set() removes repeated characters but has no defined order, so the characters
# are sorted explicitly; otherwise the same label set could yield different
# strings and be counted as distinct annotation strings further down.
Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(sorted(set(s))))

# Set random seed for bootstrap sampling
np.random.seed(4756)
peterf@0
|
32
|
def bootstrap_statistic(Vector, Statistic, nSamples):
    """Evaluate a statistic on repeated bootstrap resamples of Vector.

    In each of the nSamples rounds, len(Vector) indices are drawn with
    replacement from range(len(Vector)) using NumPy's global random state,
    and Statistic(Vector, indices) is called with them.

    Returns a list holding the nSamples statistic values in the order they
    were computed.
    """
    replicates = []
    for _ in range(nSamples):
        size = len(Vector)
        resampled = np.random.choice(size, size, replace=True)
        replicates.append(Statistic(Vector, resampled))
    return replicates
peterf@0
|
37
|
# Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
Stats = {}
Stats['annotation_strings'] = {'proportion': {}, 'standarderror': {}}
Stats['annotation_strings_bootstrap'] = {}
for s in Annotations['annotation'].unique():
    # Fraction of the entries selected by I whose annotation string equals s.
    # s is bound as a default argument so the lambda carries its own value
    # instead of reading the loop variable at call time.
    Statistic = lambda V, I, s=s: sum(V[I] == s) / float(len(I))
    # Sample statistic, evaluated on every row exactly once
    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], range(len(Annotations)))
    # Standard error: sample standard deviation (ddof=1) over 10000 bootstrap replicates
    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
# DataFrame.sort() was removed in pandas 0.20; sort_values() is its replacement.
Stats['annotation_strings'].sort_values(['proportion', 'standarderror'], ascending=[False, False], inplace=True)
peterf@0
|
50
|
# Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
Stats['annotation_characters'] = {'proportion': {}, 'standarderror': {}}
for c in permittedCharacters:
    # Fraction of the entries selected by I whose annotation string contains c.
    # c is bound as a default argument so the lambda carries its own value
    # instead of reading the loop variable at call time.
    Statistic = lambda V, I, c=c: sum(V[I].apply(lambda s: c in s)) / float(len(I))
    # Sample statistic, evaluated on every row exactly once
    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], range(len(Annotations)))
    # Standard error: sample standard deviation (ddof=1) over 10000 bootstrap replicates
    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
# DataFrame.sort() was removed in pandas 0.20; sort_values() is its replacement.
Stats['annotation_characters'].sort_values(['proportion', 'standarderror'], ascending=[False, False], inplace=True)
peterf@0
|
60
|
# Print both tables to stdout. This happens before the index names are
# assigned below, so the printed tables are shown without an index name.
for heading, statsKey in (
        ('Sampling distribution of annotation strings', 'annotation_strings'),
        ('Sampling distribution of annotation characters', 'annotation_characters')):
    print(heading)
    print(Stats[statsKey])

# Label the index of each table; the label is written out with the CSV below.
for statsKey, indexName in (
        ('annotation_strings', 'annotation_string'),
        ('annotation_characters', 'annotation_character')):
    Stats[statsKey].index.name = indexName

# Export each table to its own CSV file inside OutputDir.
for statsKey, fileName in (
        ('annotation_strings', 'exploratory_labelling_annotation_strings_hist.csv'),
        ('annotation_characters', 'exploratory_labelling_annotation_characters_hist.csv')):
    Stats[statsKey].to_csv(OutputDir + fileName)
peterf@0
|
70
|
# Write balanced sample of chunks to file, based on annotation strings
# Duration of each chunk in seconds
chunkDuration = 4
sampleRate = 48000  # presumably audio samples per second -- confirm against the recordings
nRandomSamples = 5
_sampleColumns = ['audiofile', 'chunk', 'framestart']
_sampledRows = []
for s in Annotations['annotation'].unique():
    R = Annotations[Annotations['annotation'] == s]
    # np.random.choice draws with replacement by default, so nRandomSamples rows
    # are returned (possibly repeated) even when fewer rows carry this string.
    I = np.random.choice(len(R), nRandomSamples)
    # Work on an explicit copy: columns are added and overwritten below, which
    # should not be done on a selection taken from Annotations.
    R = R.iloc[I].copy()
    R['framestart'] = R['chunk'] * sampleRate * chunkDuration
    # Both numeric columns are stored as strings, as in the original script.
    R['chunk'] = R['chunk'].apply(str)
    R['framestart'] = R['framestart'].apply(str)
    _sampledRows.append(R)
if _sampledRows:
    # DataFrame.append() was removed in pandas 2.0; concatenate once instead.
    AnnotationSample = pandas.concat(_sampledRows, ignore_index=True)
    # Declared columns first; the 'annotation' column is retained after them.
    AnnotationSample = AnnotationSample[_sampleColumns + ['annotation']]
else:
    # No annotation strings at all: write an empty table with the declared columns.
    AnnotationSample = DataFrame(columns=_sampleColumns)
AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')