peterf@1
|
1 #!/usr/bin/python
|
peterf@1
|
2 #
|
peterf@1
|
3 # evaluate_annotations.py:
|
peterf@1
|
4 # Compute descriptive statistics of annotations, including annotator
|
peterf@1
|
5 # agreement
|
peterf@1
|
6 #
|
peterf@1
|
7 # Author: Peter Foster
|
peterf@1
|
8 # (c) 2015 Peter Foster
|
peterf@1
|
9 #
|
peterf@1
|
10
|
peterf@0
|
11 from pandas import Series, DataFrame
|
peterf@0
|
12 import glob
|
peterf@0
|
13 import pandas.io.parsers
|
peterf@0
|
14 from pylab import *
|
peterf@0
|
15 import numpy as np
|
peterf@0
|
16 import re
|
peterf@0
|
17
|
peterf@0
|
DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'


def _read_series(path):
    """Read a headerless CSV file as a Series indexed by its first column.

    Stand-in for ``Series.from_csv(path, header=None)``, which was removed in
    pandas 1.0. As there, the first column becomes the index, the second
    column the values, and both the Series and its index are left unnamed.
    (``from_csv`` additionally tried to parse the index as dates; that step
    is omitted here.)
    """
    series = pandas.read_csv(path, header=None, index_col=0).iloc[:, 0]
    #Unnamed series make DataFrame(list_of_series) number its rows 0..n-1
    series.name = None
    series.index.name = None
    return series


#Read in annotations
#The values of the list file name the chunks; each chunk has its own CSV file
#under chunks/ holding that chunk's record.
Chunks = list(_read_series(DatasetPath + 'release_chunks_raw.csv'))
#One row per chunk, one column per field found in the chunk records
Annotations = DataFrame(
    [_read_series(DatasetPath + 'chunks/' + chunk + '.csv') for chunk in Chunks])

#Columns of Annotations holding the label strings of the three annotators
Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')
|
peterf@0
|
28
|
peterf@0
|
#Check label integrity
#Every character of an annotation string must be one of these label codes
Labels = set('cmfvpboSU')
for annotator in Annotators:
    #Rows for which this annotator supplied an annotation
    given = Annotations[annotator].notnull()
    #Put the characters of each annotation into sorted order, so that the
    #same set of labels is always spelled the same way. A single .loc
    #assignment is used so the result is written into Annotations itself
    #rather than into a temporary copy.
    Annotations.loc[given, annotator] = (
        Annotations.loc[given, annotator].apply(lambda s: ''.join(sorted(s))))
    #Explicit check instead of assert, which is skipped under "python -O"
    unknown = set(''.join(Annotations.loc[given, annotator])).difference(Labels)
    if unknown:
        raise ValueError('Unexpected label(s) %s in column %s'
                         % (sorted(unknown), annotator))
|
peterf@0
|
35
|
peterf@0
|
#Annotator-wise label counts
#CountStats['annotatorwise'] becomes a table with one row per label and one
#column per annotator, holding the number of that annotator's annotations
#which contain the label.
CountStats = {'annotatorwise':{}, 'majorityvote':{}}
for annotator in Annotators:
    #Annotations actually supplied by this annotator. The selection does not
    #depend on the label, so it is made once per annotator.
    given = Annotations[annotator][Annotations[annotator].notnull()]
    counts = {}
    for label in Labels:
        counts[label] = int(given.apply(lambda s: label in s).sum())
    CountStats['annotatorwise'][annotator] = counts
CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
#Rearrange index for plotting histogram
CountStats['annotatorwise'] = CountStats['annotatorwise'].loc[['c','m','f','v','p','b','o','S','U']]
#Spread of each label's count across annotators (sample standard deviation
#divided by mean), stored in ascending order
CountStats['annotatorwise_coefficient_of_variation'] = (
    CountStats['annotatorwise'].std(axis=1)
    / CountStats['annotatorwise'].mean(axis=1)).sort_values()
|
peterf@0
|
49
|
peterf@0
|
#Histogram of label counts
#Grouped bar chart: one group per label, one bar per annotator
fig_width_pt = 246.0                     # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0/72.27                # Convert pt to inch
golden_mean = (sqrt(5)-1.0)/2.0          # Aesthetic ratio
fig_width = fig_width_pt*inches_per_pt   # width in inches
fig_height = fig_width*golden_mean       # height in inches
fig_size = [fig_width,fig_height]
#'font.size' replaces the former 'text.fontsize' key, which current
#matplotlib rejects as an invalid rc parameter
params = {'backend': 'ps',
          'axes.labelsize': 8,
          'font.size': 8,
          'legend.fontsize': 7.0,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': False,
          'figure.figsize': fig_size}
rcParams.update(params)
ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
width = 0.29                                       # the width of the bars
fig, ax = plt.subplots()
rects = []
colours = ('r', 'y', 'g')
for i, annotator in enumerate(Annotators):
    #Shift each annotator's bars by one bar width so they sit side by side
    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator],
                        width, color=colours[i], align='center'))
# add text for labels, title and axes ticks
ax.set_ylabel('Count')
ax.set_xlabel('Label')
#ax.set_title('Annotator-wise label histogram')
#Ticks go under the second of the three bars, i.e. the middle of each group
ax.set_xticks(ind+width)
ax.set_xticklabels(CountStats['annotatorwise'].index)
#The first bar of each annotator serves as that annotator's legend handle
ax.legend([rect[0] for rect in rects], ('Annotator 1', 'Annotator 2', 'Annotator 3'))
#Tweak x-axis limit
ax.set_xlim(left=-0.5)
ax.set_ylim(top=3500)
plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
|
peterf@0
|
def autolabel(r):
    """Write the height of every bar above it, as small vertical text on ax.

    r -- iterable of bar groups; each group is an iterable of objects that
         provide get_x() and get_height().
    """
    for group in r:
        for bar in group:
            bar_height = bar.get_height()
            caption = '%d' % int(bar_height)
            #Fixed offsets: 0.180 to the right of the bar's x position and
            #100 count units above its top
            ax.text(bar.get_x() + 0.180, 100 + bar_height, caption,
                    ha='center', va='bottom', rotation='vertical', size=6.0)


#Annotate the bars, then render the figure and write it to disk
autolabel(rects)
plt.draw()
plt.savefig('figures/annotation_histogram.pdf')
|
peterf@0
|
92
|
peterf@0
|
93 #Generalised Jaccard index
|
peterf@0
|
def jaccardIndex(*Sets):
    """Return the generalised Jaccard index of one or more collections.

    The index is the number of elements common to all arguments divided by
    the number of elements occurring in any of them.

    Sets -- one or more sets; any iterable of hashable items is accepted.

    Returns a float between 0.0 and 1.0. When the union is empty (all
    arguments are empty) the result is defined as 1.0.

    Raises IndexError when called without arguments.
    """
    members = [set(S) for S in Sets]
    SetN = members[0].intersection(*members[1:])
    SetD = members[0].union(*members[1:])
    if not SetD:
        #Nothing to disagree about; 1.0 keeps the return type a float
        return 1.0
    return len(SetN) / float(len(SetD))
|
peterf@0
|
104
|
peterf@0
|
def _load_annotations(chunk_list_name):
    """Load the annotation records of the chunks named in a chunk list file.

    chunk_list_name -- file name, relative to DatasetPath, of a headerless
                       CSV file whose second column names the chunks.

    Returns (chunks, annotations): the list of chunk names, and a DataFrame
    with one row per chunk built from the files chunks/<name>.csv.
    """
    def read_series(path):
        #Same result as the removed Series.from_csv(path, header=None),
        #except that the index is not parsed for dates
        series = pandas.read_csv(path, header=None, index_col=0).iloc[:, 0]
        series.name = None
        series.index.name = None
        return series

    chunks = list(read_series(DatasetPath + chunk_list_name))
    records = [read_series(DatasetPath + 'chunks/' + chunk + '.csv')
               for chunk in chunks]
    return chunks, DataFrame(records)


def _agreement_table(annotations):
    """Tabulate Jaccard agreement between annotators about label presence.

    annotations -- DataFrame with one row per chunk and the columns
                   'annotation_a1', 'annotation_a2' and 'annotation_a3'.
                   Every value must be a string of label codes; a missing
                   (NaN) value raises TypeError.

    Returns a DataFrame with one row per label in Labels, plus the merged
    classes '{o,p}', '{o,b}' and '{o,S}', and the columns 'a1_a2', 'a1_a3',
    'a2_a3' and 'a1_a2_a3'.
    """
    #Row name -> label codes; a row counts as present in an annotation when
    #the annotation contains at least one of its codes
    label_groups = dict((label, label) for label in Labels)
    #Experiment with combining label classes
    label_groups.update({'{o,p}': 'op', '{o,b}': 'ob', '{o,S}': 'oS'})

    stats = {}
    for name, codes in label_groups.items():
        #Row positions of the chunks each annotator marked with this label
        V1, V2, V3 = [
            set(np.where(annotations[column].apply(
                lambda s: not set(codes).isdisjoint(s)))[0])
            for column in ('annotation_a1', 'annotation_a2', 'annotation_a3')]
        stats[name] = {'a1_a2_a3': jaccardIndex(V1, V2, V3),
                       'a1_a2': jaccardIndex(V1, V2),
                       'a2_a3': jaccardIndex(V2, V3),
                       'a1_a3': jaccardIndex(V1, V3)}
    return DataFrame(stats).T[['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]


def _print_agreement(title, table):
    """Print an agreement table and its pairwise coefficients of variation.

    title -- heading printed before the table
    table -- DataFrame as returned by _agreement_table
    """
    shown = table.loc[['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']]
    print(title)
    print(shown)
    #Coefficients of variation across pairs of annotators
    pairwise = shown[['a1_a2', 'a1_a3', 'a2_a3']]
    print('Coefficients of variation')
    print(pairwise.std(axis=1) / pairwise.mean(axis=1))


AgreementStats = {'jaccardindex_pos': _agreement_table(Annotations)}
_print_agreement('Agreement about label presence (unfiltered dataset)',
                 AgreementStats['jaccardindex_pos'])

#Read in annotations for refined dataset
Chunks, Annotations = _load_annotations('release_chunks_refined.csv')

AgreementStats = {'jaccardindex_pos': _agreement_table(Annotations)}
_print_agreement('Agreement about label presence (refined dataset)',
                 AgreementStats['jaccardindex_pos'])
|
peterf@0
|
208
|
peterf@0
|
#Label count stats for majority vote
#Entry (x, y) of the table is the number of rows of Annotations whose
#'majorityvote' string contains both x and y, so the diagonal holds the
#plain count of each label. The label 'U' is not included.
VotedLabels = ['c','m','f','v','p','b','o','S']
MajorityVote = Annotations['majorityvote']
CountStats['majorityvote'] = {}
for label in VotedLabels:
    CountStats['majorityvote'][label] = {}
    for comparisonLabel in VotedLabels:
        both = MajorityVote.apply(lambda s: label in s and comparisonLabel in s)
        CountStats['majorityvote'][label][comparisonLabel] = int(both.sum())
CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
print('Label co-occurrences')
#A bare expression prints nothing when run as a script, so print explicitly
print(CountStats['majorityvote'].loc[VotedLabels, VotedLabels])
|