Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code

diff analysis_of_annotations/evaluate_annotations.py @ 0:75c79305d794
Scripts for obtaining and analysing annotations
author: peterf
date: Tue, 07 Jul 2015 14:42:09 +0100
children: f079d2de4aa2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/analysis_of_annotations/evaluate_annotations.py	Tue Jul 07 14:42:09 2015 +0100
@@ -0,0 +1,209 @@
+from pandas import Series, DataFrame
+import glob
+import pandas.io.parsers
+from pylab import *
+import numpy as np
+import re
+
+#Root directory of the dataset; every file below is read relative to it
+DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'
+
+#Read in annotations: the values of release_chunks_raw.csv are the chunk
+#names, and each chunk's own CSV under chunks/ becomes one DataFrame row
+Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_raw.csv', header=None))
+Annotations = DataFrame([Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv')
+                         for chunk in Chunks])
+
+#Columns holding the label strings given by each of the three annotators
+Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')
+
+#Check label integrity
+#Every non-null annotation string is canonicalised (its label characters
+#sorted) and must only use characters from the known label alphabet.
+Labels = set('cmfvpboSU')
+for a in Annotators:
+    I = Annotations[a].notnull()
+    #Write through one .loc call on the frame itself: Annotations[a].ix[I] = ...
+    #is chained indexing and may end up modifying a temporary copy
+    Annotations.loc[I, a] = Annotations.loc[I, a].apply(lambda s: ''.join(sorted(s)))
+    valid = Annotations.loc[I, a].apply(lambda s: set(s).issubset(Labels))
+    #Raise explicitly instead of assert so the check survives python -O
+    if not valid.all():
+        raise ValueError('Unexpected label character in column %s' % a)
+
+#Annotator-wise label counts: for every annotator, the number of non-null
+#annotation strings that contain each label character
+CountStats = {'annotatorwise': {}, 'majorityvote': {}}
+for annotator in Annotators:
+    column = Annotations[annotator]
+    labelled = column[column.notnull()]
+    perLabel = {}
+    for label in Labels:
+        perLabel[label] = sum(labelled.apply(lambda s: label in s))
+    CountStats['annotatorwise'][annotator] = perLabel
+annotatorwise = DataFrame(CountStats['annotatorwise'])
+#Rearrange index for plotting histogram
+annotatorwise = annotatorwise.ix[['c','m','f','v','p','b','o','S','U']]
+CountStats['annotatorwise'] = annotatorwise
+#Per label: standard deviation over the annotators divided by their mean,
+#then sorted in place
+variation = annotatorwise.T.std() / annotatorwise.T.mean()
+CountStats['annotatorwise_coefficient_of_variation'] = variation
+CountStats['annotatorwise_coefficient_of_variation'].sort()
+
+#Histogram of label counts: one group of bars per label, one bar per annotator
+#Figure geometry: one LaTeX column wide, golden-ratio aspect
+fig_width_pt = 246.0 # Get this from LaTeX using \showthe\columnwidth
+inches_per_pt = 1.0/72.27 # Convert pt to inch
+golden_mean = (sqrt(5)-1.0)/2.0 # Aesthetic ratio
+fig_width = fig_width_pt*inches_per_pt # width in inches
+fig_height = fig_width*golden_mean # height in inches
+fig_size =  [fig_width,fig_height]
+#'font.size' is the current name of the setting formerly spelled
+#'text.fontsize', which newer matplotlib releases no longer accept
+params = {'backend': 'ps',
+            'axes.labelsize': 8,
+            'font.size': 8,
+            'legend.fontsize': 7.0,
+            'xtick.labelsize': 8,
+            'ytick.labelsize': 8,
+            'text.usetex': False,
+            'figure.figsize': fig_size}
+rcParams.update(params)
+ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
+width = 0.29       # the width of the bars
+fig, ax = plt.subplots()
+rects = []
+colours = ('r', 'y', 'g')
+#Bars of annotator i are shifted right by i bar widths within each group
+for i, annotator in enumerate(Annotators):
+    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator], width, color=colours[i], align='center'))
+# add text for labels, title and axes ticks
+ax.set_ylabel('Count')
+ax.set_xlabel('Label')
+#ax.set_title('Annotator-wise label histogram')
+#Ticks sit under the middle (second) bar of each group
+ax.set_xticks(ind+width)
+ax.set_xticklabels(CountStats['annotatorwise'].index)
+#Legend handles: the first bar of each annotator, passed as a real list
+#rather than a one-shot generator
+ax.legend( [rect[0] for rect in rects], ('Annotator 1', 'Annotator 2', 'Annotator 3') )
+#Tweak x-axis limit
+ax.set_xlim(left=-0.5)
+ax.set_ylim(top=3500)
+plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
+def autolabel(r):
+    """Write the integer height of every bar as vertical text above it.
+
+    r -- iterable of bar groups, each itself an iterable of bar patches.
+    The text is drawn on the module-level axes `ax`, 100 y-units above
+    the top of the bar.
+    """
+    for group in r:
+        for bar in group:
+            barHeight = bar.get_height()
+            ax.text(bar.get_x() + 0.180, 100 + barHeight, '%d' % int(barHeight),
+                    ha='center', va='bottom', rotation='vertical', size=6.0)
+#Label every bar with its count, then render the figure and write it to disk
+autolabel(rects)
+plt.draw()
+plt.savefig('figures/annotation_histogram.pdf')
+
+#Generalised Jaccard index
+def jaccardIndex(*Sets):
+    """Size of the intersection of all given sets divided by the size of their union.
+
+    At least one set must be passed. Returns 1 when the union is empty
+    (every set is empty), otherwise a float between 0 and 1.
+    """
+    first, others = Sets[0], Sets[1:]
+    common = first.intersection(*others)
+    combined = first.union(*others)
+    if not combined:
+        return 1
+    return len(common) / float(len(combined))
+
+#Agreement about label presence. The same computation and report are run on
+#the raw release and then on the refined release, so both live in helpers.
+
+#Rows shown in the reports: the single labels, then the merged classes
+AgreementRows = ['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']
+
+def _presenceAgreement(annotations):
+    """Jaccard agreement about label presence between the three annotators.
+
+    For every label in Labels, and for the merged classes {o,p}, {o,b} and
+    {o,S}, collects per annotator the row positions whose annotation string
+    contains the label (any member label for a merged class) and compares
+    those position sets with jaccardIndex.
+
+    annotations -- DataFrame with columns annotation_a1, annotation_a2 and
+                   annotation_a3 holding label strings.
+    Returns a DataFrame with one row per label / merged class and the
+    columns a1_a2, a1_a3, a2_a3 and a1_a2_a3.
+    """
+    #(row name, label characters that count as a hit for that row)
+    classes = [(label, label) for label in Labels]
+    #Experiment with combining label classes
+    classes += [('{o,p}', 'op'), ('{o,b}', 'ob'), ('{o,S}', 'oS')]
+    stats = {}
+    for name, members in classes:
+        memberSet = set(members)
+        #NOTE(review): unlike the label-count code, nulls are not filtered
+        #here, so a null annotation raises TypeError - confirm the three
+        #annotation columns are complete in both releases
+        hit = lambda s: not memberSet.isdisjoint(s)
+        V1 = set(np.where(annotations['annotation_a1'].apply(hit))[0])
+        V2 = set(np.where(annotations['annotation_a2'].apply(hit))[0])
+        V3 = set(np.where(annotations['annotation_a3'].apply(hit))[0])
+        stats[name] = {'a1_a2_a3': jaccardIndex(V1, V2, V3),
+                       'a1_a2': jaccardIndex(V1, V2),
+                       'a2_a3': jaccardIndex(V2, V3),
+                       'a1_a3': jaccardIndex(V1, V3)}
+    return DataFrame(stats).T[['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]
+
+def _reportAgreement(title, table):
+    """Print the agreement table under `title`, then its coefficients of variation.
+
+    Returns the transposed pairwise sub-table (rows a1_a2, a1_a3, a2_a3)
+    from which the coefficients are computed.
+    """
+    print(title)
+    print(table.loc[AgreementRows])
+    #Coefficients of variation across pairs of annotators
+    pairwise = table.loc[AgreementRows][['a1_a2', 'a1_a3', 'a2_a3']].T
+    print('Coefficients of variation')
+    print(pairwise.std() / pairwise.mean())
+    return pairwise
+
+AgreementStats = {'jaccardindex_pos': _presenceAgreement(Annotations)}
+A = _reportAgreement('Agreement about label presence (unfiltered dataset)',
+                     AgreementStats['jaccardindex_pos'])
+
+#Read in annotations for refined dataset
+Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_refined.csv',header=None))
+Annotations = []
+for chunk in Chunks: 
+    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
+Annotations = DataFrame(Annotations)
+
+AgreementStats = {'jaccardindex_pos': _presenceAgreement(Annotations)}
+A = _reportAgreement('Agreement about label presence (refined dataset)',
+                     AgreementStats['jaccardindex_pos'])
+
+#Label count stats for majority vote
+#Co-occurrence table: entry (a, b) is the number of rows whose 'majorityvote'
+#string contains both label a and label b; the diagonal holds plain counts
+majorityLabels = ['c','m','f','v','p','b','o','S']
+majorityVote = Annotations['majorityvote']
+CountStats['majorityvote'] = {}
+for label in majorityLabels:
+    CountStats['majorityvote'][label] = {}
+    for comparisonLabel in majorityLabels:
+        V1 = majorityVote.apply(lambda s: label in s and comparisonLabel in s)
+        CountStats['majorityvote'][label][comparisonLabel] = sum(V1)
+CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
+print('Label co-occurrences')
+#Print the table: as a bare expression it was evaluated and then discarded
+print(CountStats['majorityvote'].loc[majorityLabels, majorityLabels])
author	peterf
date	Tue, 07 Jul 2015 14:42:09 +0100
parents
children	f079d2de4aa2