annotate analysis_of_annotations/evaluate_annotations.py @ 0:75c79305d794

Scripts for obtaining and analysing annotations
author peterf
date Tue, 07 Jul 2015 14:42:09 +0100
parents
children f079d2de4aa2
rev   line source
import glob
import os
import re

from pandas import Series, DataFrame
import pandas.io.parsers
from pylab import *
import numpy as np
peterf@0 7
#Root directory of the dataset release (absolute, site-specific path)
DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'

#Read in annotations: the chunk list names one CSV file per chunk, and each
#chunk CSV is loaded as a Series that becomes one row of the Annotations table
Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_raw.csv',header=None))
Annotations = DataFrame([Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv') for chunk in Chunks])

#Columns holding the label string assigned by each annotator
Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')

#Check label integrity
Labels = set('cmfvpboSU')
for a in Annotators:
    #Only entries that are actually present are normalised and checked
    I = Annotations[a].notnull()
    #Put every label string into sorted character order. The write goes through
    #a single .loc call so that it modifies Annotations itself; assigning via
    #chained indexing may operate on a temporary copy instead.
    Annotations.loc[I, a] = Annotations.loc[I, a].apply(lambda s: ''.join(sorted(s)))
    #Every character used must belong to the permitted label alphabet.
    #An explicit exception is raised because assert is skipped under python -O.
    Unexpected = set(''.join(Annotations.loc[I, a])).difference(Labels)
    if Unexpected:
        raise ValueError('Unexpected label(s) %s in column %s' % (sorted(Unexpected), a))
peterf@0 25
#Annotator-wise label counts
CountStats = {'annotatorwise':{}, 'majorityvote':{}}
for annotator in Annotators:
    #Missing entries are dropped once per annotator instead of once per label
    Assigned = Annotations[annotator].dropna()
    CountStats['annotatorwise'][annotator] = {}
    for label in Labels:
        #Number of this annotator's label strings that contain the label
        CountStats['annotatorwise'][annotator][label] = Assigned.apply(lambda s: label in s).sum()
#Rows are labels, columns are annotators
CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
#Rearrange index for plotting histogram
CountStats['annotatorwise'] = CountStats['annotatorwise'].loc[['c','m','f','v','p','b','o','S','U']]
#Per-label standard deviation of the count across annotators, divided by its mean
CountStats['annotatorwise_coefficient_of_variation'] = (CountStats['annotatorwise'].std(axis=1) / CountStats['annotatorwise'].mean(axis=1))
#NOTE(review): relies on Series.sort() sorting in place, as it does on the
#pandas release this script was written for; newer releases need sort_values()
CountStats['annotatorwise_coefficient_of_variation'].sort()
peterf@0 39
#Histogram of label counts
#The figure is sized from a LaTeX column width, with golden-ratio proportions
fig_width_pt = 246.0                    # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0/72.27               # Convert pt to inch
golden_mean = (sqrt(5)-1.0)/2.0         # Aesthetic ratio
fig_width = fig_width_pt*inches_per_pt  # width in inches
fig_height = fig_width*golden_mean      # height in inches
fig_size = [fig_width,fig_height]
params = {'backend': 'ps',
          'axes.labelsize': 8,
          #'font.size' is the rc key for the default text size; the obsolete
          #'text.fontsize' alias is rejected by current matplotlib releases
          'font.size': 8,
          'legend.fontsize': 7.0,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': False,
          'figure.figsize': fig_size}
rcParams.update(params)
ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
width = 0.29                                       # the width of the bars
fig, ax = plt.subplots()
rects = []
colours = ('r', 'y', 'g')
#One bar series per annotator, each shifted right by one bar width
for i, annotator in enumerate(Annotators):
    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator], width, color=colours[i], align='center'))
# add text for labels, title and axes ticks
ax.set_ylabel('Count')
ax.set_xlabel('Label')
#ax.set_title('Annotator-wise label histogram')
#Ticks sit under the middle bar of each group of three
ax.set_xticks(ind+width)
ax.set_xticklabels(CountStats['annotatorwise'].index)
#The first bar of each series stands in for the series in the legend
ax.legend( [rect[0] for rect in rects], ('Annotator 1', 'Annotator 2', 'Annotator 3') )
#Tweak x-axis limit
ax.set_xlim(left=-0.5)
ax.set_ylim(top=3500)
plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
peterf@0 73 plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
def autolabel(r):
    """Write the height of every bar as small vertical text above the bar.

    r is an iterable of bar groups (such as the containers returned by
    ax.bar); each bar must provide get_x() and get_height(). The text is
    drawn on the module-level axes ax.
    """
    for bar in (member for group in r for member in group):
        count = bar.get_height()
        #The text is placed 0.180 to the right of the bar's x position and
        #100 above the bar top, in data coordinates
        ax.text(bar.get_x() + 0.180, 100 + count, '%d' % int(count),
                ha='center', va='bottom', rotation='vertical', size=6.0)
#Label the bars with their counts, then render and export the figure
autolabel(rects)
plt.draw()
#savefig does not create missing directories, so make sure the target exists
FigureDir = 'figures'
if not os.path.isdir(FigureDir):
    os.makedirs(FigureDir)
plt.savefig(os.path.join(FigureDir, 'annotation_histogram.pdf'))
peterf@0 82
peterf@0 83 #Generalised Jaccard index
def jaccardIndex(*Sets):
    """Return the generalised Jaccard index of the given collections.

    The index is the size of the intersection of all arguments divided by
    the size of their union. Each argument may be any iterable of hashable
    items; it is converted to a set first, so duplicates are ignored.

    Returns a float in [0, 1]. When the union is empty (all arguments are
    empty, or no arguments were given) the result is 1.0.
    """
    if not Sets:
        #Nothing to compare: same result as for an empty union
        return 1.0
    Members = [set(S) for S in Sets]
    Common = set.intersection(*Members)
    Combined = set.union(*Members)
    if not Combined:
        return 1.0
    #float() keeps the division exact under Python 2 integer-division rules
    return len(Common) / float(len(Combined))
peterf@0 94
#Rows shown in the agreement reports: single labels followed by merged label classes
ReportedRows = ['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']
#Pairwise annotator comparisons; the three-way comparison is 'a1_a2_a3'
AnnotatorPairs = ['a1_a2', 'a1_a3', 'a2_a3']

def _chunksWithAny(column, members):
    """Return the set of row positions whose label string contains any character of members."""
    members = frozenset(members)
    #Entries are assumed to be label strings; a missing (NaN) entry would raise
    #TypeError here -- TODO confirm the annotation columns are complete
    return set(np.where(column.apply(lambda s: not members.isdisjoint(s)))[0])

def _agreementTable(annotations):
    """Return a DataFrame of Jaccard indices: one row per label class, one column per annotator combination."""
    #Every single label forms its own class
    LabelClasses = dict((label, label) for label in Labels)
    #Experiment with combining label classes: a merged class counts as present
    #when either of its labels was assigned
    LabelClasses.update({'{o,p}': 'op', '{o,b}': 'ob', '{o,S}': 'oS'})
    Table = {}
    for name, members in LabelClasses.items():
        V1 = _chunksWithAny(annotations['annotation_a1'], members)
        V2 = _chunksWithAny(annotations['annotation_a2'], members)
        V3 = _chunksWithAny(annotations['annotation_a3'], members)
        Table[name] = {'a1_a2_a3': jaccardIndex(V1, V2, V3),
                       'a1_a2': jaccardIndex(V1, V2),
                       'a2_a3': jaccardIndex(V2, V3),
                       'a1_a3': jaccardIndex(V1, V3)}
    #Transpose so that rows are label classes, then fix the column order
    return DataFrame(Table).T[AnnotatorPairs + ['a1_a2_a3']]

def _reportAgreement(title, table):
    """Print the agreement table and its coefficients of variation; return the pairwise scores (pairs as rows)."""
    print(title)
    Reported = table.loc[ReportedRows]
    print(Reported)
    #Coefficients of variation across pairs of annotators
    PairwiseScores = Reported[AnnotatorPairs].T
    print('Coefficients of variation')
    print(PairwiseScores.std() / PairwiseScores.mean())
    return PairwiseScores

AgreementStats = {'jaccardindex_pos': _agreementTable(Annotations)}
A = _reportAgreement('Agreement about label presence (unfiltered dataset)', AgreementStats['jaccardindex_pos'])

#Read in annotations for refined dataset
Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_refined.csv',header=None))
Annotations = DataFrame([Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv') for chunk in Chunks])

AgreementStats = {'jaccardindex_pos': _agreementTable(Annotations)}
A = _reportAgreement('Agreement about label presence (refined dataset)', AgreementStats['jaccardindex_pos'])
peterf@0 198
#Label count stats for majority vote
#Entry [x][y] counts the rows whose majority-vote label string contains both
#x and y; the diagonal therefore holds the plain count of each label
MajorityLabels = ['c','m','f','v','p','b','o','S']
MajorityVote = Annotations['majorityvote']
CountStats['majorityvote'] = {}
for label in MajorityLabels:
    CountStats['majorityvote'][label] = {}
    for comparisonLabel in MajorityLabels:
        V1 = MajorityVote.apply(lambda s: label in s and comparisonLabel in s)
        CountStats['majorityvote'][label][comparisonLabel] = V1.sum()
CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
print('Label co-occurrences')
#Printed explicitly: a bare expression displays nothing when run as a script
print(CountStats['majorityvote'].loc[MajorityLabels, MajorityLabels])