chime-home-dataset-annotation-and-baseline-evaluation-code: analysis_of_annotations/evaluate

annotate analysis_of_annotations/evaluate_annotations.py @ 5:b523456082ca tip

Update path to dataset and reflect modified chunk naming convention.

author	peterf
date	Mon, 01 Feb 2016 21:35:27 +0000
parents	f079d2de4aa2
children

rev	line source
peterf@1	1 #!/usr/bin/python
peterf@1	2 #
peterf@1	3 # evaluate_annotations.py:
peterf@1	4 # Compute descriptive statistics of annotations, including annotator
peterf@1	5 # agreement
peterf@1	6 #
peterf@1	7 # Author: Peter Foster
peterf@1	8 # (c) 2015 Peter Foster
peterf@1	9 #
peterf@1	10
peterf@0	11 from pandas import Series, DataFrame
peterf@0	12 import glob
peterf@0	13 import pandas.io.parsers
peterf@0	14 from pylab import *
peterf@0	15 import numpy as np
peterf@0	16 import re
peterf@0	17
#Root directory of the dataset release
DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'

#Read in annotations: the release list names the chunks, and each chunk has
#its own CSV file which becomes one row of the Annotations table
Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_raw.csv',header=None))
Annotations = DataFrame([Series.from_csv(DatasetPath + 'chunks/' + name + '.csv')
                         for name in Chunks])
peterf@0	26
#Columns of the Annotations table holding each annotator's label string
Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')

#Check label integrity
Labels = set('cmfvpboSU')
for a in Annotators:
    #Rows for which this annotator supplied a label string
    I = Annotations[a].notnull()
    #Canonicalise each label string by sorting its characters. The assignment
    #goes through .loc on the frame itself; writing via Annotations[a].ix[I]
    #is chained indexing and may silently modify a temporary copy.
    Annotations.loc[I, a] = Annotations.loc[I, a].apply(lambda s: ''.join(sorted(s)))
    #Flag label strings containing characters outside the permitted set
    Unknown = Annotations.loc[I, a].apply(lambda s: len(set(s).difference(Labels)) > 0)
    #Raise explicitly instead of using assert, which is skipped under -O
    if Unknown.any():
        raise ValueError('Unexpected label character(s) in column %s' % a)
peterf@0	35
#Annotator-wise label counts
CountStats = {'annotatorwise':{}, 'majorityvote':{}}
for annotator in Annotators:
    #Label strings this annotator actually supplied; computed once per
    #annotator rather than once per label
    Given = Annotations[annotator][Annotations[annotator].notnull()]
    CountStats['annotatorwise'][annotator] = {}
    for label in Labels:
        #Number of label strings containing this label
        CountStats['annotatorwise'][annotator][label] = Given.apply(lambda s: label in s).sum()
CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
#Rearrange index for plotting histogram
CountStats['annotatorwise'] = CountStats['annotatorwise'].loc[['c','m','f','v','p','b','o','S','U']]
#Per-label coefficient of variation (std / mean) across annotators
CV = CountStats['annotatorwise'].T.std() / CountStats['annotatorwise'].T.mean()
#Store in ascending order. Reordering by position replaces the in-place
#Series.sort(), which newer pandas releases no longer provide.
CountStats['annotatorwise_coefficient_of_variation'] = CV.iloc[np.argsort(CV.values)]
peterf@0	49
#Histogram of label counts
fig_width_pt = 246.0  # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0/72.27  # Convert pt to inch
golden_mean = (sqrt(5)-1.0)/2.0  # Aesthetic ratio
fig_width = fig_width_pt*inches_per_pt  # width in inches
fig_height = fig_width*golden_mean  # height in inches
fig_size = [fig_width,fig_height]
#'font.size' is the current name of the setting formerly spelled
#'text.fontsize'; the old alias is rejected by newer matplotlib releases
params = {'backend': 'ps',
          'axes.labelsize': 8,
          'font.size': 8,
          'legend.fontsize': 7.0,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': False,
          'figure.figsize': fig_size}
rcParams.update(params)
ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
width = 0.29  # the width of the bars
fig, ax = plt.subplots()
rects = []
colours = ('r', 'y', 'g')
#One bar series per annotator, each shifted right by one bar width
for i, annotator in enumerate(Annotators):
    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator], width, color=colours[i], align='center'))
# add text for labels, title and axes ticks
ax.set_ylabel('Count')
ax.set_xlabel('Label')
#ax.set_title('Annotator-wise label histogram')
#Ticks sit under the middle (second) bar of each group
ax.set_xticks(ind+width)
ax.set_xticklabels(CountStats['annotatorwise'].index)
#Pass the legend handles as a list rather than a one-shot generator
ax.legend([rect[0] for rect in rects], ('Annotator 1', 'Annotator 2', 'Annotator 3'))
#Tweak x-axis limit
ax.set_xlim(left=-0.5)
ax.set_ylim(top=3500)
plt.gcf().subplots_adjust(left=0.15)  #Prevent y-axis label from being chopped off
def autolabel(r):
    """Write the integer height of every bar as vertical text above it.

    r -- iterable of bar groups; each group is an iterable of rectangles
         offering get_height() and get_x().

    The text is drawn on the module-level axes ``ax``, at a fixed offset of
    0.180 in x from the bar's get_x() position and 100 above its height.
    """
    for bar in (b for group in r for b in group):
        top = bar.get_height()
        x_pos = bar.get_x()+0.180
        ax.text(x_pos, 100+top, '%d'%int(top),
                ha='center', va='bottom', rotation='vertical', size=6.0)
#Annotate the bars with their counts and render the figure
autolabel(rects)
plt.draw()
#savefig does not create missing directories, so make sure the output
#directory exists before writing the figure
import os
if not os.path.isdir('figures'):
    os.makedirs('figures')
plt.savefig('figures/annotation_histogram.pdf')
peterf@0	92
peterf@0	93 #Generalised Jaccard index
def jaccardIndex(*Sets):
    """Return the generalised Jaccard index of one or more collections.

    The index is the size of the intersection of all arguments divided by
    the size of their union.

    Sets -- one or more iterables of hashable items (sets, lists, tuples,
            ...). The arguments are never modified.

    Returns a float in [0, 1]. If the union is empty (all arguments are
    empty) the result is defined as 1.0. Calling with no arguments raises
    IndexError.
    """
    #Work on copies so the caller's collections are left untouched
    common = set(Sets[0])
    combined = set(Sets[0])
    for S in Sets[1:]:
        common.intersection_update(S)
        combined.update(S)
    if not combined:
        #Avoid 0/0: empty collections are treated as being in full agreement
        return 1.0
    return len(common) / float(len(combined))
peterf@0	104
#Rows shown in the agreement reports, in output order
ReportedLabels = ['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']
#Experiment with combining label classes: row name -> labels merged into it
CombinedLabels = {'{o,p}': 'op', '{o,b}': 'ob', '{o,S}': 'oS'}
#Annotator subsets that are compared, in output column order
AnnotatorSubsets = [('a1_a2', ('a1', 'a2')),
                    ('a1_a3', ('a1', 'a3')),
                    ('a2_a3', ('a2', 'a3')),
                    ('a1_a2_a3', ('a1', 'a2', 'a3'))]


def _labelPresenceAgreement(annotations):
    """Return a DataFrame of Jaccard indices describing label presence.

    annotations -- DataFrame with label-string columns 'annotation_a1',
                   'annotation_a2' and 'annotation_a3'.

    The result has one row per label in Labels plus one row per entry of
    CombinedLabels, and one column per entry of AnnotatorSubsets. For each
    annotator, the set of row positions whose label string contains at
    least one of the row's labels is collected; each cell is jaccardIndex
    applied to the sets of the annotators in that column's subset.
    """
    groups = dict((label, label) for label in Labels)
    groups.update(CombinedLabels)
    table = {}
    for name, members in groups.items():
        members = frozenset(members)
        positive = {}
        for a in ('a1', 'a2', 'a3'):
            #True where the label string shares a character with the group
            present = annotations['annotation_' + a].apply(
                lambda s: not members.isdisjoint(s))
            positive[a] = set(np.where(present)[0])
        table[name] = dict(
            (key, jaccardIndex(*[positive[a] for a in subset]))
            for key, subset in AnnotatorSubsets)
    table = DataFrame(table).T
    return table[[key for key, _ in AnnotatorSubsets]]


def _printAgreement(title, table):
    """Print the agreement table and its coefficients of variation.

    title -- heading printed before the table.
    table -- DataFrame as returned by _labelPresenceAgreement.

    Returns the pairwise-agreement values (rows: annotator pairs, columns:
    ReportedLabels) from which the coefficients of variation are computed.
    """
    print(title)
    print(table.loc[ReportedLabels])
    #Coefficients of variation across pairs of annotators
    pairwise = table.loc[ReportedLabels][['a1_a2', 'a1_a3', 'a2_a3']].T
    print('Coefficients of variation')
    print(pairwise.std() / pairwise.mean())
    return pairwise


#Agreement on the annotations currently loaded (unfiltered dataset)
AgreementStats = {'jaccardindex_pos': _labelPresenceAgreement(Annotations)}
A = _printAgreement('Agreement about label presence (unfiltered dataset)',
                    AgreementStats['jaccardindex_pos'])

#Read in annotations for refined dataset
Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_refined.csv',header=None))
Annotations = []
for chunk in Chunks:
    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
Annotations = DataFrame(Annotations)

#Same agreement analysis, now on the refined dataset
AgreementStats = {'jaccardindex_pos': _labelPresenceAgreement(Annotations)}
A = _printAgreement('Agreement about label presence (refined dataset)',
                    AgreementStats['jaccardindex_pos'])
peterf@0	208
#Label count stats for majority vote
#Labels included in the co-occurrence table, in output order
CooccurrenceLabels = ['c','m','f','v','p','b','o','S']
#Majority-vote label strings; looked up once instead of per label pair
MajorityVote = Annotations['majorityvote']
CountStats['majorityvote'] = {}
for label in CooccurrenceLabels:
    CountStats['majorityvote'][label] = {}
    for comparisonLabel in CooccurrenceLabels:
        #Number of label strings containing both labels; on the diagonal
        #this is simply the count of the label itself
        Both = MajorityVote.apply(lambda s: label in s and comparisonLabel in s)
        CountStats['majorityvote'][label][comparisonLabel] = Both.sum()
CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
print('Label co-occurrences')
#Print the table explicitly: a bare expression is only echoed in an
#interactive session and produces no output when run as a script
print(CountStats['majorityvote'].loc[CooccurrenceLabels, CooccurrenceLabels])

Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code

annotate analysis_of_annotations/evaluate_annotations.py @ 5:b523456082ca tip