Mercurial > hg > chime-home-dataset-annotation-and-baseline-evaluation-code
diff analysis_of_annotations/evaluate_annotations.py @ 0:75c79305d794
Scripts for obtaining and analysing annotations
author | peterf |
---|---|
date | Tue, 07 Jul 2015 14:42:09 +0100 |
parents | |
children | f079d2de4aa2 |
line wrap: on
line diff
"""Evaluate the CHiME-Home chunk annotations.

Source file: analysis_of_annotations/evaluate_annotations.py

The script runs top to bottom and performs, in order:

1. Load the per-chunk annotation files of the raw release and check that
   every annotator label string only uses known label characters.
2. Count labels per annotator and save a grouped bar chart of the counts to
   ``figures/annotation_histogram.pdf``.
3. Print the inter-annotator agreement (generalised Jaccard index) about
   label presence, first for the raw chunk list, then for the refined one.
4. Print label co-occurrence counts of the majority vote (refined list).

Module-level names (``Annotations``, ``CountStats``, ``AgreementStats``,
``A``, ...) are kept so the results can still be inspected interactively.
"""
from pandas import Series, DataFrame
import glob
import pandas.io.parsers
# NOTE: the star import below replaces the built-ins ``sum``/``all``/``any``
# with their NumPy namesakes, which misbehave on generators.  The code below
# therefore uses explicit Series methods instead of those bare names.
from pylab import *
import numpy as np
import re

DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'

Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')
Labels = set('cmfvpboSU')

# Fixed row orders used for plotting / printing.
_PLOT_ORDER = ['c', 'm', 'f', 'v', 'p', 'b', 'o', 'S', 'U']
_COOCCURRENCE_ORDER = ['c', 'm', 'f', 'v', 'p', 'b', 'o', 'S']
# Experimental merged label classes: (row name, member labels).
_MERGED_CLASSES = (('{o,p}', 'op'), ('{o,b}', 'ob'), ('{o,S}', 'oS'))
_AGREEMENT_ROWS = _COOCCURRENCE_ORDER + [name for name, _ in _MERGED_CLASSES]
_PAIR_COLUMNS = ['a1_a2', 'a1_a3', 'a2_a3']


def _read_annotations(chunk_list_csv):
    """Load all chunk annotation files named in ``chunk_list_csv``.

    Returns ``(chunks, annotations)`` where ``chunks`` is the list of chunk
    names and ``annotations`` is a DataFrame with one row per chunk.
    """
    # NOTE(review): Series.from_csv (and Series.sort further down) are legacy
    # pandas APIs that later pandas releases removed -- confirm the target
    # pandas version before upgrading the environment.
    chunks = list(Series.from_csv(DatasetPath + chunk_list_csv, header=None))
    rows = [Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv')
            for chunk in chunks]
    return chunks, DataFrame(rows)


#Generalised Jaccard index
def jaccardIndex(*Sets):
    """Return |intersection| / |union| of the given sets.

    Two or more sets may be passed.  When the union is empty the sets agree
    trivially and 1 is returned.  Calling with no sets raises IndexError.
    """
    first = Sets[0]
    common = first.intersection(*Sets[1:])
    combined = first.union(*Sets[1:])
    if len(combined) == 0:
        return 1
    return len(common) / float(len(combined))


def _label_positions(column, labels):
    """Return the set of row positions whose label string has any of ``labels``.

    ``labels`` is an iterable of single-character labels.  Like the plain
    ``label in s`` test it replaces, this raises TypeError on a missing
    (NaN) annotation.
    """
    wanted = set(labels)
    hits = column.apply(lambda s: not wanted.isdisjoint(s))
    return set(np.where(hits)[0])


def _agreement_table(annotations):
    """Build the Jaccard agreement table (rows: labels, columns: annotator groups)."""
    classes = [(label, label) for label in Labels] + list(_MERGED_CLASSES)
    table = {}
    for name, members in classes:
        v1, v2, v3 = [_label_positions(annotations[annotator], members)
                      for annotator in Annotators]
        table[name] = {
            'a1_a2_a3': jaccardIndex(v1, v2, v3),
            'a1_a2': jaccardIndex(v1, v2),
            'a2_a3': jaccardIndex(v2, v3),
            'a1_a3': jaccardIndex(v1, v3),
        }
    return DataFrame(table).T[_PAIR_COLUMNS + ['a1_a2_a3']]


def _report_agreement(title, annotations):
    """Print the agreement table and its pairwise coefficients of variation.

    Returns ``(table, pairwise)``; ``pairwise`` is the transposed
    annotator-pair part of the table used for the coefficients.
    """
    table = _agreement_table(annotations)
    print(title)
    print(table.loc[_AGREEMENT_ROWS])
    #Coefficients of variation across pairs of annotators
    pairwise = table.loc[_AGREEMENT_ROWS, _PAIR_COLUMNS].T
    print('Coefficients of variation')
    print(pairwise.std() / pairwise.mean())
    return table, pairwise


#Read in annotations
Chunks, Annotations = _read_annotations('release_chunks_raw.csv')

#Check label integrity
for annotator in Annotators:
    given = Annotations[annotator].notnull()
    # Normalise label order.  A single .loc assignment is used because
    # chained assignment may write to a temporary copy instead of the frame.
    Annotations.loc[given, annotator] = Annotations.loc[given, annotator].apply(
        lambda s: ''.join(sorted(s)))
    unknown = Annotations.loc[given, annotator].apply(
        lambda s: len(set(s).difference(Labels)) != 0)
    if unknown.any():
        # Explicit raise: an assert would be stripped under ``python -O``.
        raise ValueError('Unknown label character(s) in column %s' % annotator)

#Annotator-wise label counts
CountStats = {'annotatorwise': {}, 'majorityvote': {}}
for annotator in Annotators:
    # The not-null filter does not depend on the label, so do it once.
    assigned = Annotations[annotator][Annotations[annotator].notnull()]
    CountStats['annotatorwise'][annotator] = {}
    for label in Labels:
        has_label = assigned.apply(lambda s: label in s)
        CountStats['annotatorwise'][annotator][label] = has_label.sum()
CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
#Rearrange index for plotting histogram
CountStats['annotatorwise'] = CountStats['annotatorwise'].loc[_PLOT_ORDER]
CountStats['annotatorwise_coefficient_of_variation'] = (
    CountStats['annotatorwise'].T.std() / CountStats['annotatorwise'].T.mean())
CountStats['annotatorwise_coefficient_of_variation'].sort()

#Histogram of label counts
fig_width_pt = 246.0  # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0 / 72.27  # Convert pt to inch
golden_mean = (np.sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
fig_width = fig_width_pt * inches_per_pt  # width in inches
fig_height = fig_width * golden_mean  # height in inches
fig_size = [fig_width, fig_height]
params = {'backend': 'ps',
          'axes.labelsize': 8,
          # 'font.size' is the supported name of the old 'text.fontsize' key.
          'font.size': 8,
          'legend.fontsize': 7.0,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': False,
          'figure.figsize': fig_size}
rcParams.update(params)
ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
width = 0.29  # the width of the bars
fig, ax = plt.subplots()
rects = []
colours = ('r', 'y', 'g')
for i, annotator in enumerate(Annotators):
    rects.append(ax.bar(ind + width * i, CountStats['annotatorwise'][annotator],
                        width, color=colours[i], align='center'))
# add text for labels, title and axes ticks
ax.set_ylabel('Count')
ax.set_xlabel('Label')
#ax.set_title('Annotator-wise label histogram')
ax.set_xticks(ind + width)
ax.set_xticklabels(CountStats['annotatorwise'].index)
ax.legend([bars[0] for bars in rects],
          ('Annotator 1', 'Annotator 2', 'Annotator 3'))
#Tweak x-axis limit
ax.set_xlim(left=-0.5)
ax.set_ylim(top=3500)
plt.gcf().subplots_adjust(left=0.15)  #Prevent y-axis label from being chopped off


def autolabel(r):
    """Write each bar's height as vertical text just above the bar.

    ``r`` is a sequence of bar containers (as returned by ``ax.bar``); the
    text is drawn on the module-level ``ax``.
    """
    for bars in r:
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + 0.180, 100 + height, '%d' % int(height),
                    ha='center', va='bottom', rotation='vertical', size=6.0)


autolabel(rects)
plt.draw()
plt.savefig('figures/annotation_histogram.pdf')

#Agreement on the raw dataset
AgreementStats = {'jaccardindex_pos': {}}
AgreementStats['jaccardindex_pos'], A = _report_agreement(
    'Agreement about label presence (unfiltered dataset)', Annotations)

#Read in annotations for refined dataset
Chunks, Annotations = _read_annotations('release_chunks_refined.csv')

AgreementStats = {'jaccardindex_pos': {}}
AgreementStats['jaccardindex_pos'], A = _report_agreement(
    'Agreement about label presence (refined dataset)', Annotations)

#Label count stats for majority vote
CountStats['majorityvote'] = {}
majority = Annotations['majorityvote']
for label in 'cmfvpboS':
    CountStats['majorityvote'][label] = {}
    for comparisonLabel in 'cmfvpboS':
        both = majority.apply(lambda s: label in s and comparisonLabel in s)
        CountStats['majorityvote'][label][comparisonLabel] = both.sum()
CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
print('Label co-occurrences')
# The table has to be printed explicitly: a bare expression shows nothing
# when the file is run as a script.
print(CountStats['majorityvote'].loc[_COOCCURRENCE_ORDER, _COOCCURRENCE_ORDER])