#!/usr/bin/python
#
# evaluate_annotations.py:
#    Compute descriptive statistics of annotations, including annotator
#    agreement
#
# Author: Peter Foster
# (c) 2015 Peter Foster
#

from pandas import Series, DataFrame
import glob
import pandas.io.parsers
from pylab import *
import numpy as np
import re

DatasetPath = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/'

#Read in annotations: one Series per chunk CSV, stacked into a DataFrame with
#one row per chunk.
#NOTE(review): Series.from_csv only exists in older pandas releases -- confirm
#the pandas version this script is meant to run against.
Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_raw.csv',header=None))
Annotations = []
for chunk in Chunks:
    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
Annotations = DataFrame(Annotations)

Annotators = ('annotation_a1', 'annotation_a2', 'annotation_a3')

#Check label integrity: put every non-null annotation string into canonical
#(sorted) character order and reject any character that is not a known label
Labels = set('cmfvpboSU')
for a in Annotators:
    I = Annotations[a].notnull()
    Normalised = Annotations.loc[I, a].apply(lambda s: ''.join(sorted(s)))
    #Single .loc assignment, so the write is guaranteed to reach Annotations
    #itself rather than a temporary produced by chained indexing
    Annotations.loc[I, a] = Normalised
    #Union of all characters used by this annotator, minus the known labels
    UnknownLabels = set(''.join(Normalised)).difference(Labels)
    if UnknownLabels:
        #Explicit exception rather than assert, so the check survives python -O
        raise ValueError('Unknown label(s) %s in column %s' % (sorted(UnknownLabels), a))

#Annotator-wise label counts: number of chunks in which each annotator used
#each label (missing annotations are ignored)
CountStats = {'annotatorwise':{}, 'majorityvote':{}}
for annotator in Annotators:
    #The non-null annotations do not depend on the label, so select them once
    Given = Annotations[annotator][Annotations[annotator].notnull()]
    CountStats['annotatorwise'][annotator] = {}
    for label in Labels:
        CountStats['annotatorwise'][annotator][label] = Given.apply(lambda s: label in s).sum()
CountStats['annotatorwise'] = DataFrame(CountStats['annotatorwise'])
#Rearrange index for plotting histogram
CountStats['annotatorwise'] = CountStats['annotatorwise'].loc[['c','m','f','v','p','b','o','S','U']]
#Coefficient of variation of each label's count across the three annotators
CountStats['annotatorwise_coefficient_of_variation'] = (CountStats['annotatorwise'].T.std() / CountStats['annotatorwise'].T.mean())
#NOTE(review): relies on Series.sort() sorting in place, as in the pandas
#release this script was written against -- confirm before upgrading pandas
CountStats['annotatorwise_coefficient_of_variation'].sort()

#Histogram of label counts
fig_width_pt = 246.0  # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0/72.27               # Convert pt to inch
golden_mean = (sqrt(5)-1.0)/2.0         # Aesthetic ratio
fig_width = fig_width_pt*inches_per_pt  # width in inches
fig_height = fig_width*golden_mean      # height in inches
fig_size = [fig_width,fig_height]
#NOTE(review): 'text.fontsize' is not accepted by recent matplotlib releases
#('font.size' is the replacement) -- confirm the targeted matplotlib version
params = {'backend': 'ps',
          'axes.labelsize': 8,
          'text.fontsize': 8,
          'legend.fontsize': 7.0,
          'xtick.labelsize': 8,
          'ytick.labelsize': 8,
          'text.usetex': False,
          'figure.figsize': fig_size}
rcParams.update(params)
ind = np.arange(len(CountStats['annotatorwise']))  # the x locations for the groups
width = 0.29       # the width of the bars
fig, ax = plt.subplots()
rects = []
colours = ('r', 'y', 'g')
#One group of bars per annotator, each shifted right by one bar width
for i, annotator in enumerate(Annotators):
    rects.append(ax.bar(ind+width*i, CountStats['annotatorwise'][annotator], width, color=colours[i], align='center'))
# add text for labels, title and axes ticks
ax.set_ylabel('Count')
ax.set_xlabel('Label')
#ax.set_title('Annotator-wise label histogram')
ax.set_xticks(ind+width)
ax.set_xticklabels(CountStats['annotatorwise'].index)
#Pass the handles as a list (not a generator) so that legend() receives a real sequence
ax.legend( [rect[0] for rect in rects], ('Annotator 1', 'Annotator 2', 'Annotator 3') )
#Tweak x-axis limit
ax.set_xlim(left=-0.5)
ax.set_ylim(top=3500)
plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off

def autolabel(r):
    """Write the height of every bar as vertical text just above the bar.

    r -- sequence of bar groups (each an iterable of rectangles, as collected
         from ax.bar above). Text is drawn on the module-level axes `ax`.
    """
    for group in r:
        for bar in group:
            height = bar.get_height()
            ax.text(bar.get_x()+0.180,100+height,'%d'%int(height),ha='center',va='bottom',rotation='vertical',size=6.0)

autolabel(rects)
plt.draw()
plt.savefig('figures/annotation_histogram.pdf')

#Generalised Jaccard index
def jaccardIndex(*Sets):
    """Return |intersection of Sets| / |union of Sets| as a float.

    At least one set must be given (an empty call raises IndexError). If the
    union is empty, i.e. every set is empty, the index is defined as 1.0.
    """
    First = set(Sets[0])
    Others = Sets[1:]
    SetN = First.intersection(*Others)
    SetD = First.union(*Others)
    if len(SetD) == 0:
        return 1.0
    return len(SetN) / float(len(SetD))

#Row order used when printing agreement tables (the 'U' label is computed but not reported)
AgreementRows = ['c','m','f','v','p','b','o','S','{o,p}','{o,b}','{o,S}']
#Label classes that are additionally evaluated after merging them into one class
CombinedLabels = [('{o,p}', 'op'), ('{o,b}', 'ob'), ('{o,S}', 'oS')]

def _labelAgreement(Annotations, labels):
    """Jaccard indices of label presence for every annotator combination.

    labels -- string of single-character labels; a chunk counts as positive
              for an annotator if the annotation contains any of them.
    Returns a dict keyed by 'a1_a2', 'a1_a3', 'a2_a3' and 'a1_a2_a3'.
    """
    Wanted = set(labels)
    V = {}
    for a in ('a1', 'a2', 'a3'):
        #isdisjoint is used instead of any(...) because `from pylab import *`
        #replaces the builtin any() with the numpy version
        Present = Annotations['annotation_' + a].apply(lambda s: not Wanted.isdisjoint(s))
        #Positional row indices of the chunks where the label(s) occur
        V[a] = set(np.where(Present)[0])
    return {'a1_a2_a3': jaccardIndex(V['a1'], V['a2'], V['a3']),
            'a1_a2': jaccardIndex(V['a1'], V['a2']),
            'a2_a3': jaccardIndex(V['a2'], V['a3']),
            'a1_a3': jaccardIndex(V['a1'], V['a3'])}

def _agreementTable(Annotations, Labels):
    """Build the agreement table: one row per label / combined label class,
    one column per annotator combination."""
    Stats = {}
    for l in Labels:
        Stats[l] = _labelAgreement(Annotations, l)
    #Experiment with combining label classes
    for name, members in CombinedLabels:
        Stats[name] = _labelAgreement(Annotations, members)
    Table = DataFrame(Stats).T
    return Table[['a1_a2', 'a1_a3', 'a2_a3', 'a1_a2_a3']]

def _reportAgreement(Table, title):
    """Print the agreement table under `title`, followed by the per-label
    coefficients of variation across annotator pairs.

    Returns the (pairs x labels) frame the coefficients were computed from.
    """
    print(title)
    print(Table.loc[AgreementRows])
    #Coefficients of variation across pairs of annotators
    A = Table.loc[AgreementRows][['a1_a2', 'a1_a3', 'a2_a3']].T
    print('Coefficients of variation')
    print(A.std() / A.mean())
    return A

AgreementStats = {'jaccardindex_pos': _agreementTable(Annotations, Labels)}
A = _reportAgreement(AgreementStats['jaccardindex_pos'], 'Agreement about label presence (unfiltered dataset)')

#Read in annotations for refined dataset
Chunks = list(Series.from_csv(DatasetPath + 'release_chunks_refined.csv',header=None))
Annotations = []
for chunk in Chunks:
    Annotations.append(Series.from_csv(DatasetPath + 'chunks/' + chunk + '.csv'))
Annotations = DataFrame(Annotations)

AgreementStats = {'jaccardindex_pos': _agreementTable(Annotations, Labels)}
A = _reportAgreement(AgreementStats['jaccardindex_pos'], 'Agreement about label presence (refined dataset)')

#Label count stats for majority vote: number of refined-dataset chunks whose
#majority-vote annotation contains both labels of each pair
MajorityLabels = ['c','m','f','v','p','b','o','S']
CountStats['majorityvote'] = {}
for label in MajorityLabels:
    CountStats['majorityvote'][label] = {}
    for comparisonLabel in MajorityLabels:
        Both = Annotations['majorityvote'].apply(lambda s: label in s and comparisonLabel in s)
        CountStats['majorityvote'][label][comparisonLabel] = Both.sum()
CountStats['majorityvote'] = DataFrame(CountStats['majorityvote'])
print('Label co-occurrences')
#The table has to be printed explicitly; a bare expression shows nothing when run as a script
print(CountStats['majorityvote'].loc[MajorityLabels, MajorityLabels])