view annotation_scripts/warmup_phase/evaluate_annotations_random_excerpts.py @ 5:b523456082ca tip

Update path to dataset and reflect modified chunk naming convention.
author peterf
date Mon, 01 Feb 2016 21:35:27 +0000
parents f079d2de4aa2
children
line wrap: on
line source
#!/usr/bin/python
#
# evaluate_annotations_random_excerpts.py:
#    Analyse preliminary set of annotations; subsequently obtain balanced sample
#    of chunks for the annotator `warm-up' phase
#
#    Script used in preliminary evaluations
#
# Author: Peter Foster
# (c) 2014 Peter Foster
#

from pandas import Series, DataFrame
import pandas.io.parsers
import re
import numpy as np
from collections import defaultdict

# Location of the exploratory labelling CSV and the directory that receives all outputs.
AnnotationsFile = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/exploratory_labelling.csv'
OutputDir = '/import/c4dm-02/people/peterf/audex/datasets/chime_home/raw_data/exploratory/'
# The CSV carries no header row, so column names are assigned explicitly.
Annotations = pandas.io.parsers.read_csv(AnnotationsFile, header=None)
Annotations.columns = ['audiofile', 'chunk', 'annotation']

#Check integrity -- only specific characters allowed
permittedCharacters = 'cmfvns'
# Raise explicitly rather than assert: assertions are stripped when Python runs with -O,
# which would silently disable this check.
InvalidAnnotations = Annotations['annotation'][
    Annotations['annotation'].apply(lambda s: re.search('[^' + permittedCharacters + ']', s) is not None)
]
if not InvalidAnnotations.empty:
    raise ValueError('Annotations contain characters outside "{0}": {1}'.format(
        permittedCharacters, sorted(InvalidAnnotations.unique())))

#Get unique, sorted strings
# set() removes repeated characters but iterates in arbitrary order, so the characters
# must be sorted; otherwise the same label set could yield several different strings
# and be counted as distinct annotation categories below.
Annotations['annotation'] = Annotations['annotation'].apply(lambda s: ''.join(sorted(set(s))))

#Set random seed for bootstrap sampling
# Seeded before any np.random call so the bootstrap and the chunk sample are repeatable.
np.random.seed(4756)

def bootstrap_statistic(Vector, Statistic, nSamples):
    """Evaluate a statistic on repeated bootstrap resamples of Vector.

    Each resample is a set of len(Vector) positions drawn uniformly with
    replacement from 0..len(Vector)-1 using the global numpy random state.

    Vector    -- the observations; passed through to Statistic unchanged
    Statistic -- callable taking (Vector, indices) and returning the
                 statistic computed over the selected positions
    nSamples  -- number of bootstrap resamples to draw

    Returns a list holding the nSamples values returned by Statistic.
    """
    estimates = []
    for _ in range(nSamples):
        size = len(Vector)
        resampled = np.random.choice(size, size, replace=True)
        estimates.append(Statistic(Vector, resampled))
    return estimates

#Get sampling distribution of annotation strings, in addition to bootstrapped standard errors
# Stats gathers the summary tables produced by this script, keyed by table name.
Stats = {}
Stats['annotation_strings'] = {'proportion':{}, 'standarderror':{}}
Stats['annotation_strings_bootstrap'] = {}
for s in Annotations['annotation'].unique():
    def Statistic(V, I, s=s):
        # Proportion of the positions I in V whose annotation string equals s.
        # I holds positions (not index labels), hence .iloc; s is bound as a default
        # so the function does not depend on the loop variable's later values.
        return (V.iloc[I] == s).sum() / float(len(I))
    #Get sample statistic
    #print('Bootstrapping sample statistic for annotation string ' + s)
    # Point estimate: the statistic evaluated once over every row.
    Stats['annotation_strings']['proportion'][s] = Statistic(Annotations['annotation'], np.arange(len(Annotations)))
    # Standard error: sample standard deviation (ddof=1) across 10000 bootstrap replicates.
    Stats['annotation_strings']['standarderror'][s] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
Stats['annotation_strings'] = DataFrame(Stats['annotation_strings'])
# Most frequent strings first; DataFrame.sort() no longer exists in pandas, sort_values() replaces it.
Stats['annotation_strings'].sort_values(by=['proportion', 'standarderror'], ascending=[False,False], inplace=True)

#Get sampling distribution of annotation characters, in addition to bootstrapped standard errors
Stats['annotation_characters'] = {'proportion':{}, 'standarderror':{}}
for c in permittedCharacters:
    def Statistic(V, I, c=c):
        # Proportion of the positions I in V whose annotation string contains character c.
        # I holds positions (not index labels), hence .iloc; c is bound as a default
        # so the function does not depend on the loop variable's later values.
        return V.iloc[I].apply(lambda annotation: c in annotation).sum() / float(len(I))
    #print('Bootstrapping sample statistic for annotation character ' + c)
    # Point estimate: the statistic evaluated once over every row.
    Stats['annotation_characters']['proportion'][c] = Statistic(Annotations['annotation'], np.arange(len(Annotations)))
    # Standard error: sample standard deviation (ddof=1) across 10000 bootstrap replicates.
    Stats['annotation_characters']['standarderror'][c] = np.std(bootstrap_statistic(Annotations['annotation'], Statistic, 10000), ddof=1)
Stats['annotation_characters'] = DataFrame(Stats['annotation_characters'])
# Most frequent characters first; DataFrame.sort() no longer exists in pandas, sort_values() replaces it.
Stats['annotation_characters'].sort_values(by=['proportion', 'standarderror'], ascending=[False,False], inplace=True)

#Report both sampling distributions on standard output
for Heading, TableKey in (
        ('Sampling distribution of annotation strings', 'annotation_strings'),
        ('Sampling distribution of annotation characters', 'annotation_characters')):
    print(Heading)
    print(Stats[TableKey])

#Label the index of each table (done after printing), then write the tables out as CSV
for TableKey, IndexLabel in (
        ('annotation_strings', 'annotation_string'),
        ('annotation_characters', 'annotation_character')):
    Stats[TableKey].index.name = IndexLabel
for TableKey, FileName in (
        ('annotation_strings', 'exploratory_labelling_annotation_strings_hist.csv'),
        ('annotation_characters', 'exploratory_labelling_annotation_characters_hist.csv')):
    Stats[TableKey].to_csv(OutputDir + FileName)

#Write balanced sample of chunks to file, based on annotation strings
#Duration of each chunk in seconds
chunkDuration = 4
sampleRate = 48000
#Number of chunks drawn for every distinct annotation string
nRandomSamples = 5
SampleColumns = ['audiofile', 'chunk', 'framestart']
Samples = []
for s in Annotations['annotation'].unique():
    R = Annotations[Annotations['annotation'] == s]
    # NOTE: np.random.choice draws with replacement by default, so the same chunk may be
    # picked more than once, and strings with fewer than nRandomSamples chunks still
    # contribute nRandomSamples rows.
    I = np.random.choice(len(R), nRandomSamples)
    # Work on an explicit copy so the column assignments below never target a view of Annotations.
    R = R.iloc[I].copy()
    # Start offset of the chunk: chunk number times the chunk length (sampleRate * chunkDuration).
    # Computed from the numeric chunk column before that column is converted to text.
    R['framestart'] = (R['chunk'] * sampleRate * chunkDuration).apply(str)
    R['chunk'] = R['chunk'].apply(str)
    Samples.append(R)
if Samples:
    # Concatenate once; DataFrame.append() was removed from pandas and copied the
    # accumulated frame on every call. The column order is pinned explicitly so that
    # it does not depend on the pandas version.
    AnnotationSample = pandas.concat(Samples, ignore_index=True)[SampleColumns + ['annotation']]
else:
    # No annotations at all: still emit a header-only file.
    AnnotationSample = DataFrame(columns=SampleColumns)
AnnotationSample.to_csv(OutputDir + 'exploratory_labelling_annotation_chunksample.csv')