#!/usr/bin/python

#
# run_experiments.py:
# Main script for CHiME-Home dataset baseline GMM evaluation
#
# Author: Peter Foster
# (c) 2015 Peter Foster
#

# NOTE(review): Python 2-era script. The pylab star-import supplies the numpy
# (np, sqrt, isfinite) and matplotlib (rcParams, plt) names used later in this
# file. sklearn.cross_validation and pandas' Series.from_csv are legacy APIs
# that have since been removed from those libraries -- this script requires the
# library versions contemporary with it.
from pylab import *
from sklearn import cross_validation
import os
from pandas import Series, DataFrame
from collections import defaultdict
from extract_features import FeatureExtractor
import exper002
import custompickler
from compute_performance_statistics import compute_performance_statistics
import pdb  # NOTE(review): unused here; presumably kept for interactive debugging

#Global configuration: absolute paths to the results directory, the feature
#cache directory, and the CHiME-Home dataset release.
Settings = {'paths':{}, 'algorithms':{}}
Settings['paths'] = {'chime_home': {}, 'resultsdir':'/import/c4dm-scratch/peterf/audex/results/', 'featuresdir':'/import/c4dm-scratch/peterf/audex/features/'}
Settings['paths']['chime_home'] = {'basepath':'/import/c4dm-02/people/peterf/audex/datasets/chime_home/release/'}

#Read data sets and class assignments
Datasets = {'chime_home':{}}

#Read in annotations: chunks_refined.csv lists one chunk name per row; each
#chunk then has its own CSV of annotation fields (including 'majorityvote')
#under the chunks/ subdirectory.
Chunks = list(Series.from_csv(Settings['paths']['chime_home']['basepath'] + 'chunks_refined.csv',header=None))
Annotations = []
for chunk in Chunks:
    Annotations.append(Series.from_csv(Settings['paths']['chime_home']['basepath'] + 'chunks/' + chunk + '.csv'))
#One row per chunk, one column per annotation field.
Datasets['chime_home']['dataset'] = DataFrame(Annotations)

#Compute label statistics: occurrence count of each label code over all chunks.
#Assumes each 'majorityvote' entry is an iterable of single-character label
#codes -- TODO confirm against the per-chunk CSV format.
Datasets['chime_home']['labelstats'] = defaultdict(lambda: 0)
for item in Datasets['chime_home']['dataset']['majorityvote']:
    for label in item:
        Datasets['chime_home']['labelstats'][label] += 1
#Labels to consider for multilabel classification
Datasets['chime_home']['consideredlabels'] = ['c', 'b', 'f', 'm', 'o', 'p', 'v']
#Populate binary label assignments
#One boolean column per considered label: True iff the label code occurs in the
#chunk's majority-vote annotation.
for label in Datasets['chime_home']['consideredlabels']:
    Datasets['chime_home']['dataset'][label] = [label in item for item in Datasets['chime_home']['dataset']['majorityvote']]
#Obtain statistics for considered labels
#NOTE(review): the result of this expression is discarded -- it has no effect
#when run as a script; presumably left over from interactive exploration.
sum(Datasets['chime_home']['dataset'][Datasets['chime_home']['consideredlabels']]) / len(Datasets['chime_home']['dataset'])
#Create partition for 10-fold cross-validation, with shuffled chunk order.
#NOTE(review): KFold is NOT stratified; shuffling only randomizes fold
#membership, so equal label proportions per fold are approximate at best.
#The fixed seed makes the partition reproducible.
np.random.seed(475686)
Datasets['chime_home']['crossval_10fold'] = cross_validation.KFold(len(Datasets['chime_home']['dataset']), 10, shuffle=True)

#Full path of the 48 kHz audio file for each chunk.
Datasets['chime_home']['dataset']['wavfile'] = Datasets['chime_home']['dataset']['chunkname'].apply(lambda s: Settings['paths']['chime_home']['basepath'] + 'chunks/' + s + '.48kHz.wav')

#Extract features and assign them to Datasets structure; features are cached on
#disk so extraction runs only once per dataset.
for dataset in Datasets.keys():
    picklepath = os.path.join(Settings['paths']['featuresdir'],'features_' + dataset)
    if not(os.path.isfile(picklepath)):
        if dataset == 'chime_home':
            featureExtractor = FeatureExtractor(samplingRate=48000, frameLength=1024, hopLength=512)
        else:
            raise NotImplementedError()
        #Presumably returns one dict of feature arrays per audio file -- the
        #integrity check below iterates it that way; verify in extract_features.
        FeatureList = featureExtractor.files_to_features(Datasets[dataset]['dataset']['wavfile'])
        custompickler.pickle_save(FeatureList,picklepath)
    else:
        FeatureList = custompickler.pickle_load(picklepath)
    #Integrity check: every feature array must be free of NaN/inf values.
    for features in FeatureList:
        for feature in features.values():
            assert(all(isfinite(feature.ravel())))
    Datasets[dataset]['dataset']['features'] = FeatureList

#GMM experiments using CHiME home dataset: evaluate MFCC-based GMM classifiers
#for each label across a grid of mixture-component counts.
EXPER005 = {}
EXPER005['name'] = 'GMM_Baseline_EXPER005'
EXPER005['path'] = os.path.join(Settings['paths']['resultsdir'],'exploratory','saved_objects','EXPER005')
EXPER005['settings'] = {'numcomponents': (1,2,4,8), 'features': ('librosa_mfccs',)}
EXPER005['datasets'] = {}
#Run the multilabel classification experiment over the 10-fold partition; the
#result structure is consumed by compute_performance_statistics below.
EXPER005['datasets']['chime_home'] = exper002.exper002_multilabelclassification(Datasets['chime_home']['dataset'], Datasets['chime_home']['consideredlabels'], Datasets['chime_home']['crossval_10fold'], Settings, numComponentValues=EXPER005['settings']['numcomponents'], featureTypeValues=EXPER005['settings']['features'])
EXPER005 = compute_performance_statistics(EXPER005, Datasets, Settings, iterableParameters=['numcomponents', 'features'])
custompickler.pickle_save(EXPER005, EXPER005['path'])

#Collate results
def accumulate_results(EXPER):
    """Collect per-label precision-recall AUC into EXPER['summaryresults'],
    a DataFrame with one column per mixture-component count and one row per
    considered label. Mutates EXPER in place; reads the global Datasets."""
    EXPER['summaryresults'] = {}
    #NOTE(review): dict.keys()[0] is Python 2 only (keys() is a view in
    #Python 3); assumes a single dataset entry.
    ds = EXPER['datasets'].keys()[0]
    for numComponents in EXPER['settings']['numcomponents']:
        EXPER['summaryresults'][numComponents] = {}
        for label in Datasets[ds]['consideredlabels']:
            EXPER['summaryresults'][numComponents][label] = EXPER['datasets'][ds][(numComponents, 'librosa_mfccs')]['performance']['classwise'][label]['auc_precisionrecall']
    EXPER['summaryresults'] = DataFrame(EXPER['summaryresults'])
accumulate_results(EXPER005)

#Generate plot
def plot_performance(EXPER):
    """Render a grouped bar chart of per-label AUC (one bar group per label,
    one bar per mixture-component count) and save it under figures/.
    Reads EXPER['summaryresults'] as produced by accumulate_results."""
    #Figure dimensions chosen to match a single LaTeX column.
    fig_width_pt = 246.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0/72.27               # Convert pt to inch
    golden_mean = (sqrt(5)-1.0)/2.0         # Aesthetic ratio
    fig_width = fig_width_pt*inches_per_pt  # width in inches
    fig_height = fig_width*golden_mean      # height in inches
    fig_size = [fig_width,fig_height]
    #NOTE(review): 'text.fontsize' is a legacy rcParams key; removed in newer
    #matplotlib versions.
    params = {'backend': 'ps',
              'axes.labelsize': 8,
              'text.fontsize': 8,
              'legend.fontsize': 7.0,
              'xtick.labelsize': 8,
              'ytick.labelsize': 8,
              'text.usetex': False,
              'figure.figsize': fig_size}
    rcParams.update(params)
    ind = np.arange(len(EXPER['summaryresults'][1]))  # the x locations for the groups
    width = 0.22       # the width of the bars
    fig, ax = plt.subplots()
    rects = []
    colours = ('r', 'y', 'g', 'b', 'c')
    #One bar series per component count. NOTE(review): relies on the
    #summaryresults column order matching the 'k=1','k=2','k=4','k=8' legend
    #labels below -- confirm the DataFrame column ordering.
    for numComponents, i in zip(EXPER['summaryresults'],range(len(EXPER['summaryresults']))):
        rects.append(ax.bar(ind+width*i, EXPER['summaryresults'][numComponents][['c','m','f','v','p','b','o']], width, color=colours[i], align='center'))
    # add text for labels, title and axes ticks
    ax.set_ylabel('AUC')
    ax.set_xlabel('Label')
    ax.set_xticks(ind+width)
    ax.set_xticklabels(('c','m','f','v','p','b','o'))
    ax.legend( (rect[0] for rect in rects), ('k=1', 'k=2', 'k=4','k=8') ,loc='lower right')
    #Tweak x-axis limit
    ax.set_xlim(left=-0.5)
    ax.set_ylim(top=1.19)
    plt.gcf().subplots_adjust(left=0.15) #Prevent y-axis label from being chopped off
    def autolabel(r):
        #Print each bar's height above it, rotated vertically.
        for rects in r:
            for rect in rects:
                height = rect.get_height()
                ax.text(rect.get_x()+0.14,0.04+height,'%1.2f'%float(height),ha='center',va='bottom',rotation='vertical',size=6.0)
    autolabel(rects)
    plt.draw()
    plt.savefig('figures/predictionperformance' + EXPER['name'] +'.pdf')
plot_performance(EXPER005)