view evaluate.py @ 1:c4ef4a02fc19

core functions
author Maria Panteli
date Mon, 01 Aug 2016 21:10:31 -0400
parents
children 2732137aa9b5
line wrap: on
line source
# -*- coding: utf-8 -*-
"""Run classification and retrieval experiments.

Created on Fri Feb 12 18:56:28 2016

@author: mariapanteli
"""

import os
import numpy
import pandas
import sklearn.metrics.pairwise as PW
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import classifiers as cc


def post_process_frames(frames, pca_frames=True, n_pcas=20):
    """Standardise the samples in ``frames`` and optionally project them with PCA.

    The scaler is fitted on the transpose of ``frames`` and the result is
    transposed back, so the standardisation runs along each row (sample) of
    ``frames`` instead of along each column.

    Parameters
    ----------
    frames : 2-D array
        Data matrix; it must provide ``.T``.
    pca_frames : bool
        When truthy, the standardised data is reduced with PCA.
    n_pcas : int
        Number of principal components requested from PCA.

    Returns
    -------
    The standardised data, PCA-transformed when ``pca_frames`` is truthy.
    """
    scaler = StandardScaler()
    standardised = scaler.fit_transform(frames.T).T
    if not pca_frames:
        return standardised
    # The projection is fitted on the very data it transforms.
    projector = PCA(n_components=n_pcas)
    return projector.fit_transform(standardised)


def classification_experiments(features, labels, feat_labels, group_labels, nfolds=5):
    """Classify rhythms/melodies and average the accuracy per label group.

    Every entry of ``features`` is evaluated with the KNN, LDA, NB and SVM
    classifiers of the ``classifiers`` module. The accuracies returned by a
    classifier are then averaged over each group of ``group_labels``, e.g.
    per transformation or per transformation value.

    Parameters
    ----------
    features : sequence
        One feature matrix per descriptor, handed to the classifiers as is.
    labels : array-like
        Class labels handed to the classifiers.
    feat_labels : sequence of str
        Name of each entry of ``features``, paired by position.
    group_labels : array-like
        Grouping label of every sample.
    nfolds : int
        Forwarded to the classifiers as ``kfold``.

    Returns
    -------
    results_classification : list of list
        One row per (feature, classifier) pair holding the mean accuracy of
        each group (in ``tlabels`` order), the overall mean accuracy, the
        classifier name and the feature name.
    tlabels : numpy.ndarray
        Unique group labels in order of first appearance.
    """
    # A plain list would make ``group_labels == tt`` a scalar comparison and
    # silently produce empty groups, so coerce to an array first.
    group_labels = numpy.asarray(group_labels)
    tlabels, inds = numpy.unique(group_labels, return_index=True)
    tlabels = tlabels[numpy.argsort(inds)]  # restore order of first appearance
    tlabelinds = [numpy.where(group_labels == tt)[0] for tt in tlabels]

    classifiers = (
        ("KNN", cc.classifyKNN),
        ("LDA", cc.classifyLDA),
        ("NB", cc.classifyNB),
        ("SVM", cc.classifySVM),
    )

    results_classification = []
    for feat, feat_label in zip(features, feat_labels):
        for cl, classify in classifiers:
            # NOTE(review): assumes the classifiers return one accuracy value
            # per sample, indexable by sample position - confirm against the
            # classifiers module.
            accuracies = numpy.asarray(classify(feat, labels, kfold=nfolds))
            group_accuracy = [numpy.nanmean(accuracies[labelinds]) for labelinds in tlabelinds]
            # Ignore NaNs here as well, consistent with the per-group means.
            group_accuracy.append(numpy.nanmean(accuracies))
            group_accuracy.append(cl)
            group_accuracy.append(feat_label)
            results_classification.append(group_accuracy)
    return results_classification, tlabels


def topK_experiments(features, labels, feat_labels, group_labels, K=99):
    """Query rhythms/melodies and assess the recall rate in the top K results.

    For every feature set and distance metric, one query per class of
    ``labels`` is run against all samples. Each remaining member of the class
    scores 1 when it is ranked among the K nearest neighbours of the query and
    0 otherwise. The scores are then averaged over each group of
    ``group_labels``, e.g. per transformation or per transformation value.

    NOTE(review): the query of a class is taken to be the FIRST sample carrying
    that label. The previous code used every member of the class as "the
    query", which left the set of true matches empty and all accuracies NaN.
    Confirm that the reference version of each rhythm/melody is listed first
    in the metadata.

    Parameters
    ----------
    features : sequence
        One feature matrix per descriptor (rows are samples).
    labels : array-like
        Class label of every sample.
    feat_labels : sequence of str
        Name of each entry of ``features``, paired by position.
    group_labels : array-like
        Grouping label of every sample.
    K : int
        Number of top-ranked results inspected for each query.

    Returns
    -------
    results_topK : list of list
        One row per (feature, metric) pair holding the mean recall of each
        group (in ``tlabels`` order), the overall mean recall, the metric name
        and the feature name. A group containing only query samples yields NaN.
    tlabels : numpy.ndarray
        Unique group labels in order of first appearance.
    """
    labels = numpy.asarray(labels)
    group_labels = numpy.asarray(group_labels)
    tlabels, inds = numpy.unique(group_labels, return_index=True)
    tlabels = tlabels[numpy.argsort(inds)]  # restore order of first appearance
    tlabelinds = [numpy.where(group_labels == tt)[0] for tt in tlabels]
    # Class membership does not depend on feature or metric: compute it once.
    classinds = [numpy.where(labels == label)[0] for label in numpy.unique(labels)]

    results_topK = []
    dist_metrics = ["euclidean", "cosine", "correlation", "mahalanobis"]
    for feat, feat_label in zip(features, feat_labels):
        for metric in dist_metrics:
            D = PW.pairwise_distances(feat, metric=metric)
            # NaN marks samples that are never scored, i.e. the queries.
            accuracies = numpy.full(len(labels), numpy.nan)
            for members in classinds:
                queryind = members[0]
                truematchinds = set(members[1:])
                sortindex = numpy.argsort(D[queryind, :])
                # Drop the query itself explicitly; relying on it being ranked
                # first breaks when another sample is at distance zero.
                sortindex = sortindex[sortindex != queryind]
                topKinds = set(sortindex[:K])
                correctinds = truematchinds & topKinds
                wronginds = truematchinds - correctinds
                accuracies[list(correctinds)] = 1
                accuracies[list(wronginds)] = 0
            group_accuracy = [numpy.nanmean(accuracies[labelinds]) for labelinds in tlabelinds]
            group_accuracy.append(numpy.nanmean(accuracies))
            group_accuracy.append(metric)
            group_accuracy.append(feat_label)
            results_topK.append(group_accuracy)
    return results_topK, tlabels


if __name__ == '__main__':
    # Run the classification and retrieval experiments for every grouping
    # column of the metadata and optionally write one results table per
    # grouping to the data directory.

    # Load metadata
    meta = pandas.read_csv(os.path.join('data', 'Metadata.csv'), sep=',')
    # ``.values`` replaces ``get_values()``, which was removed in pandas 1.0.
    labels = numpy.array(meta["family"].values, dtype=str)

    # Load features and post process; each descriptor is stored in
    # data/<NAME>.csv without a header row.
    feat_labels = ["ST", "OP", "FP", "PB", "IG", "FMT"]
    features = [
        post_process_frames(
            pandas.read_csv(os.path.join('data', feat_name + '.csv'), header=None).values
        )
        for feat_name in feat_labels
    ]
    test_classes = ["transformation", "value", "style", "monopoly"]

    write_file = False  # set it to True if you want to write output file
    for test_class in test_classes:
        group_labels = meta[test_class].values
        results_class, tlabels = classification_experiments(features, labels, feat_labels, group_labels)
        results_topK, tlabels = topK_experiments(features, labels, feat_labels, group_labels)
        # One table per grouping: header row, then classification rows,
        # then retrieval rows.
        header = numpy.append(tlabels, ['mean accuracy', 'metric', 'feature'])
        results = numpy.concatenate((header[None, :], numpy.array(results_class), numpy.array(results_topK)))

        if write_file:
            filename = os.path.join('data', 'results_' + test_class + '.csv')
            numpy.savetxt(filename, results, fmt='%s', delimiter=',')