# -*- coding: utf-8 -*-
"""Run classification and retrieval experiments.

Created on Fri Feb 12 18:56:28 2016

@author: mariapanteli
"""

import os
import numpy
import pandas
import pickle
import sklearn.metrics.pairwise as PW
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import classifiers as cc


def post_process_frames(frames, pca_frames=True, n_pcas=20):
    """Standardize and PCA data."""
    frames = StandardScaler().fit_transform(frames.T).T  # standardise n_samples
    if pca_frames:
        frames = PCA(n_components=n_pcas).fit_transform(frames)
    return frames


def classification_experiments(features, labels, feat_labels, group_labels, nfolds=5):
    """ classify rhythms/melodies and average accuracy by label grouping,
        eg, average accuracy per transformation or transformation value
    """
    tlabels, inds = numpy.unique(group_labels, return_index=True)
    tlabels = tlabels[numpy.argsort(inds)]
    tlabelinds = [numpy.where(group_labels==tt)[0] for tt in tlabels]
    
    results_classification = []
    classifiers = ["KNN", "LDA", "NB", "SVM"]
    for feat, feat_label in zip(features, feat_labels):
        for cl in classifiers:
            if cl == "KNN":
                accuracies = cc.classifyKNN(feat, labels, kfold=nfolds)
            elif cl == "LDA":
                accuracies = cc.classifyLDA(feat, labels, kfold=nfolds)
            elif cl == "NB":
                accuracies = cc.classifyNB(feat, labels, kfold=nfolds)
            elif cl == "SVM":
                accuracies = cc.classifySVM(feat, labels, kfold=nfolds)
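            # accuracies is indexed per sample, so it can be averaged within each group and overall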
            group_accuracy = [numpy.nanmean(accuracies[labelinds]) for labelinds in tlabelinds]
            group_accuracy.append(numpy.mean(accuracies))
            group_accuracy.append(cl)
            group_accuracy.append(feat_label)
            results_classification.append(group_accuracy)
    return results_classification, tlabels


def topK_experiments(features, labels, feat_labels, group_labels, K=99):
    """ query rhythms/melodies and assess recall rate at top K , 
        average accuracy by label grouping, eg, by transformation or transformation value
    """
    tlabels, inds = numpy.unique(group_labels, return_index=True)
    tlabels = tlabels[numpy.argsort(inds)]
    tlabelinds = [numpy.where(group_labels==tt)[0] for tt in tlabels]
    
    results_topK = []
    dist_metrics = ["euclidean", "cosine", "correlation", "mahalanobis"]
    for feat, feat_label in zip(features, feat_labels):
        for metric in dist_metrics:
            D = PW.pairwise_distances(feat, metric=metric)
            accuracies = numpy.ones((len(labels), 1), dtype=float) * numpy.nan
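            # per-sample retrieval outcome: 1 if retrieved in the top K for its query, 0 if missed, NaN for the queries themselves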
            for label in numpy.unique(labels):
                truematchinds = numpy.where(labels == label)[0]
                # default timbre is the first filename of the family (eg. 1_2_1.wav for family 2)
                queryind = numpy.array([truematchinds[0]])
                truematchinds = set(truematchinds) - set(queryind)  # remove queryind
                sortindex = numpy.argsort(D[queryind, :]).flatten()
                sortindex = sortindex[1:]  # remove queryind (top of list)
                topKinds = set(sortindex[:K])
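                # a true match counts as correct only if it appears among the K nearest retrieved items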
                correctinds = truematchinds & topKinds
                wronginds = truematchinds - correctinds
                accuracies[list(correctinds)] = 1
                accuracies[list(wronginds)] = 0
            group_accuracy = [numpy.nanmean(accuracies[labelinds]) for labelinds in tlabelinds]     
            group_accuracy.append(numpy.nanmean(accuracies))
            group_accuracy.append(metric)
            group_accuracy.append(feat_label)
            results_topK.append(group_accuracy)
    return results_topK, tlabels


if __name__ == '__main__':
    # Load metadata
    meta = pandas.read_csv(os.path.join('data', 'Metadata.csv'), sep=',')
    labels = numpy.array(meta["family"].values, dtype=str)
    
    # Load features and post process
    try:
        st = post_process_frames(pandas.read_csv(os.path.join('data', 'ST.csv'), header=None).values)
        op = post_process_frames(pandas.read_csv(os.path.join('data', 'OP.csv'), header=None).values)
        fp = post_process_frames(pandas.read_csv(os.path.join('data', 'FP.csv'), header=None).values)
        pb = post_process_frames(pandas.read_csv(os.path.join('data', 'PB.csv'), header=None).values)
        ig = post_process_frames(pandas.read_csv(os.path.join('data', 'IG.csv'), header=None).values)
        fmt = post_process_frames(pandas.read_csv(os.path.join('data', 'FMT.csv'), header=None).values)
        features = [st, op, fp, pb, ig, fmt]
    except Exception:
        # fall back to pre-computed pickled features if the CSV files are not available
        with open(os.path.join('data', 'features.pickle'), 'rb') as f:
            features = pickle.load(f)
    
    feat_labels = ["ST", "OP", "FP", "PB", "IG", "FMT"]
    test_classes = ["transformation", "value", "style", "monopoly"]
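    # metadata columns that define the label groupings for each experiment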
        
    write_file = False  # set to True to write the results to a CSV file
    for test_class in test_classes:
        group_labels = meta[test_class].values
        results_class, tlabels = classification_experiments(features, labels, feat_labels, group_labels) 
        results_topK, tlabels = topK_experiments(features, labels, feat_labels, group_labels) 
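        # stack both experiments under a common header: one column per group, then mean accuracy, classifier/metric, feature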
        header = numpy.append(tlabels, ['mean accuracy', 'metric', 'feature'])
        results = numpy.concatenate((header[None, :], numpy.array(results_class), numpy.array(results_topK)))
        print(results)

        if write_file:
            filename = os.path.join('data','results_' + test_class + '.csv')
            numpy.savetxt(filename, results, fmt='%s', delimiter=',')