view scripts/map_and_average.py @ 58:d118b6ca8370 branch-tests

some changes in classification with random train/test split
author Maria Panteli <m.x.panteli@gmail.com>
date Thu, 21 Sep 2017 15:24:18 +0100
parents c4841876a8ff
children 4425a4918102
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 02:44:07 2017

@author: mariapanteli
"""

import numpy as np
import pickle

import util_feature_learning
    
# Value embedded in every data file name below
# (presumably the analysis window size -- TODO confirm the unit).
WIN_SIZE = 8

# Frame-level input pickles, in train / validation / test order.
INPUT_FILES = [
    '/import/c4dm-04/mariap/train_data_melodia_{}.pickle'.format(WIN_SIZE),
    '/import/c4dm-04/mariap/val_data_melodia_{}.pickle'.format(WIN_SIZE),
    '/import/c4dm-04/mariap/test_data_melodia_{}.pickle'.format(WIN_SIZE),
]

# Output pickles, in the order write_output() fills them:
# lda, pca, nmf, ssnmf and the untransformed ('na') data.
OUTPUT_FILES = [
    '/import/c4dm-04/mariap/lda_data_melodia_{}.pickle'.format(WIN_SIZE),
    '/import/c4dm-04/mariap/pca_data_melodia_{}.pickle'.format(WIN_SIZE),
    '/import/c4dm-04/mariap/nmf_data_melodia_{}.pickle'.format(WIN_SIZE),
    '/import/c4dm-04/mariap/ssnmf_data_melodia_{}.pickle'.format(WIN_SIZE),
    '/import/c4dm-04/mariap/na_data_melodia_{}.pickle'.format(WIN_SIZE),
]


def remove_inds(features, labels, audiolabels):
    '''remove instances with unknown country

    Instances whose label equals 'unknown' or 'Unidentified' are dropped
    from all three inputs. The remaining instances keep their original
    relative order.

    Parameters
    ----------
    features : np.ndarray
        2D array with one row per instance (indexed as features[rows, :]).
    labels : array-like
        Label of each instance; compared against the two marker strings.
    audiolabels : np.ndarray
        Per-instance audio labels, filtered with the same row indices.

    Returns
    -------
    features, labels, audiolabels
        The inputs restricted to the kept instances. If every instance is
        removed, empty arrays are returned.
    '''
    labels = np.asarray(labels)
    # Mark the rows to drop in a boolean mask instead of subtracting sets:
    # set iteration order is not guaranteed to be ascending, and an empty
    # set used to turn into a float index array that NumPy rejects.
    drop = np.zeros(len(labels), dtype=bool)
    drop[np.where(labels == 'unknown')[0]] = True
    drop[np.where(labels == 'Unidentified')[0]] = True
    # flatnonzero yields ascending integer indices, also when nothing is kept
    keep_inds = np.flatnonzero(~drop)
    features = features[keep_inds, :]
    labels = labels[keep_inds]
    audiolabels = audiolabels[keep_inds]
    return features, labels, audiolabels


def averageframes(features, audiolabels, classlabels):
    '''Collapse frame-level features into one mean vector per recording.

    Recordings are identified by the distinct values of audiolabels and are
    reported in order of first appearance. Each recording takes the class
    label of its first frame.

    Returns the averaged features, the recording labels and the class
    labels (in that order), each as a NumPy array.
    '''
    unique_ids, first_pos = np.unique(audiolabels, return_index=True)
    # np.unique sorts its output; reorder by first occurrence instead
    ordered_ids = unique_ids[np.argsort(first_pos)]
    # frame indices belonging to each recording
    frame_groups = [np.where(audiolabels == rec_id)[0] for rec_id in ordered_ids]
    newfeatures = np.array([np.mean(features[rows, :], axis=0) for rows in frame_groups])
    newaudiolabels = np.array(list(ordered_ids))
    newclasslabels = np.array([classlabels[rows[0]] for rows in frame_groups])
    return newfeatures, newaudiolabels, newclasslabels


def load_data_from_pickle(pickle_file=None):
    '''Read frame-based features and their labels from a pickle file.

    The pickle must unpack into (data, labels, audiolabels). Instances with
    an 'unknown' or 'unidentified' country are removed, and NaN feature
    values are replaced by 0.

    Returns the cleaned data, labels and audiolabels.
    '''
    with open(pickle_file, 'rb') as handle:
        frames, countries, recordings = pickle.load(handle)
    # drop instances whose country is 'unknown' / 'unidentified'
    frames, countries, recordings = remove_inds(frames, countries, recordings)
    # NaN values give an error in feature learning, so zero them out
    nan_mask = np.isnan(frames)
    frames[nan_mask] = 0
    return frames, countries, recordings


def load_train_val_test_sets():
    '''Load the train, validation and test sets listed in INPUT_FILES.

    Returns a 3-tuple (trainset, valset, testset); each entry is whatever
    load_data_from_pickle returns for the corresponding file.
    '''
    # INPUT_FILES holds the train, val and test paths at positions 0, 1, 2
    return tuple(load_data_from_pickle(INPUT_FILES[split]) for split in range(3))


def get_feat_inds(n_dim=840):
    '''Return the column indices of each feature block in a frame.

    A frame is the concatenation of four feature blocks labelled 'rhy',
    'mel', 'mfc' and 'chr' (presumably rhythm, melody, MFCC and chroma).
    The size of the blocks depends on the total frame dimension.

    Parameters
    ----------
    n_dim : int
        Total number of features per frame; one of 840, 640, 460 or 660.

    Returns
    -------
    feat_labels : list of str
        ['rhy', 'mel', 'mfc', 'chr'].
    feat_inds : list of np.ndarray
        Column indices of each block, in the same order as feat_labels.

    Raises
    ------
    ValueError
        If n_dim is not one of the supported frame dimensions.
    '''
    # exclusive end column of the rhy, mel, mfc and chr blocks per layout
    block_ends = {
        840: (400, 640, 720, 840),
        640: (200, 440, 520, 640),
        460: (200, 260, 340, 460),
        660: (400, 460, 540, 660),
    }
    if n_dim not in block_ends:
        # fail with a clear message instead of an unbound-variable error
        raise ValueError('unsupported number of features %r; expected one of %s'
                         % (n_dim, sorted(block_ends)))
    ends = block_ends[n_dim]
    # each block starts where the previous one ended
    starts = (0,) + ends[:-1]
    feat_inds = [np.arange(start, stop) for start, stop in zip(starts, ends)]
    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
    return feat_labels, feat_inds


def map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''Map the frames of every feature block and average them per recording.

    For each feature block returned by get_feat_inds, a
    util_feature_learning.Transformer is fitted on the training frames
    (fit_data). The train, validation and test frames are then transformed
    together (transform_data), and every transformed output is averaged per
    recording with averageframes.

    Parameters
    ----------
    dataset : tuple or None
        (trainset, valset, testset), each a (data, labels, audiolabels)
        triple. When None the sets are loaded with load_train_val_test_sets.
    n_components : int or None
        Number of components passed to fit_data; ignored when min_variance
        is given. When both are None the block size is used.
    min_variance : float or None
        When given, the number of components is chosen per block as the
        smallest count whose cumulative PCA explained-variance ratio
        exceeds this value.

    Returns
    -------
    data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list
        One averaged array per feature block, taken from the 'none', 'pca',
        'lda', 'nmf' and 'ssnmf' entries of the transformed data.
    classlabs, audiolabs
        Class and recording labels of the averaged rows, as returned by the
        last averageframes call.
    '''
    if dataset is None:
        trainset, valset, testset = load_train_val_test_sets()
    else:
        trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    # single-argument print() gives the same output on Python 2 and 3
    print("%s %s %s" % (traindata.shape, valdata.shape, testdata.shape))
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    ldadata_list = []
    pcadata_list = []
    nmfdata_list = []
    ssnmfdata_list = []
    data_list = []
    for feat_label, inds in zip(feat_labels, feat_inds):
        print("mapping " + feat_label)
        ssm_feat = util_feature_learning.Transformer()
        if min_variance is not None:
            # first a PCA-only fit with all components to read the variance profile
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            cum_variance = ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()
            enough = np.where(cum_variance > min_variance)[0]
            # keep every component when the threshold is never exceeded
            # (e.g. min_variance >= 1) instead of failing on an empty index
            n_feat_components = enough[0] + 1 if len(enough) else len(cum_variance)
            print("%s %s" % (n_feat_components, len(inds)))
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_feat_components)
        elif n_components is not None:
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_components)
        else:
            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds))
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed_data_dict = ssm_feat.transform_data(all_data)
        # iterate over a snapshot of the keys because values are replaced in the loop
        for key in list(transformed_data_dict):
            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
            transformed_data_dict[key] = average_data
        data_list.append(transformed_data_dict['none'])
        pcadata_list.append(transformed_data_dict['pca'])
        ldadata_list.append(transformed_data_dict['lda'])
        nmfdata_list.append(transformed_data_dict['nmf'])
        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs


def lda_map_and_average_frames(dataset=None, n_components=None, min_variance=None):
    '''LDA variant of the frame mapping and per-recording averaging.

    For each feature block returned by get_feat_inds, a
    util_feature_learning.Transformer is fitted on the training frames
    (fit_lda_data). The train, validation and test frames are then
    transformed together (transform_lda_data), and every non-empty
    transformed output is averaged per recording with averageframes.
    Empty outputs are passed through unchanged.

    Parameters
    ----------
    dataset : tuple or None
        (trainset, valset, testset), each a (data, labels, audiolabels)
        triple. When None the sets are loaded with load_train_val_test_sets.
    n_components : int or None
        Number of components passed to fit_lda_data; ignored when
        min_variance is given. When both are None the block size is used.
    min_variance : float or None
        When given, the number of components is chosen per block as the
        smallest count whose cumulative PCA explained-variance ratio
        exceeds this value.

    Returns
    -------
    data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list
        One entry per feature block, taken from the 'none', 'pca', 'lda',
        'nmf' and 'ssnmf' entries of the transformed data.
    classlabs, audiolabs
        Class and recording labels of the averaged rows, as returned by the
        last averageframes call.
    '''
    if dataset is None:
        trainset, valset, testset = load_train_val_test_sets()
    else:
        trainset, valset, testset = dataset
    traindata, trainlabels, trainaudiolabels = trainset
    valdata, vallabels, valaudiolabels = valset
    testdata, testlabels, testaudiolabels = testset
    # single-argument print() gives the same output on Python 2 and 3
    print("%s %s %s" % (traindata.shape, valdata.shape, testdata.shape))
    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()

    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
    ldadata_list = []
    pcadata_list = []
    nmfdata_list = []
    ssnmfdata_list = []
    data_list = []
    for feat_label, inds in zip(feat_labels, feat_inds):
        print("mapping " + feat_label)
        ssm_feat = util_feature_learning.Transformer()
        if min_variance is not None:
            # first a PCA-only fit with all components to read the variance profile
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
            cum_variance = ssm_feat.pca_transformer.explained_variance_ratio_.cumsum()
            enough = np.where(cum_variance > min_variance)[0]
            # keep every component when the threshold is never exceeded
            # (e.g. min_variance >= 1) instead of failing on an empty index
            n_feat_components = enough[0] + 1 if len(enough) else len(cum_variance)
            print("%s %s" % (n_feat_components, len(inds)))
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_feat_components)
        elif n_components is not None:
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)
        else:
            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds))
        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
        transformed_data_dict = ssm_feat.transform_lda_data(all_data)
        # iterate over a snapshot of the keys because values are replaced in the loop
        for key in list(transformed_data_dict):
            if len(transformed_data_dict[key]) == 0:
                # nothing to average for this output; leave it as it is
                continue
            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
            transformed_data_dict[key] = average_data
        data_list.append(transformed_data_dict['none'])
        pcadata_list.append(transformed_data_dict['pca'])
        ldadata_list.append(transformed_data_dict['lda'])
        nmfdata_list.append(transformed_data_dict['nmf'])
        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs


def write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs):
    '''Pickle each set of mapped features together with its labels.

    One file is written per entry of OUTPUT_FILES, in the order lda, pca,
    nmf, ssnmf and untransformed data. Each file holds the list
    [features, classlabs, audiolabs].
    '''
    # order must match OUTPUT_FILES
    per_file_data = [ldadata_list, pcadata_list, nmfdata_list, ssnmfdata_list, data_list]
    for file_ind, mapped_data in enumerate(per_file_data):
        # the context manager closes (and flushes) each file once it is written
        with open(OUTPUT_FILES[file_ind], 'wb') as f:
            pickle.dump([mapped_data, classlabs, audiolabs], f)
    

if __name__ == '__main__':
    # Run the LDA-only mapping first because it is fast.
    mapped = lda_map_and_average_frames(min_variance=0.99)
    (data_list, pcadata_list, ldadata_list, nmfdata_list,
     ssnmfdata_list, classlabs, audiolabs) = mapped
    write_output(data_list, pcadata_list, ldadata_list, nmfdata_list,
                 ssnmfdata_list, classlabs, audiolabs)
    # To add nmf and ssnmf afterwards, run the full mapping instead:
    #data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs = map_and_average_frames(min_variance=0.99)
    #write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs)