# HG changeset patch
# User Maria Panteli
# Date 1505153200 -3600
# Node ID c4841876a8ff40579d332427cd8cac6cf1dadb9c
# Parent 0f3eba42b425c1cb4a05015f5c54620a185642a2
adding notebooks and trying to explain classifier coefficients

diff -r 0f3eba42b425 -r c4841876a8ff notebooks/explain_components.ipynb
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/notebooks/explain_components.ipynb Mon Sep 11 19:06:40 2017 +0100
@@ -0,0 +1,178 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('../')\n",
+    "import scripts.map_and_average as mapper\n",
+    "import scripts.util_feature_learning as util_feature_learning"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/import/c4dm-04/mariap/train_data_melodia_8.pickle\n"
+     ]
+    },
+    {
+     "ename": "IOError",
+     "evalue": "[Errno 2] No such file or directory: '/import/c4dm-04/mariap/train_data_melodia_8.pickle'",
+     "output_type": "error",
+     "traceback": [
+      "IOError                                   Traceback (most recent call last)",
+      "<ipython-input-8> in <module>()",
+      "----> 1 trainset, valset, testset = mapper.load_train_val_test_sets()",
+      "/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/plosone_underreview/scripts/map_and_average.pyc in load_train_val_test_sets()",
+      "     70     print INPUT_FILES[0]",
+      "---> 71     trainset = load_data_from_pickle(INPUT_FILES[0])",
+      "/Users/mariapanteli/Documents/QMUL/Code/MyPythonCode/plosone_underreview/scripts/map_and_average.pyc in load_data_from_pickle(pickle_file)",
+      "---> 58     with open(pickle_file, 'rb') as f:",
+      "IOError: [Errno 2] No such file or directory: '/import/c4dm-04/mariap/train_data_melodia_8.pickle'"
+     ]
+    }
+   ],
+   "source": [
+    "trainset, valset, testset = mapper.load_train_val_test_sets()\n",
+    "traindata, trainlabels, trainaudiolabels = trainset\n",
+    "valdata, vallabels, valaudiolabels = valset\n",
+    "testdata, testlabels, testaudiolabels = testset\n",
+    "labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()\n",
+    "audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()\n",
+    "print traindata.shape, valdata.shape, testdata.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explain LDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "min_variance = 0.99\n",
+    "feat_labels, feat_inds = mapper.get_feat_inds(n_dim=traindata.shape[1])\n",
+    "for i in range(len(feat_inds)):\n",
+    "    print \"mapping \" + feat_labels[i]\n",
+    "    inds = feat_inds[i]\n",
+    "    ssm_feat = util_feature_learning.Transformer()\n",
+    "    if min_variance is not None:\n",
+    "        ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)\n",
+    "        n_components = np.where(ssm_feat.pca_transformer.explained_variance_ratio_.cumsum() > min_variance)[0][0] + 1\n",
+    "        print n_components, len(inds)\n",
+    "        ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)\n",
+    "\n",
+    "    WW = ssm_feat.lda_transformer.scalings_\n",
+    "    plt.figure()\n",
+    "    plt.imshow(WW[:, :n_components].T, aspect='auto')\n",
+    "    plt.colorbar()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explain classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X_list, Y, Yaudio = pickle.load(open('../data/lda_data_melodia_8.pickle', 'rb'))\n",
+    "Xrhy, Xmel, Xmfc, Xchr = X_list\n",
+    "X = np.concatenate((Xrhy, Xmel, Xmfc, Xchr), axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# X_train/X_test are not defined above; a train/test split of (X, Y) is assumed here:\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)\n",
+    "ssm_feat.classify_and_save(X_train, Y_train, X_test, Y_test, transform_label=\" \")"
+   ]
+  },
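+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick check of what `scalings_` means (a sketch, assuming sklearn's default LDA `solver='svd'`): `transform()` centers the data before projecting, so `X.dot(scalings_)` differs from `transform(X)` only by the constant row offset `xbar_.dot(scalings_)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# uses ssm_feat and inds from the last loop iteration above\n",
+    "lda = ssm_feat.lda_transformer\n",
+    "Xsub = traindata[:, inds]\n",
+    "Xlda = lda.transform(Xsub)\n",
+    "k = Xlda.shape[1]\n",
+    "offset = np.dot(lda.xbar_, lda.scalings_)[:k]\n",
+    "Xww = np.dot(Xsub, lda.scalings_)[:, :k]\n",
+    "print np.allclose(Xlda, Xww - offset)  # expect True"
+   ]
+  },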
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def components_plot(lda_transformer, XX, n_comp=42, figurename=None):\n",
+    "    WW = lda_transformer.scalings_\n",
+    "    Xlda = lda_transformer.transform(XX)\n",
+    "    Xww = np.dot(XX, WW[:, :n_comp])\n",
+    "    plt.figure()\n",
+    "    plt.imshow(Xlda - Xww, aspect='auto')\n",
+    "    plt.figure()\n",
+    "    plt.imshow(Xlda, aspect='auto')\n",
+    "    plt.figure()\n",
+    "    plt.imshow(Xww, aspect='auto')\n",
+    "    plt.figure()\n",
+    "    plt.imshow(WW[:, :n_comp], aspect='auto')  # the weights of the first n_comp components\n",
+    "    if figurename is not None:\n",
+    "        plt.savefig(figurename)\n",
+    "\n",
+    "XX = traindata[:, inds]\n",
+    "components_plot(ssm_feat.lda_transformer, XX, n_comp=n_components)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff -r 0f3eba42b425 -r c4841876a8ff scripts/map_and_average.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/map_and_average.py Mon Sep 11 19:06:40 2017 +0100
@@ -0,0 +1,210 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar 16 02:44:07 2017
+
+@author: mariapanteli
+"""
+
+import numpy as np
+import pickle
+
+import util_feature_learning
+
+WIN_SIZE = 8
+INPUT_FILES = ['/import/c4dm-04/mariap/train_data_melodia_' + str(WIN_SIZE) + '.pickle',
+               '/import/c4dm-04/mariap/val_data_melodia_' + str(WIN_SIZE) + '.pickle',
+               '/import/c4dm-04/mariap/test_data_melodia_' + str(WIN_SIZE) + '.pickle']
+OUTPUT_FILES = ['/import/c4dm-04/mariap/lda_data_melodia_' + str(WIN_SIZE) + '.pickle',
+                '/import/c4dm-04/mariap/pca_data_melodia_' + str(WIN_SIZE) + '.pickle',
+                '/import/c4dm-04/mariap/nmf_data_melodia_' + str(WIN_SIZE) + '.pickle',
+                '/import/c4dm-04/mariap/ssnmf_data_melodia_' + str(WIN_SIZE) + '.pickle',
+                '/import/c4dm-04/mariap/na_data_melodia_' + str(WIN_SIZE) + '.pickle']
+
+
+def remove_inds(features, labels, audiolabels):
+    '''remove instances with unknown or unidentified country
+    '''
+    remove_inds1 = np.where(labels == 'unknown')[0]
+    remove_inds2 = np.where(labels == 'Unidentified')[0]
+    # sort so the original frame order is preserved after the set difference
+    keep_inds = np.sort(np.array(list(set(range(len(labels))) - (set(remove_inds1) | set(remove_inds2)))))
+    features = features[keep_inds, :]
+    labels = labels[keep_inds]
+    audiolabels = audiolabels[keep_inds]
+    return features, labels, audiolabels
+
+
+def averageframes(features, audiolabels, classlabels):
+    '''average frame-based features for each recording
+    '''
+    u, ind = np.unique(audiolabels, return_index=True)
+    uniqsorted = u[np.argsort(ind)]
+    newfeatures = []
+    newclasslabels = []
+    newaudiolabels = []
+    for aulabel in uniqsorted:
+        inds = np.where(audiolabels == aulabel)[0]
+        newfeatures.append(np.mean(features[inds, :], axis=0))
+        newclasslabels.append(classlabels[inds[0]])
+        newaudiolabels.append(aulabel)
+    newfeatures = np.array(newfeatures)
+    newaudiolabels = np.array(newaudiolabels)
+    newclasslabels = np.array(newclasslabels)
+    return newfeatures, newaudiolabels, newclasslabels
+
+
+def load_data_from_pickle(pickle_file=None):
+    '''load frame-based features and labels from pickle file
+    '''
+    with open(pickle_file, 'rb') as f:
+        data, labels, audiolabels = pickle.load(f)
+    # remove 'unknown' and 'unidentified' country
+    data, labels, audiolabels = remove_inds(data, labels, audiolabels)
+    # avoid nan which gives error in feature learning
+    data[np.isnan(data)] = 0
+    return data, labels, audiolabels
+
+
+def load_train_val_test_sets():
+    '''load train, val, test sets
+    '''
+    print INPUT_FILES[0]
+    trainset = load_data_from_pickle(INPUT_FILES[0])
+    valset = load_data_from_pickle(INPUT_FILES[1])
+    testset = load_data_from_pickle(INPUT_FILES[2])
+    return trainset, valset, testset
+
+
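+# A minimal sanity check for averageframes on toy values (hypothetical data,
+# not part of the pipeline): frames from the same recording collapse to one
+# mean row, and labels follow the first occurrence of each recording.
+def _demo_averageframes():
+    feats = np.array([[0., 2.], [2., 0.], [4., 4.]])
+    audiolabs = np.array(['rec1', 'rec1', 'rec2'])
+    classlabs = np.array(['GR', 'GR', 'UK'])
+    f, a, c = averageframes(feats, audiolabs, classlabs)
+    # expect f == [[1., 1.], [4., 4.]], a == ['rec1', 'rec2'], c == ['GR', 'UK']
+    print f, a, c
+
+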
+def get_feat_inds(n_dim=840):
+    '''return the index ranges of the rhythm, melody, MFCC and chroma
+       features for each known frame dimensionality
+    '''
+    if n_dim == 840:
+        rhy_inds = np.arange(400)
+        mel_inds = np.arange(400, 640)
+        mfc_inds = np.arange(640, 720)
+        chr_inds = np.arange(720, 840)
+    elif n_dim == 640:
+        rhy_inds = np.arange(200)
+        mel_inds = np.arange(200, 440)
+        mfc_inds = np.arange(440, 520)
+        chr_inds = np.arange(520, 640)
+    elif n_dim == 460:
+        rhy_inds = np.arange(200)
+        mel_inds = np.arange(200, 260)
+        mfc_inds = np.arange(260, 340)
+        chr_inds = np.arange(340, 460)
+    elif n_dim == 660:
+        rhy_inds = np.arange(400)
+        mel_inds = np.arange(400, 460)
+        mfc_inds = np.arange(460, 540)
+        chr_inds = np.arange(540, 660)
+    else:
+        raise ValueError('unexpected frame dimensionality ' + str(n_dim))
+    feat_inds = [rhy_inds, mel_inds, mfc_inds, chr_inds]
+    feat_labels = ['rhy', 'mel', 'mfc', 'chr']
+    return feat_labels, feat_inds
+
+
+def map_and_average_frames(dataset=None, n_components=None, min_variance=None):
+    if dataset is None:
+        trainset, valset, testset = load_train_val_test_sets()
+    else:
+        trainset, valset, testset = dataset
+    traindata, trainlabels, trainaudiolabels = trainset
+    valdata, vallabels, valaudiolabels = valset
+    testdata, testlabels, testaudiolabels = testset
+    print traindata.shape, valdata.shape, testdata.shape
+    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
+    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()
+
+    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
+    ldadata_list = []
+    pcadata_list = []
+    nmfdata_list = []
+    ssnmfdata_list = []
+    data_list = []
+    for i in range(len(feat_inds)):
+        print "mapping " + feat_labels[i]
+        inds = feat_inds[i]
+        ssm_feat = util_feature_learning.Transformer()
+        if min_variance is not None:
+            # fit PCA on all dimensions, then keep as many components as
+            # needed to explain min_variance of the variance
+            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
+            n_components = np.where(ssm_feat.pca_transformer.explained_variance_ratio_.cumsum() > min_variance)[0][0] + 1
+            print n_components, len(inds)
+            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_components)
+        elif n_components is not None:
+            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=n_components)
+        else:
+            ssm_feat.fit_data(traindata[:, inds], trainlabels, n_components=len(inds))
+        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
+        transformed_data_dict = ssm_feat.transform_data(all_data)
+        for key in transformed_data_dict.keys():
+            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
+            transformed_data_dict[key] = average_data
+        data_list.append(transformed_data_dict['none'])
+        pcadata_list.append(transformed_data_dict['pca'])
+        ldadata_list.append(transformed_data_dict['lda'])
+        nmfdata_list.append(transformed_data_dict['nmf'])
+        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
+    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
+
+
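+# The min_variance rule above keeps the smallest number of leading PCA
+# components whose cumulative explained variance exceeds the threshold.
+# A sketch with made-up ratios (not values from the real data):
+def _demo_min_variance_rule():
+    ratios = np.array([0.6, 0.3, 0.08, 0.02])
+    n_components = np.where(ratios.cumsum() > 0.9)[0][0] + 1
+    print n_components  # -> 3: the first three components exceed the 0.9 threshold
+
+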
+def lda_map_and_average_frames(dataset=None, n_components=None, min_variance=None):
+    if dataset is None:
+        trainset, valset, testset = load_train_val_test_sets()
+    else:
+        trainset, valset, testset = dataset
+    traindata, trainlabels, trainaudiolabels = trainset
+    valdata, vallabels, valaudiolabels = valset
+    testdata, testlabels, testaudiolabels = testset
+    print traindata.shape, valdata.shape, testdata.shape
+    labels = np.concatenate((trainlabels, vallabels, testlabels)).ravel()
+    audiolabels = np.concatenate((trainaudiolabels, valaudiolabels, testaudiolabels)).ravel()
+
+    feat_labels, feat_inds = get_feat_inds(n_dim=traindata.shape[1])
+    ldadata_list = []
+    pcadata_list = []
+    nmfdata_list = []
+    ssnmfdata_list = []
+    data_list = []
+    for i in range(len(feat_inds)):
+        print "mapping " + feat_labels[i]
+        inds = feat_inds[i]
+        ssm_feat = util_feature_learning.Transformer()
+        if min_variance is not None:
+            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds), pca_only=True)
+            n_components = np.where(ssm_feat.pca_transformer.explained_variance_ratio_.cumsum() > min_variance)[0][0] + 1
+            print n_components, len(inds)
+            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)
+        elif n_components is not None:
+            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=n_components)
+        else:
+            ssm_feat.fit_lda_data(traindata[:, inds], trainlabels, n_components=len(inds))
+        all_data = np.concatenate((traindata[:, inds], valdata[:, inds], testdata[:, inds]), axis=0)
+        transformed_data_dict = ssm_feat.transform_lda_data(all_data)
+        for key in transformed_data_dict.keys():
+            if len(transformed_data_dict[key]) == 0:
+                continue
+            average_data, audiolabs, classlabs = averageframes(transformed_data_dict[key], audiolabels, labels)
+            transformed_data_dict[key] = average_data
+        data_list.append(transformed_data_dict['none'])
+        pcadata_list.append(transformed_data_dict['pca'])
+        ldadata_list.append(transformed_data_dict['lda'])
+        nmfdata_list.append(transformed_data_dict['nmf'])
+        ssnmfdata_list.append(transformed_data_dict['ssnmf'])
+    return data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs
+
+
+def write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs):
+    pickle.dump([ldadata_list, classlabs, audiolabs], open(OUTPUT_FILES[0], 'wb'))
+    pickle.dump([pcadata_list, classlabs, audiolabs], open(OUTPUT_FILES[1], 'wb'))
+    pickle.dump([nmfdata_list, classlabs, audiolabs], open(OUTPUT_FILES[2], 'wb'))
+    pickle.dump([ssnmfdata_list, classlabs, audiolabs], open(OUTPUT_FILES[3], 'wb'))
+    pickle.dump([data_list, classlabs, audiolabs], open(OUTPUT_FILES[4], 'wb'))
+
+
+if __name__ == '__main__':
+    # LDA (and PCA) only first, because it is fast
+    data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs = lda_map_and_average_frames(min_variance=0.99)
+    write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs)
+    # then add nmf, ssnmf
+    #data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs = map_and_average_frames(min_variance=0.99)
+    #write_output(data_list, pcadata_list, ldadata_list, nmfdata_list, ssnmfdata_list, classlabs, audiolabs)
\ No newline at end of file
diff -r 0f3eba42b425 -r c4841876a8ff scripts/nmftools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/nmftools.py Mon Sep 11 19:06:40 2017 +0100
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Oct 26 12:46:13 2016
+
+@author: https://github.com/keik/nmftools/blob/master/nmftools/core.py
+"""
+
+import numpy as np
+
+
+def nmf(Y, R=3, n_iter=50, init_H=[], init_U=[], verbose=False):
+    """
+    decompose non-negative matrix to components and activation with NMF
+    Y ≈ HU
+    Y ∈ R (m, n)
+    H ∈ R (m, k)
+    U ∈ R (k, n)
+    parameters
+    ----
+    Y: target matrix to decompose
+    R: number of bases to decompose
+    n_iter: number of iterations of the multiplicative updates
+    init_H: initial value of H matrix. default value is random matrix
+    init_U: initial value of U matrix. default value is random matrix
+    return
+    ----
+    Array of:
+    0: matrix of H
+    1: matrix of U
+    2: array of cost transition
+    """
+
+    eps = np.spacing(1)
+
+    # size of input spectrogram
+    M = Y.shape[0]
+    N = Y.shape[1]
+
+    # initialization
+    if len(init_U):
+        U = init_U
+        R = init_U.shape[0]
+    else:
+        U = np.random.rand(R, N)
+
+    if len(init_H):
+        H = init_H
+        R = init_H.shape[1]
+    else:
+        H = np.random.rand(M, R)
+
+    # array to save the value of the Euclidean divergence
+    cost = np.zeros(n_iter)
+
+    # computation of Lambda (estimate of Y)
+    Lambda = np.dot(H, U)
+
+    # iterative computation
+    for i in range(n_iter):
+
+        # compute Euclidean divergence
+        cost[i] = euclid_divergence(Y, Lambda)
+
+        # update H
+        H *= np.dot(Y, U.T) / (np.dot(np.dot(H, U), U.T) + eps)
+
+        # update U
+        U *= np.dot(H.T, Y) / (np.dot(np.dot(H.T, H), U) + eps)
+
+        # recomputation of Lambda
+        Lambda = np.dot(H, U)
+
+    return [H, U, cost]
+
+
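+# A quick smoke test for nmf() on a small random non-negative matrix
+# (synthetic data; R and n_iter chosen arbitrarily for illustration):
+def _demo_nmf():
+    np.random.seed(0)
+    Y = np.abs(np.random.rand(8, 6))
+    H, U, cost = nmf(Y, R=2, n_iter=100)
+    # the cost should decrease over iterations
+    print cost[0], cost[-1]
+    print np.linalg.norm(Y - np.dot(H, U))
+
+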
+def ssnmf(Y, R=3, n_iter=50, F=[], init_G=[], init_H=[], init_U=[], verbose=False):
+    """
+    decompose non-negative matrix to components and activation with semi-supervised NMF
+    Y ≈ FG + HU
+    Y ∈ R (m, n)
+    F ∈ R (m, x)
+    G ∈ R (x, n)
+    H ∈ R (m, k)
+    U ∈ R (k, n)
+    parameters
+    ----
+    Y: target matrix to decompose
+    R: number of bases to decompose
+    n_iter: number of iterations of the multiplicative updates
+    F: matrix of supervised base components (required)
+    init_G: initial value of G matrix. default value is random matrix
+    init_H: initial value of H matrix. default value is random matrix
+    init_U: initial value of U matrix. default value is random matrix
+    return
+    ----
+    Array of:
+    0: matrix of F
+    1: matrix of G
+    2: matrix of H
+    3: matrix of U
+    4: array of cost transition
+    """
+
+    eps = np.spacing(1)
+
+    # size of input spectrogram
+    M = Y.shape[0]
+    N = Y.shape[1]
+    X = F.shape[1]
+
+    # initialization
+    if len(init_G):
+        G = init_G
+        X = init_G.shape[1]
+    else:
+        G = np.random.rand(X, N)
+
+    if len(init_U):
+        U = init_U
+        R = init_U.shape[0]
+    else:
+        U = np.random.rand(R, N)
+
+    if len(init_H):
+        H = init_H
+        R = init_H.shape[1]
+    else:
+        H = np.random.rand(M, R)
+
+    # array to save the value of the Euclidean divergence
+    cost = np.zeros(n_iter)
+
+    # computation of Lambda (estimate of Y)
+    Lambda = np.dot(F, G) + np.dot(H, U)
+
+    # iterative computation
+    for it in range(n_iter):
+
+        # compute Euclidean divergence
+        cost[it] = euclid_divergence(Y, Lambda + eps)
+
+        # update of H
+        H *= (np.dot(Y, U.T) + eps) / (np.dot(np.dot(H, U) + np.dot(F, G), U.T) + eps)
+
+        # update of U
+        U *= (np.dot(H.T, Y) + eps) / (np.dot(H.T, np.dot(H, U) + np.dot(F, G)) + eps)
+
+        # update of G (the row indexing is a no-op kept from the upstream implementation)
+        G *= (np.dot(F.T, Y) + eps)[np.arange(G.shape[0])] / (np.dot(F.T, np.dot(H, U) + np.dot(F, G)) + eps)
+
+        # recomputation of Lambda (estimate of Y)
+        Lambda = np.dot(H, U) + np.dot(F, G)
+
+    return [F, G, H, U, cost]
+
+
+def euclid_divergence(V, Vh):
+    # 0.5 * squared Euclidean distance; note that 1 / 2 would be integer division (0) in Python 2
+    d = 0.5 * (V ** 2 + Vh ** 2 - 2 * V * Vh).sum()
+    return d
\ No newline at end of file
diff -r 0f3eba42b425 -r c4841876a8ff scripts/util_feature_learning.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/util_feature_learning.py Mon Sep 11 19:06:40 2017 +0100
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr 3 15:14:40 2017
+
+@author: mariapanteli
+"""
+
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelBinarizer
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
+from sklearn.decomposition import PCA
+from sklearn.decomposition import NMF
+from sklearn.preprocessing import scale
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn import svm
+from sklearn import metrics
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import normalize
+from numpy.linalg import pinv
+
+import nmftools
+
+
+class Transformer:
+    def __init__(self):
+        self.pca_transformer = None
+        self.lda_transformer = None
+        self.nmf_transformer = None
+        self.ssnmf_H = None
+        self.modelKNN = None
+        self.modelLDA = None
+        self.modelSVM = None
+        self.modelRF = None
+        #self.df_results = None
+
+
+    def ssnmf_fit(self, data, labels, npc=None):
+        binarizer = LabelBinarizer()
+        F_class = binarizer.fit_transform(labels)
+        F, G, W, H, cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
+        ssWH = np.dot(F, G) + np.dot(W, H)
+        rec_err = np.linalg.norm(data - ssWH)
+        return G, W, H, rec_err
+
+
+    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
+        X_train = scale(X_train, axis=0)
+        # then pca
+        print "training with PCA transform..."
+        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
+        print "variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_))
+        if pca_only:
+            # return pca transformer only
+            return
+        # then lda
+        print "training with LDA transform..."
+        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
+        print "variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_))
+
+
+    def transform_lda_data(self, X_test):
+        X_test = scale(X_test, axis=0)
+        print "transform test data..."
+        pca_testdata = self.pca_transformer.transform(X_test)
+        lda_testdata = self.lda_transformer.transform(X_test)
+        #norm_testdata = normalize(X_test - np.min(X_test))
+        #nmf_testdata = self.nmf_transformer.transform(norm_testdata)
+        #ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
+        transformed_data = {'none': X_test, 'pca': pca_testdata,
+                            'lda': lda_testdata,
+                            'nmf': [],
+                            'ssnmf': []}
+        return transformed_data
+
+
+    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
+        if n_components is None:
+            n_components = X_train.shape[1]
+        X_train = scale(X_train, axis=0)
+        # then pca
+        print "training with PCA transform..."
+        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
+        print "variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_))
+        if pca_only:
+            # return pca transformer only
+            return
+        # then lda
+        print "training with LDA transform..."
+        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
+        print "variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_))
+        # then nmf
+        print "training with NMF transform..."
+        norm_traindata = normalize(X_train - np.min(X_train))
+        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
+        print "reconstruction error " + str(np.sum(self.nmf_transformer.reconstruction_err_))
+        # then ssnmf
+        print "training with SSNMF transform..."
+        G, W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata, Y_train, npc=n_components)
+        print "reconstruction error " + str(rec_err)
+
+
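+    # Note on the SSNMF projection in transform_data below (a sketch, assuming
+    # an exact factorization): if data ≈ W·H with H the fitted activations
+    # (self.ssnmf_H), then data·pinv(H) ≈ W recovers the per-sample components:
+    #   W = np.abs(np.random.rand(4, 2)); H = np.abs(np.random.rand(2, 6))
+    #   np.allclose(np.dot(np.dot(W, H), pinv(H)), W)  # True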
+    def transform_data(self, X_test):
+        X_test = scale(X_test, axis=0)
+        print "transform test data..."
+        pca_testdata = self.pca_transformer.transform(X_test)
+        lda_testdata = self.lda_transformer.transform(X_test)
+        norm_testdata = normalize(X_test - np.min(X_test))
+        nmf_testdata = self.nmf_transformer.transform(norm_testdata)
+        ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
+        transformed_data = {'none': X_test, 'pca': pca_testdata,
+                            'lda': lda_testdata,
+                            'nmf': nmf_testdata,
+                            'ssnmf': ssnmf_testdata}
+        return transformed_data
+
+
+    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
+        if model is None:
+            model = LDA()
+        model.fit(X_train, Y_train)
+        predictions = model.predict(X_test)
+        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')  # weighted F-score for imbalanced classes
+        return accuracy, predictions
+
+
+    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
+        modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
+        modelLDA = LDA()
+        modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
+        modelRF = RandomForestClassifier()
+        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
+        models = [modelKNN, modelLDA, modelSVM, modelRF]
+        df_results = pd.DataFrame()
+        for model, model_label in zip(models, model_labels):
+            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
+            print model_label + " " + transform_label + " " + str(acc)
+            df_results = df_results.append(pd.DataFrame([[model_label, acc]]))
+        #self.df_results = df_results
+        return df_results
+
+
+    def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
+        self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
+        self.modelLDA = LDA()
+        self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
+        self.modelRF = RandomForestClassifier()
+        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
+        # keep references on self so the fitted models can be reused later
+        models = [self.modelKNN, self.modelLDA, self.modelSVM, self.modelRF]
+        df_results = pd.DataFrame()
+        for model, model_label in zip(models, model_labels):
+            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
+            print model_label + " " + transform_label + " " + str(acc)
+            df_results = df_results.append(pd.DataFrame([[model_label, acc]]))
+        #self.df_results = df_results
+        return df_results
+
+
+if __name__ == '__main__':
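+    # A small smoke test on synthetic data (random features, three made-up
+    # classes); real features come from map_and_average.py:
+    np.random.seed(0)
+    X = np.random.rand(60, 10)
+    Y = np.repeat(np.array(['a', 'b', 'c']), 20)
+    transformer = Transformer()
+    transformer.fit_lda_data(X, Y, n_components=2)
+    transformed = transformer.transform_lda_data(X)
+    transformer.classify(transformed['lda'], Y, transformed['lda'], Y, transform_label="lda")
\ No newline at end of file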