Mercurial > hg > plosone_underreview
changeset 31:03ff14ba9fa2 branch-tests
merged
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Wed, 13 Sep 2017 19:58:10 +0100 |
parents | e8084526f7e5 (diff) e4736064d282 (current diff) |
children | 928d9bf9224f |
files | notebooks/test_hubness.ipynb |
diffstat | 8 files changed, 139 insertions(+), 42 deletions(-) [+] |
line wrap: on
line diff
--- a/notebooks/test_hubness.ipynb Wed Sep 13 19:56:39 2017 +0100 +++ b/notebooks/test_hubness.ipynb Wed Sep 13 19:58:10 2017 +0100 @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -27,23 +27,17 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 3, + "metadata": { + "collapsed": false + }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n", - " warnings.warn(\"There are %d disconnected observations\" % ni)\n", - "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n", - " warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n" - ] - }, - { "name": "stdout", "output_type": "stream", "text": [ + "WARNING: there are 21 disconnected observations\n", + "Island ids: [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n", "Antigua and Barbuda\n", "Australia\n", "Cuba\n", @@ -83,7 +77,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": { @@ -114,6 +110,8 @@ { "cell_type": "code", "execution_count": 5, + "collapsed": false + }, "metadata": {}, "outputs": [ { @@ -134,6 +132,8 @@ { "cell_type": "code", "execution_count": 6, + "collapsed": false + }, "metadata": {}, "outputs": [ { @@ -171,6 +171,11 @@ { "cell_type": "code", "execution_count": 8, + "outputs": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-1-0aacb5dec8fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mN_k\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_occurrence_from_D\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mskew\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_k\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_k\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'n_occurrence_from_D' is not defined" "metadata": {}, "outputs": [ { @@ -234,7 +239,9 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false + }, "outputs": [ { "data": {
--- a/scripts/classification.py Wed Sep 13 19:56:39 2017 +0100 +++ b/scripts/classification.py Wed Sep 13 19:58:10 2017 +0100 @@ -63,8 +63,8 @@ def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False): feat_learner = util_feature_learning.Transformer() - accuracy, predictions = util_feature_learning.classification_accuracy(X_train, Y_train, - X_test, Y_test, model=util_feature_learning.modelLDA) + accuracy, predictions = feat_learner.classification_accuracy(X_train, Y_train, + X_test, Y_test, model=feat_learner.modelLDA) labels = np.unique(Y_test) # TODO: countries in geographical proximity CF = metrics.confusion_matrix(Y_test, predictions, labels=labels) if saveCF:
--- a/scripts/util_feature_learning.py Wed Sep 13 19:56:39 2017 +0100 +++ b/scripts/util_feature_learning.py Wed Sep 13 19:58:10 2017 +0100 @@ -65,9 +65,6 @@ print "transform test data..." pca_testdata = self.pca_transformer.transform(X_test) lda_testdata = self.lda_transformer.transform(X_test) - #norm_testdata = normalize(X_test - np.min(X_test)) - #nmf_testdata = self.nmf_transformer.transform(norm_testdata) - #ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H)) transformed_data = {'none': X_test, 'pca': pca_testdata, 'lda': lda_testdata, 'nmf': [], @@ -124,24 +121,8 @@ accuracy = metrics.f1_score(Y_test, predictions, average='weighted') # for imbalanced classes return accuracy, predictions - + def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "): - modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean') - modelLDA = LDA() - modelSVM = svm.SVC(kernel='rbf', gamma=0.1) - modelRF = RandomForestClassifier() - model_labels = ['KNN', 'LDA', 'SVM', 'RF'] - models = [modelKNN, modelLDA, modelSVM, modelRF] - df_results = pd.DataFrame() - for model, model_label in zip(models, model_labels): - acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model) - print model_label + " " + transform_label + " " + str(acc) - df_results = df_results.append(pd.DataFrame([[model_label, acc]])) - #self.df_results = df_results - return df_results - - - def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "): self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean') self.modelLDA = LDA() self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1) @@ -153,7 +134,6 @@ acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model) print model_label + " " + transform_label + " " + str(acc) df_results = df_results.append(pd.DataFrame([[model_label, acc]])) - #self.df_results = df_results return df_results
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_classification.py Wed Sep 13 19:58:10 2017 +0100 @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 1 19:11:52 2017 + +@author: mariapanteli +""" + +import pytest + +import numpy as np +from sklearn.model_selection import train_test_split + +import scripts.classification as classification + + +def test_confusion_matrix(): + X = np.random.randn(100, 3) + # create 2 classes by shifting the entries of half the samples + X[-50:, :] = X[-50:, :] + 10 + Y = np.concatenate([np.repeat('a', 50), np.repeat('b', 50)]) + X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.6, random_state=1, stratify=Y) + accuracy, predictions = classification.confusion_matrix(X_train, Y_train, X_test, Y_test) + # expect perfect accuracy for this 'easy' dataset + assert accuracy == 1.0 +
--- a/tests/test_map_and_average.py Wed Sep 13 19:56:39 2017 +0100 +++ b/tests/test_map_and_average.py Wed Sep 13 19:58:10 2017 +0100 @@ -34,4 +34,5 @@ features = np.array([[0, 1], [0,2], [0, 1], [1, 1], [2, 1]]) audiolabels = np.array(['a', 'a', 'b', 'b', 'b']) feat, audio, labels = map_and_average.averageframes(features, audiolabels, classlabels) - feat_true = np.array([[0, 0.5], [1, 1]]) \ No newline at end of file + feat_true = np.array([[0, 1.5], [1, 1]]) + assert np.array_equal(feat, feat_true) \ No newline at end of file
--- a/tests/test_outliers.py Wed Sep 13 19:56:39 2017 +0100 +++ b/tests/test_outliers.py Wed Sep 13 19:58:10 2017 +0100 @@ -8,9 +8,6 @@ import pytest import numpy as np -import pandas as pd -import pickle -import os import scripts.outliers as outliers
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_util_feature_learning.py Wed Sep 13 19:58:10 2017 +0100 @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 1 19:11:52 2017 + +@author: mariapanteli +""" + +import pytest + +import numpy as np + +import scripts.util_feature_learning as util_feature_learning + + +feat_learner = util_feature_learning.Transformer() + + +def test_ssnmf_fit(): + assert True
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_utils.py Wed Sep 13 19:58:10 2017 +0100 @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Sep 1 19:11:52 2017 + +@author: mariapanteli +""" + +import pytest + +import numpy as np +import pandas as pd +import pickle +import os + +import scripts.utils as utils + + +def test_get_outliers(): + np.random.seed(1) + X = np.random.randn(100, 3) + # create outliers by shifting the entries of the last 5 samples + X[-5:, :] = X[-5:, :] + 10 + Y = np.concatenate([np.repeat('a', 95), np.repeat('b', 5)]) + threshold, y_pred, MD = utils.get_outliers(X) + # expect that items from country 'b' are detected as outliers + assert np.array_equal(y_pred[-5:], np.ones(5)) + + +def test_get_outliers(): + np.random.seed(1) + X = np.random.randn(100, 3) + # create outliers by shifting the entries of the last 5 samples + X[-5:, :] = X[-5:, :] + 10 + Y = np.concatenate([np.repeat('a', 95), np.repeat('b', 5)]) + threshold, y_pred, MD = utils.get_outliers_Mahal(X) + # expect that items from country 'b' are detected as outliers + assert np.array_equal(y_pred[-5:], np.ones(5)) + + +def test_pca_data(): + np.random.seed(1) + X = np.random.randn(100, 3) + X[-5:, :] = X[-5:, :] + 10 + X_pca, n_pc = utils.pca_data(X, min_variance=0.8) + assert n_pc < X.shape[1] + + +def test_get_local_outliers_from_neighbors_dict(): + np.random.seed(1) + X = np.random.randn(100, 3) + n_outliers = 3 + X[-n_outliers:, :] = X[-n_outliers:, :] + 10 + Y = np.concatenate([np.repeat('a', 20), np.repeat('b', 20), np.repeat('c', 20), + np.repeat('k', 20), np.repeat('l', 20)]) + w_dict = {'a': ['b', 'c'], 'b': ['a', 'c'], 'c': ['b', 'a'], 'k': ['l'], 'l':['k']} + spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict) + # last n samples of 'l' country must be outliers + assert np.array_equal(spatial_outliers[-1][3][-n_outliers:], np.ones(n_outliers)) + + +def test_best_n_clusters_silhouette(): + np.random.seed(1) + X = np.random.randn(100, 3) + X[:30, :] = X[:30, :] + 10 + X[-30:, :] = X[-30:, :] + 20 + bestncl, _ = utils.best_n_clusters_silhouette(X, max_ncl=10) + assert bestncl == 3 +