Mercurial > hg > plosone_underreview

--- a/notebooks/test_hubness.ipynb	Wed Sep 13 19:56:39 2017 +0100
+++ b/notebooks/test_hubness.ipynb	Wed Sep 13 19:58:10 2017 +0100
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {
     "collapsed": true
    },
@@ -27,23 +27,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:189: UserWarning: There are 21 disconnected observations\n",
-      "  warnings.warn(\"There are %d disconnected observations\" % ni)\n",
-      "/homes/mp305/anaconda/lib/python2.7/site-packages/pysal/weights/weights.py:190: UserWarning: Island ids: 3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121\n",
-      "  warnings.warn(\"Island ids: %s\" % ', '.join(str(island) for island in self.islands))\n"
-     ]
-    },
-    {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "WARNING: there are 21 disconnected observations\n",
+      "Island ids:  [3, 6, 26, 35, 39, 45, 52, 61, 62, 66, 77, 85, 94, 97, 98, 102, 103, 107, 110, 120, 121]\n",
       "Antigua and Barbuda\n",
       "Australia\n",
       "Cuba\n",
@@ -83,7 +77,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -114,6 +110,8 @@
   {
    "cell_type": "code",
    "execution_count": 5,
+    "collapsed": false
+   },
    "metadata": {},
    "outputs": [
     {
@@ -134,6 +132,8 @@
   {
    "cell_type": "code",
    "execution_count": 6,
+    "collapsed": false
+   },
    "metadata": {},
    "outputs": [
     {
@@ -171,6 +171,11 @@
   {
    "cell_type": "code",
    "execution_count": 8,
+   "outputs": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-1-0aacb5dec8fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mN_k\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn_occurrence_from_D\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0mskew\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_k\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mN_k\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'n_occurrence_from_D' is not defined"
    "metadata": {},
    "outputs": [
     {
@@ -234,7 +239,9 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {},
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
--- a/scripts/classification.py	Wed Sep 13 19:56:39 2017 +0100
+++ b/scripts/classification.py	Wed Sep 13 19:58:10 2017 +0100
@@ -63,8 +63,8 @@

 def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
     feat_learner = util_feature_learning.Transformer()
-    accuracy, predictions = util_feature_learning.classification_accuracy(X_train, Y_train,
-                        X_test, Y_test, model=util_feature_learning.modelLDA)
+    accuracy, predictions = feat_learner.classification_accuracy(X_train, Y_train,
+                        X_test, Y_test, model=feat_learner.modelLDA)
     labels = np.unique(Y_test)  # TODO: countries in geographical proximity
     CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
     if saveCF:
--- a/scripts/util_feature_learning.py	Wed Sep 13 19:56:39 2017 +0100
+++ b/scripts/util_feature_learning.py	Wed Sep 13 19:58:10 2017 +0100
@@ -65,9 +65,6 @@
         print "transform test data..."
         pca_testdata = self.pca_transformer.transform(X_test)
         lda_testdata = self.lda_transformer.transform(X_test)
-        #norm_testdata = normalize(X_test - np.min(X_test))
-        #nmf_testdata = self.nmf_transformer.transform(norm_testdata)
-        #ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
         transformed_data = {'none': X_test, 'pca': pca_testdata,
                                             'lda': lda_testdata,
                                             'nmf': [],
@@ -124,24 +121,8 @@
         accuracy = metrics.f1_score(Y_test, predictions, average='weighted')  # for imbalanced classes
         return accuracy, predictions

-
+
     def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
-        modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
-        modelLDA = LDA()
-        modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
-        modelRF = RandomForestClassifier()
-        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
-        models = [modelKNN, modelLDA, modelSVM, modelRF]
-        df_results = pd.DataFrame()
-        for model, model_label in zip(models, model_labels):
-            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
-            print model_label + " " + transform_label + " " + str(acc)
-            df_results = df_results.append(pd.DataFrame([[model_label, acc]]))
-        #self.df_results = df_results
-        return df_results
-
-
-    def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
         self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
         self.modelLDA = LDA()
         self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
@@ -153,7 +134,6 @@
             acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
             print model_label + " " + transform_label + " " + str(acc)
             df_results = df_results.append(pd.DataFrame([[model_label, acc]]))
-        #self.df_results = df_results
         return df_results
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_classification.py	Wed Sep 13 19:58:10 2017 +0100
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+import scripts.classification as classification
+
+
+def test_confusion_matrix():
+    X = np.random.RandomState(1).randn(100, 3)  # fixed seed: the exact-equality assert below must be reproducible
+    # create 2 classes by shifting the entries of the last 50 samples by +10
+    X[-50:, :] = X[-50:, :] + 10
+    Y = np.concatenate([np.repeat('a', 50), np.repeat('b', 50)])
+    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.6, random_state=1, stratify=Y)
+    accuracy, predictions = classification.confusion_matrix(X_train, Y_train, X_test, Y_test)
+    # expect perfect accuracy for this 'easy' dataset
+    assert accuracy == 1.0
+
--- a/tests/test_map_and_average.py	Wed Sep 13 19:56:39 2017 +0100
+++ b/tests/test_map_and_average.py	Wed Sep 13 19:58:10 2017 +0100
@@ -34,4 +34,5 @@
     features = np.array([[0, 1], [0,2], [0, 1], [1, 1], [2, 1]])
     audiolabels = np.array(['a', 'a', 'b', 'b', 'b'])
     feat, audio, labels = map_and_average.averageframes(features, audiolabels, classlabels)
-    feat_true = np.array([[0, 0.5], [1, 1]])
\ No newline at end of file
+    feat_true = np.array([[0, 1.5], [1, 1]])
+    assert np.array_equal(feat, feat_true)
\ No newline at end of file
--- a/tests/test_outliers.py	Wed Sep 13 19:56:39 2017 +0100
+++ b/tests/test_outliers.py	Wed Sep 13 19:58:10 2017 +0100
@@ -8,9 +8,6 @@
 import pytest

 import numpy as np
-import pandas as pd
-import pickle
-import os

 import scripts.outliers as outliers
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_util_feature_learning.py	Wed Sep 13 19:58:10 2017 +0100
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+
+import scripts.util_feature_learning as util_feature_learning
+
+
+feat_learner = util_feature_learning.Transformer()
+
+
+def test_ssnmf_fit():
+    assert True  # placeholder: always passes and does not call any ssnmf code yet
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_utils.py	Wed Sep 13 19:58:10 2017 +0100
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+import pandas as pd
+import pickle
+import os
+
+import scripts.utils as utils
+
+
+def test_get_outliers():
+    np.random.seed(1)
+    data = np.random.randn(100, 3)
+    # turn the last 5 rows into outliers by adding an offset of +10
+    data[-5:, :] += 10
+    Y = np.concatenate([np.repeat('a', 95), np.repeat('b', 5)])
+    threshold, flags, MD = utils.get_outliers(data)
+    # the 5 shifted rows (labelled 'b' in Y) must all be flagged with 1
+    assert np.array_equal(flags[-5:], np.ones(5))
+
+
+def test_get_outliers_Mahal():
+    np.random.seed(1)
+    X = np.random.randn(100, 3)
+    # create outliers by shifting the entries of the last 5 samples by +10
+    X[-5:, :] = X[-5:, :] + 10
+    Y = np.concatenate([np.repeat('a', 95), np.repeat('b', 5)])
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X)
+    # expect that the 5 shifted items (country 'b' in Y) are detected as outliers
+    assert np.array_equal(y_pred[-5:], np.ones(5))
+
+
+def test_pca_data():
+    np.random.seed(1)
+    data = np.random.randn(100, 3)
+    data[-5:, :] += 10  # shift the last 5 rows by +10
+    X_pca, n_components = utils.pca_data(data, min_variance=0.8)
+    assert n_components < data.shape[1]  # fewer components than input columns
+
+
+def test_get_local_outliers_from_neighbors_dict():
+    np.random.seed(1)
+    data = np.random.randn(100, 3)
+    n_outliers = 3
+    data[-n_outliers:, :] += 10  # shift the last n_outliers rows by +10
+    countries = np.repeat(['a', 'b', 'c', 'k', 'l'], 20)
+    neighbors = {'a': ['b', 'c'], 'b': ['a', 'c'], 'c': ['b', 'a'],
+                 'k': ['l'], 'l': ['k']}
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(data, countries, neighbors)
+    # last n samples of 'l' country must be outliers
+    assert np.array_equal(spatial_outliers[-1][3][-n_outliers:], np.ones(n_outliers))
+
+
+def test_best_n_clusters_silhouette():
+    np.random.seed(1)
+    data = np.random.randn(100, 3)
+    data[:30, :] += 10  # shift the first 30 rows by +10
+    data[-30:, :] += 20  # shift the last 30 rows by +20
+    best_k, _ = utils.best_n_clusters_silhouette(data, max_ncl=10)
+    assert best_k == 3
+