Mercurial > hg > plosone_underreview

--- a/scripts/classification.py	Wed Sep 13 17:35:06 2017 +0100
+++ b/scripts/classification.py	Wed Sep 13 19:57:49 2017 +0100
@@ -63,8 +63,8 @@

 def confusion_matrix(X_train, Y_train, X_test, Y_test, saveCF=False, plots=False):
     feat_learner = util_feature_learning.Transformer()
-    accuracy, predictions = util_feature_learning.classification_accuracy(X_train, Y_train,
-                        X_test, Y_test, model=util_feature_learning.modelLDA)
+    accuracy, predictions = feat_learner.classification_accuracy(X_train, Y_train,
+                        X_test, Y_test, model=feat_learner.modelLDA)
     labels = np.unique(Y_test)  # TODO: countries in geographical proximity
     CF = metrics.confusion_matrix(Y_test, predictions, labels=labels)
     if saveCF:
--- a/scripts/util_feature_learning.py	Wed Sep 13 17:35:06 2017 +0100
+++ b/scripts/util_feature_learning.py	Wed Sep 13 19:57:49 2017 +0100
@@ -65,9 +65,6 @@
         print "transform test data..."
         pca_testdata = self.pca_transformer.transform(X_test)
         lda_testdata = self.lda_transformer.transform(X_test)
-        #norm_testdata = normalize(X_test - np.min(X_test))
-        #nmf_testdata = self.nmf_transformer.transform(norm_testdata)
-        #ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
         transformed_data = {'none': X_test, 'pca': pca_testdata,
                                             'lda': lda_testdata,
                                             'nmf': [],
@@ -124,24 +121,8 @@
         accuracy = metrics.f1_score(Y_test, predictions, average='weighted')  # for imbalanced classes
         return accuracy, predictions

-
+
     def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
-        modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
-        modelLDA = LDA()
-        modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
-        modelRF = RandomForestClassifier()
-        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
-        models = [modelKNN, modelLDA, modelSVM, modelRF]
-        df_results = pd.DataFrame()
-        for model, model_label in zip(models, model_labels):
-            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
-            print model_label + " " + transform_label + " " + str(acc)
-            df_results = df_results.append(pd.DataFrame([[model_label, acc]]))
-        #self.df_results = df_results
-        return df_results
-
-
-    def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
         self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
         self.modelLDA = LDA()
         self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
@@ -153,7 +134,6 @@
             acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
             print model_label + " " + transform_label + " " + str(acc)
             df_results = df_results.append(pd.DataFrame([[model_label, acc]]))
-        #self.df_results = df_results
         return df_results
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_classification.py	Wed Sep 13 19:57:49 2017 +0100
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+
+import scripts.classification as classification
+
+
+def test_confusion_matrix():
+    """Two classes separated by a +10 shift should be classified with a perfect score."""
+    # seeded generator: the fixture is reproducible and the global RNG state is untouched
+    X = np.random.RandomState(1).randn(100, 3)
+    X[-50:, :] = X[-50:, :] + 10  # create 2 classes by shifting half the samples
+    Y = np.concatenate([np.repeat('a', 50), np.repeat('b', 50)])
+    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.6, random_state=1, stratify=Y)
+    accuracy, predictions = classification.confusion_matrix(X_train, Y_train, X_test, Y_test)
+    assert accuracy == 1.0
+
--- a/tests/test_map_and_average.py	Wed Sep 13 17:35:06 2017 +0100
+++ b/tests/test_map_and_average.py	Wed Sep 13 19:57:49 2017 +0100
@@ -34,4 +34,5 @@
     features = np.array([[0, 1], [0,2], [0, 1], [1, 1], [2, 1]])
     audiolabels = np.array(['a', 'a', 'b', 'b', 'b'])
     feat, audio, labels = map_and_average.averageframes(features, audiolabels, classlabels)
-    feat_true = np.array([[0, 0.5], [1, 1]])
\ No newline at end of file
+    feat_true = np.array([[0, 1.5], [1, 1]])
+    assert np.array_equal(feat, feat_true)
\ No newline at end of file
--- a/tests/test_outliers.py	Wed Sep 13 17:35:06 2017 +0100
+++ b/tests/test_outliers.py	Wed Sep 13 19:57:49 2017 +0100
@@ -8,9 +8,6 @@
 import pytest

 import numpy as np
-import pandas as pd
-import pickle
-import os

 import scripts.outliers as outliers
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_util_feature_learning.py	Wed Sep 13 19:57:49 2017 +0100
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+
+import scripts.util_feature_learning as util_feature_learning
+
+
+feat_learner = util_feature_learning.Transformer()  # built once at import time; no test in this file uses it yet
+
+
+def test_ssnmf_fit():
+    assert True  # placeholder: always passes and exercises no code yet
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_utils.py	Wed Sep 13 19:57:49 2017 +0100
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep  1 19:11:52 2017
+
+@author: mariapanteli
+"""
+
+import pytest
+
+import numpy as np
+import pandas as pd
+import pickle
+import os
+
+import scripts.utils as utils
+
+
+def test_get_outliers():
+    """The 5 samples shifted by +10 must be flagged by `utils.get_outliers`."""
+    np.random.seed(1)
+    X = np.random.randn(100, 3)
+    # create outliers by shifting the entries of the last 5 samples
+    X[-5:, :] = X[-5:, :] + 10
+    threshold, y_pred, MD = utils.get_outliers(X)
+    # expect the 5 shifted samples to be flagged (value 1) as outliers
+    assert np.array_equal(y_pred[-5:], np.ones(5))
+
+
+def test_get_outliers_Mahal():
+    """The 5 samples shifted by +10 must be flagged by `utils.get_outliers_Mahal`."""
+    np.random.seed(1)
+    X = np.random.randn(100, 3)
+    # create outliers by shifting the entries of the last 5 samples
+    X[-5:, :] = X[-5:, :] + 10
+    threshold, y_pred, MD = utils.get_outliers_Mahal(X)
+    # expect the 5 shifted samples to be flagged (value 1) as outliers
+    assert np.array_equal(y_pred[-5:], np.ones(5))
+
+
+def test_pca_data():
+    np.random.seed(1)  # fixed seed keeps the random fixture reproducible
+    data = np.random.randn(100, 3)
+    data[-5:] += 10  # shift the last 5 samples by +10 in every column
+    _, n_components = utils.pca_data(data, min_variance=0.8)
+    assert n_components < data.shape[1]  # must keep fewer components than input columns
+
+
+def test_get_local_outliers_from_neighbors_dict():
+    """Samples shifted by +10 at the end of the data must be flagged with ones."""
+    np.random.seed(1)
+    n_outliers = 3
+    X = np.random.randn(100, 3)
+    X[-n_outliers:] += 10
+    Y = np.repeat(['a', 'b', 'c', 'k', 'l'], 20)  # 20 consecutive samples per country
+    w_dict = {'a': ['b', 'c'], 'b': ['a', 'c'], 'c': ['b', 'a'], 'k': ['l'], 'l': ['k']}
+    spatial_outliers = utils.get_local_outliers_from_neighbors_dict(X, Y, w_dict)
+    # last n samples of 'l' country must be outliers
+    assert np.array_equal(spatial_outliers[-1][3][-n_outliers:], np.ones(n_outliers))
+
+
+def test_best_n_clusters_silhouette():
+    np.random.seed(1)  # fixed seed keeps the random fixture reproducible
+    data = np.random.randn(100, 3)
+    data[:30] += 10  # first 30 samples shifted by +10
+    data[-30:] += 20  # last 30 samples shifted by +20; the middle 40 stay unshifted
+    best_k, _ = utils.best_n_clusters_silhouette(data, max_ncl=10)
+    assert best_k == 3
+