m@30
|
1 # -*- coding: utf-8 -*-
|
m@30
|
2 """
|
m@30
|
3 Created on Fri Sep 1 19:11:52 2017
|
m@30
|
4
|
m@30
|
5 @author: mariapanteli
|
m@30
|
6 """
|
m@30
|
7
|
m@30
|
8 import pytest
|
m@30
|
9
|
m@30
|
10 import numpy as np
|
m@30
|
11 import pandas as pd
|
m@30
|
12 import pickle
|
m@30
|
13 import os
|
m@30
|
14
|
m@30
|
15 import scripts.utils as utils
|
m@30
|
16
|
m@30
|
17
|
m@30
|
def test_get_outliers():
    """Check that ``utils.get_outliers`` flags artificially shifted samples.

    Builds 100 random 3-dimensional samples, shifts the last 5 by +10 so
    they sit far from the rest, and expects the predicted labels of those
    5 samples to all equal 1.
    """
    np.random.seed(1)  # fixed seed keeps the synthetic data reproducible
    X = np.random.randn(100, 3)
    # create outliers by shifting the entries of the last 5 samples
    X[-5:, :] = X[-5:, :] + 10
    # only the predicted labels are checked; the other two results are unused
    _, y_pred, _ = utils.get_outliers(X)
    # expect that the 5 shifted samples are detected as outliers
    assert np.array_equal(y_pred[-5:], np.ones(5))
|
m@30
|
27
|
m@30
|
28
|
m@30
|
def test_get_outliers_Mahal():
    """Check that ``utils.get_outliers_Mahal`` flags artificially shifted samples.

    Builds 100 random 3-dimensional samples, shifts the last 5 by +10 so
    they sit far from the rest, and expects the predicted labels of those
    5 samples to all equal 1.

    The test name mirrors the function under test so that it does not share
    a name with the ``utils.get_outliers`` test; two test functions with the
    same name in one module means only the later one is collected.
    """
    np.random.seed(1)  # fixed seed keeps the synthetic data reproducible
    X = np.random.randn(100, 3)
    # create outliers by shifting the entries of the last 5 samples
    X[-5:, :] = X[-5:, :] + 10
    # only the predicted labels are checked; the other two results are unused
    _, y_pred, _ = utils.get_outliers_Mahal(X)
    # expect that the 5 shifted samples are detected as outliers
    assert np.array_equal(y_pred[-5:], np.ones(5))
|
m@30
|
38
|
m@30
|
39
|
m@30
|
def test_pca_data():
    """Check that ``utils.pca_data`` keeps fewer components than features.

    Uses 100 random 3-dimensional samples whose last 5 rows are shifted by
    +10, requests ``min_variance=0.8`` and expects the returned number of
    components to be smaller than the number of input columns.
    """
    np.random.seed(1)
    data = np.random.randn(100, 3)
    # shift the last 5 samples away from the rest of the data
    data[-5:] += 10
    _, n_components = utils.pca_data(data, min_variance=0.8)
    n_features = data.shape[1]
    assert n_components < n_features
|
m@30
|
46
|
m@30
|
47
|
m@30
|
def test_get_local_outliers_from_neighbors_dict():
    """Check ``utils.get_local_outliers_from_neighbors_dict`` on shifted samples.

    100 random samples are labelled with five countries (20 consecutive
    samples each: 'a', 'b', 'c', 'k', 'l'); the last ``n_outliers`` samples,
    which belong to 'l', are shifted by +10. The test expects the tail of
    element 3 of the last returned entry to be all ones.
    """
    np.random.seed(1)
    n_outliers = 3
    features = np.random.randn(100, 3)
    # shift the final samples so they stand apart from everything else
    features[-n_outliers:] += 10
    # 20 consecutive samples per country label
    countries = np.repeat(['a', 'b', 'c', 'k', 'l'], 20)
    neighbors = {
        'a': ['b', 'c'],
        'b': ['a', 'c'],
        'c': ['b', 'a'],
        'k': ['l'],
        'l': ['k'],
    }
    results = utils.get_local_outliers_from_neighbors_dict(
        features, countries, neighbors
    )
    # last n samples of 'l' country must be outliers
    last_entry = results[-1]
    flagged_tail = last_entry[3][-n_outliers:]
    assert np.array_equal(flagged_tail, np.ones(n_outliers))
|
m@30
|
59
|
m@30
|
60
|
m@30
|
def test_best_n_clusters_silhouette():
    """Check that ``utils.best_n_clusters_silhouette`` finds three clusters.

    The first 30 of 100 random samples are shifted by +10 and the last 30
    by +20, leaving three separated groups; with ``max_ncl=10`` the first
    returned value is expected to be 3.
    """
    np.random.seed(1)
    points = np.random.randn(100, 3)
    # build three groups: first 30 rows at +10, last 30 at +20, middle 40 unshifted
    points[:30] += 10
    points[-30:] += 20
    best_n_clusters, _scores = utils.best_n_clusters_silhouette(points, max_ncl=10)
    assert best_n_clusters == 3
|
m@30
|
68
|