Mercurial > hg > plosone_underreview
comparison scripts/util_feature_learning.py @ 9:c4841876a8ff branch-tests
adding notebooks and trying to explain classifier coefficients
author | Maria Panteli <m.x.panteli@gmail.com> |
---|---|
date | Mon, 11 Sep 2017 19:06:40 +0100 |
parents | |
children | 8e897e82af51 |
comparison
equal
deleted
inserted
replaced
8:0f3eba42b425 | 9:c4841876a8ff |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 Created on Mon Apr 3 15:14:40 2017 | |
4 | |
5 @author: mariapanteli | |
6 """ | |
7 | |
8 | |
9 import numpy as np | |
10 import pandas as pd | |
11 from sklearn.preprocessing import LabelBinarizer | |
12 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA | |
13 from sklearn.decomposition.pca import PCA | |
14 from sklearn.decomposition import NMF | |
15 from sklearn.preprocessing import scale | |
16 from sklearn.neighbors import KNeighborsClassifier | |
17 from sklearn import svm | |
18 from sklearn import metrics | |
19 from sklearn.ensemble import RandomForestClassifier | |
20 from sklearn.preprocessing import normalize | |
21 from numpy.linalg import pinv | |
22 | |
23 import nmftools | |
24 | |
25 | |
class Transformer:
    """Fit PCA / LDA / NMF / SSNMF feature transforms and score classifiers.

    All attributes start as None and are filled in by the fit/classify
    methods:

    * pca_transformer, lda_transformer -- set by fit_data / fit_lda_data
    * nmf_transformer, ssnmf_H -- set by fit_data only
    * modelKNN, modelLDA, modelSVM, modelRF -- set by classify_and_save
    """

    def __init__(self):
        # Fitted feature transformers.
        self.pca_transformer = None
        self.lda_transformer = None
        self.nmf_transformer = None
        # H factor returned by nmftools.ssnmf (see ssnmf_fit).
        self.ssnmf_H = None
        # Classifier instances retained by classify_and_save.
        self.modelKNN = None
        self.modelLDA = None
        self.modelSVM = None
        self.modelRF = None

    def ssnmf_fit(self, data, labels, npc=None):
        """Run nmftools.ssnmf on data, using binarized labels as the F input.

        Parameters
        ----------
        data : array passed to nmftools.ssnmf as the matrix to factorise.
        labels : class labels; binarized with LabelBinarizer before use.
        npc : value forwarded as the ``R`` argument of nmftools.ssnmf.

        Returns
        -------
        (G, W, H, rec_err) where G, W, H are the factors returned by
        nmftools.ssnmf and rec_err is the norm of
        ``data - (F.G + W.H)`` computed with np.linalg.norm.
        """
        binarizer = LabelBinarizer()
        F_class = binarizer.fit_transform(labels)
        # The cost value returned by ssnmf is not used here.
        F, G, W, H, _cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
        reconstruction = np.dot(F, G) + np.dot(W, H)
        rec_err = np.linalg.norm(data - reconstruction)
        return G, W, H, rec_err

    def _fit_pca_lda(self, X_scaled, Y_train, n_components, pca_only):
        """Fit PCA and, unless pca_only is true, LDA on standardised data."""
        print("training with PCA transform...")
        self.pca_transformer = PCA(n_components=n_components).fit(X_scaled)
        print("variance explained "
              + str(np.sum(self.pca_transformer.explained_variance_ratio_)))
        if pca_only:
            # Only the PCA transformer was requested; LDA is left untouched.
            return
        print("training with LDA transform...")
        self.lda_transformer = LDA(n_components=n_components).fit(X_scaled, Y_train)
        print("variance explained "
              + str(np.sum(self.lda_transformer.explained_variance_ratio_)))

    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Standardise X_train column-wise, then fit PCA and (optionally) LDA.

        n_components is passed unchanged to both PCA and LDA (None keeps the
        libraries' defaults).  With pca_only=True only self.pca_transformer
        is set.  Returns None.
        """
        X_train = scale(X_train, axis=0)
        self._fit_pca_lda(X_train, Y_train, n_components, pca_only)

    def transform_lda_data(self, X_test):
        """Apply the fitted PCA and LDA transforms to column-standardised X_test.

        Returns a dict with keys 'none' (the standardised data), 'pca',
        'lda', and empty-list placeholders for 'nmf' and 'ssnmf'.

        Both self.pca_transformer and self.lda_transformer must have been
        fitted; after a pca_only fit the LDA transformer is still None and
        this call fails.
        """
        # NOTE(review): the test data is standardised with its own statistics,
        # not those of the training data -- confirm this is intended.
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': [],
                            'ssnmf': []}
        return transformed_data

    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Standardise X_train, then fit PCA, LDA, NMF and SSNMF in turn.

        When n_components is None it defaults to the number of columns of
        X_train.  With pca_only=True only self.pca_transformer is set.
        Otherwise self.lda_transformer, self.nmf_transformer and self.ssnmf_H
        are set as well.  Returns None.
        """
        if n_components is None:
            n_components = X_train.shape[1]
        X_train = scale(X_train, axis=0)
        self._fit_pca_lda(X_train, Y_train, n_components, pca_only)
        if pca_only:
            return
        print("training with NMF transform...")
        # Shift by the global minimum so every entry is non-negative, then
        # normalise with sklearn's normalize before the NMF-based fits.
        norm_traindata = normalize(X_train - np.min(X_train))
        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
        print("reconstruction error "
              + str(np.sum(self.nmf_transformer.reconstruction_err_)))
        print("training with SSNMF transform...")
        _G, _W, self.ssnmf_H, rec_err = self.ssnmf_fit(
            norm_traindata, Y_train, npc=n_components)
        print("reconstruction error " + str(rec_err))

    def transform_data(self, X_test):
        """Apply all four fitted transforms to column-standardised X_test.

        Returns a dict with keys 'none' (the standardised data), 'pca',
        'lda', 'nmf' and 'ssnmf'.  Requires a prior fit_data call with
        pca_only=False so that every transformer and self.ssnmf_H are set.
        """
        # NOTE(review): the test data is standardised with its own statistics,
        # not those of the training data -- confirm this is intended.
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        norm_testdata = normalize(X_test - np.min(X_test))
        nmf_testdata = self.nmf_transformer.transform(norm_testdata)
        # Project onto the SSNMF basis through the pseudo-inverse of H.
        ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': nmf_testdata,
                            'ssnmf': ssnmf_testdata}
        return transformed_data

    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
        """Fit model on the training split and score it on the test split.

        model defaults to a fresh LDA classifier and is fitted in place.
        Returns (score, predictions) where score is the weighted F1 score
        (chosen for imbalanced classes) of the predictions on X_test.
        """
        if model is None:
            model = LDA()
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')
        return accuracy, predictions

    def _score_models(self, labelled_models, X_train, Y_train, X_test, Y_test,
                      transform_label):
        """Score each (label, model) pair and collect the results.

        Returns a DataFrame with one row per model: column 0 holds the model
        label and column 1 its score.  Every row keeps index 0, as each row
        is built as its own single-row frame.
        """
        rows = []
        for model_label, model in labelled_models:
            acc, _ = self.classification_accuracy(
                X_train, Y_train, X_test, Y_test, model=model)
            print(model_label + " " + transform_label + " " + str(acc))
            rows.append(pd.DataFrame([[model_label, acc]]))
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        return pd.concat(rows)

    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Score KNN, LDA, SVM and random-forest classifiers on the given split.

        The classifiers are local to this call and are not retained.
        transform_label is only used in the printed progress lines.
        Returns the results DataFrame described in _score_models.
        """
        labelled_models = [
            ('KNN', KNeighborsClassifier(n_neighbors=3, metric='euclidean')),
            ('LDA', LDA()),
            ('SVM', svm.SVC(kernel='rbf', gamma=0.1)),
            ('RF', RandomForestClassifier()),
        ]
        return self._score_models(labelled_models, X_train, Y_train,
                                  X_test, Y_test, transform_label)

    def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Same as classify, but keep the classifiers on the instance.

        The classifiers are stored as self.modelKNN, self.modelLDA,
        self.modelSVM and self.modelRF; since classification_accuracy fits
        them in place, they hold the fitted state after this call.
        Returns the results DataFrame described in _score_models.
        """
        self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        self.modelLDA = LDA()
        self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
        self.modelRF = RandomForestClassifier()
        # The instance attributes must be used here: the bare names
        # modelKNN, modelLDA, ... are not defined in this scope.
        labelled_models = [
            ('KNN', self.modelKNN),
            ('LDA', self.modelLDA),
            ('SVM', self.modelSVM),
            ('RF', self.modelRF),
        ]
        return self._score_models(labelled_models, X_train, Y_train,
                                  X_test, Y_test, transform_label)
158 | |
159 | |
if __name__ == '__main__':
    # Running the module as a script only instantiates the class;
    # no data is loaded and nothing is fitted.
    Transformer()