comparison scripts/util_feature_learning.py @ 9:c4841876a8ff branch-tests

adding notebooks and trying to explain classifier coefficients
author Maria Panteli <m.x.panteli@gmail.com>
date Mon, 11 Sep 2017 19:06:40 +0100
parents
children 8e897e82af51
comparison
equal deleted inserted replaced
8:0f3eba42b425 9:c4841876a8ff
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Apr 3 15:14:40 2017
4
5 @author: mariapanteli
6 """
7
8
9 import numpy as np
10 import pandas as pd
11 from sklearn.preprocessing import LabelBinarizer
12 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
13 from sklearn.decomposition.pca import PCA
14 from sklearn.decomposition import NMF
15 from sklearn.preprocessing import scale
16 from sklearn.neighbors import KNeighborsClassifier
17 from sklearn import svm
18 from sklearn import metrics
19 from sklearn.ensemble import RandomForestClassifier
20 from sklearn.preprocessing import normalize
21 from numpy.linalg import pinv
22
23 import nmftools
24
25
class Transformer:
    """Fit feature transforms (PCA, LDA, NMF, SSNMF) and score classifiers.

    Typical use: call ``fit_data`` (or ``fit_lda_data``) on training data,
    then ``transform_data`` (or ``transform_lda_data``) on the data to be
    projected, and finally ``classify`` / ``classify_and_save`` on the
    transformed features.
    """

    def __init__(self):
        # Fitted transforms; populated by fit_data / fit_lda_data.
        self.pca_transformer = None
        self.lda_transformer = None
        self.nmf_transformer = None
        # H matrix returned by nmftools.ssnmf; populated by fit_data.
        self.ssnmf_H = None
        # Classifier instances; populated by classify_and_save.
        self.modelKNN = None
        self.modelLDA = None
        self.modelSVM = None
        self.modelRF = None

    def ssnmf_fit(self, data, labels, npc=None):
        """Run nmftools.ssnmf on `data` using binarized `labels`.

        The labels are one-hot encoded with LabelBinarizer and passed to
        nmftools.ssnmf as ``F`` (with ``R=npc`` and 200 iterations).

        Returns
        -------
        G, W, H : as returned by nmftools.ssnmf
        rec_err : np.linalg.norm of ``data - (F.G + W.H)``
        """
        binarizer = LabelBinarizer()
        F_class = binarizer.fit_transform(labels)
        # The cost returned by ssnmf is not used here.
        F, G, W, H, _cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
        ssWH = np.dot(F, G) + np.dot(W, H)
        rec_err = np.linalg.norm(data - ssWH)
        return G, W, H, rec_err

    def _lda_n_components(self, n_components, X_train, Y_train):
        """Clip `n_components` to what LDA can produce; None stays None.

        LDA yields at most min(n_features, n_classes - 1) components.
        Recent scikit-learn versions raise an error for larger requests
        instead of truncating, so the value is clipped here.
        """
        if n_components is None:
            return None
        n_features = np.shape(X_train)[1]
        n_classes = len(np.unique(Y_train))
        return min(n_components, n_features, n_classes - 1)

    def _fit_pca(self, X_scaled, n_components):
        """Fit the PCA transform on already-scaled data and report it."""
        print("training with PCA transform...")
        self.pca_transformer = PCA(n_components=n_components).fit(X_scaled)
        print("variance explained "
              + str(np.sum(self.pca_transformer.explained_variance_ratio_)))

    def _fit_lda(self, X_scaled, Y_train, n_components):
        """Fit the LDA transform on already-scaled data and report it."""
        print("training with LDA transform...")
        lda_components = self._lda_n_components(n_components, X_scaled, Y_train)
        self.lda_transformer = LDA(n_components=lda_components).fit(X_scaled,
                                                                    Y_train)
        print("variance explained "
              + str(np.sum(self.lda_transformer.explained_variance_ratio_)))

    def _lda_transform(self, X_scaled):
        """Project with LDA, or return [] when LDA was not fitted."""
        if self.lda_transformer is None:
            return []
        return self.lda_transformer.transform(X_scaled)

    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Fit the PCA and (unless `pca_only`) LDA transforms.

        `X_train` is standardized per column with sklearn's `scale` first.
        The fitted models are stored in `self.pca_transformer` and
        `self.lda_transformer`; nothing is returned.
        """
        X_train = scale(X_train, axis=0)
        self._fit_pca(X_train, n_components)
        if pca_only:
            # Only the PCA transformer is wanted.
            return
        self._fit_lda(X_train, Y_train, n_components)

    def transform_lda_data(self, X_test):
        """Project `X_test` with the fitted PCA and LDA transforms.

        `X_test` is standardized per column using its own statistics
        (`scale` is applied to `X_test` directly).

        Returns a dict with keys 'none' (scaled input), 'pca', 'lda',
        'nmf' and 'ssnmf'. 'nmf' and 'ssnmf' are always empty lists here;
        'lda' is an empty list when LDA was not fitted (pca_only=True).
        """
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self._lda_transform(X_test)
        # NMF and SSNMF are not fitted by fit_lda_data, so their entries
        # are left empty.
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': [],
                            'ssnmf': []}
        return transformed_data

    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Fit the PCA, LDA, NMF and SSNMF transforms on the training data.

        `n_components` defaults to the number of columns of `X_train`.
        `X_train` is standardized per column with `scale`. With
        `pca_only=True` only the PCA transform is fitted. Results are
        stored on the instance; nothing is returned.
        """
        if n_components is None:
            n_components = X_train.shape[1]
        X_train = scale(X_train, axis=0)
        self._fit_pca(X_train, n_components)
        if pca_only:
            # Only the PCA transformer is wanted.
            return
        self._fit_lda(X_train, Y_train, n_components)
        # then nmf
        print("training with NMF transform...")
        # Shift by the global minimum so that all values are non-negative,
        # then normalize each sample.
        norm_traindata = normalize(X_train - np.min(X_train))
        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
        print("reconstruction error "
              + str(np.sum(self.nmf_transformer.reconstruction_err_)))
        # then ssnmf
        print("training with SSNMF transform...")
        _G, _W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata, Y_train,
                                                       npc=n_components)
        print("reconstruction error " + str(rec_err))

    def transform_data(self, X_test):
        """Project `X_test` with every transform fitted by `fit_data`.

        `X_test` is standardized per column using its own statistics.
        For NMF and SSNMF the scaled data is shifted by its global minimum
        and normalized per sample; the SSNMF projection is the product of
        that data with the pseudo-inverse of `self.ssnmf_H`.

        Returns a dict with keys 'none' (scaled input), 'pca', 'lda',
        'nmf' and 'ssnmf'. Transforms that were not fitted (e.g. after
        fit_data(pca_only=True)) are returned as empty lists.
        """
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self._lda_transform(X_test)
        nmf_testdata = []
        ssnmf_testdata = []
        if self.nmf_transformer is not None or self.ssnmf_H is not None:
            norm_testdata = normalize(X_test - np.min(X_test))
            if self.nmf_transformer is not None:
                nmf_testdata = self.nmf_transformer.transform(norm_testdata)
            if self.ssnmf_H is not None:
                ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': nmf_testdata,
                            'ssnmf': ssnmf_testdata}
        return transformed_data

    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
        """Fit `model` on the training set and score it on the test set.

        `model` defaults to a fresh LDA classifier. Despite its name, the
        returned score is the weighted F1 score (chosen for imbalanced
        classes), not plain accuracy.

        Returns
        -------
        (score, predictions)
        """
        if model is None:
            model = LDA()
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        # Weighted average accounts for imbalanced classes.
        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')
        return accuracy, predictions

    def _new_models(self):
        """Return fresh, unfitted (KNN, LDA, SVM, RF) classifiers."""
        return (KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
                LDA(),
                svm.SVC(kernel='rbf', gamma=0.1),
                RandomForestClassifier())

    def _score_models(self, models, X_train, Y_train, X_test, Y_test,
                      transform_label):
        """Fit and score each model; return one (label, score) row per model."""
        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
        rows = []
        for model, model_label in zip(models, model_labels):
            acc, _ = self.classification_accuracy(X_train, Y_train,
                                                  X_test, Y_test, model=model)
            print(model_label + " " + transform_label + " " + str(acc))
            rows.append(pd.DataFrame([[model_label, acc]]))
        # Concatenating the single-row frames keeps the layout the former
        # row-by-row append produced: columns 0 and 1, every row indexed 0.
        return pd.concat(rows)

    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Score KNN, LDA, SVM and RF classifiers on the given split.

        The classifiers are created locally and discarded afterwards.
        `transform_label` is only used in the printed progress lines.

        Returns a DataFrame with one row per classifier: column 0 is the
        classifier label, column 1 its weighted F1 score.
        """
        return self._score_models(self._new_models(), X_train, Y_train,
                                  X_test, Y_test, transform_label)

    def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Like `classify`, but keep the fitted classifiers on the instance.

        The fitted models are stored as `self.modelKNN`, `self.modelLDA`,
        `self.modelSVM` and `self.modelRF`.

        Returns a DataFrame with one row per classifier: column 0 is the
        classifier label, column 1 its weighted F1 score.
        """
        (self.modelKNN, self.modelLDA,
         self.modelSVM, self.modelRF) = self._new_models()
        # Use the instances stored on self so that the saved models are the
        # ones that get fitted.
        models = [self.modelKNN, self.modelLDA, self.modelSVM, self.modelRF]
        return self._score_models(models, X_train, Y_train,
                                  X_test, Y_test, transform_label)
158
159
# Script entry point: running the module directly only instantiates the class.
if __name__ == "__main__":
    Transformer()