# -*- coding: utf-8 -*-
"""
Created on Mon Apr 3 15:14:40 2017

@author: mariapanteli
"""


import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize
from numpy.linalg import pinv

import nmftools
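# Note: `nmftools` is a separate helper module, not part of scikit-learn. The only
# routine used from it here is `nmftools.ssnmf(data, R=..., F=..., n_iter=...)`,
# which is expected to return the factors F, G, W, H and the final cost, exactly
# as it is called in `Transformer.ssnmf_fit` below.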


class Transformer:
    """Dimensionality reduction (PCA, LDA, NMF, SSNMF) and classification helpers."""

    def __init__(self):
        self.pca_transformer = None
        self.lda_transformer = None
        self.nmf_transformer = None
        self.ssnmf_H = None
        self.modelKNN = None
        self.modelLDA = None
        self.modelSVM = None
        self.modelRF = None


    def ssnmf_fit(self, data, labels, npc=None):
        """Fit a semi-supervised NMF (SSNMF) and return the factors G, W, H and the reconstruction error."""
        binarizer = LabelBinarizer()
        F_class = binarizer.fit_transform(labels)
        F, G, W, H, cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
        ssWH = np.dot(F, G) + np.dot(W, H)
        rec_err = np.linalg.norm(data - ssWH)
        return G, W, H, rec_err
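
    # Shape conventions for ssnmf_fit, inferred from how the factors are used in
    # this module (the reconstruction is data ~ F.G + W.H):
    #   data:    (n_samples, n_features)
    #   F_class: (n_samples, n_classes)   label indicator matrix from LabelBinarizer
    #   G:       (n_classes, n_features)  supervised (label) basis
    #   W:       (n_samples, npc)         unsupervised activations
    #   H:       (npc, n_features)        unsupervised basis, reused in transform_data()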


    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Fit the PCA (and, unless pca_only, the LDA) transformer on the training data."""
        X_train = scale(X_train, axis=0)
        # then pca
        print("training with PCA transform...")
        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
        print("variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_)))
        if pca_only:
            # return pca transformer only
            return
        # then lda
        print("training with LDA transform...")
        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
        print("variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_)))


    def transform_lda_data(self, X_test):
        """Transform test data with the fitted PCA and LDA transformers ('nmf' and 'ssnmf' are left empty)."""
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': [],
                            'ssnmf': []}
        return transformed_data


    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Fit PCA, LDA, NMF and SSNMF transformers on the training data."""
        if n_components is None:
            n_components = X_train.shape[1]
        X_train = scale(X_train, axis=0)
        # then pca
        print("training with PCA transform...")
        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
        print("variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_)))
        if pca_only:
            # return pca transformer only
            return
        # then lda (LDA yields at most n_classes - 1 components; recent scikit-learn
        # versions raise an error if n_components exceeds that limit)
        print("training with LDA transform...")
        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
        print("variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_)))
        # then nmf (shift the data to non-negative values before factorisation)
        print("training with NMF transform...")
        norm_traindata = normalize(X_train - np.min(X_train))
        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
        print("reconstruction error " + str(np.sum(self.nmf_transformer.reconstruction_err_)))
        # then ssnmf
        print("training with SSNMF transform...")
        G, W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata, Y_train, npc=n_components)
        print("reconstruction error " + str(rec_err))


    def transform_data(self, X_test):
        """Transform test data with the fitted PCA, LDA, NMF and SSNMF transformers."""
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        norm_testdata = normalize(X_test - np.min(X_test))
        nmf_testdata = self.nmf_transformer.transform(norm_testdata)
        # project onto the SSNMF basis via the pseudo-inverse of H
        ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': nmf_testdata,
                            'ssnmf': ssnmf_testdata}
        return transformed_data
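
    # Note on the SSNMF projection in transform_data: multiplying by pinv(H) gives
    # the unconstrained least-squares activations for the learned basis H, so unlike
    # NMF.transform (which enforces non-negativity) the resulting values may be
    # negative.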


    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
        """Fit the given classifier (LDA by default) and return the weighted F1 score and predictions."""
        if model is None:
            model = LDA()
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')  # weighted F1 for imbalanced classes
        return accuracy, predictions


    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Train KNN, LDA, SVM and RF classifiers and return their scores as a DataFrame."""
        self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        self.modelLDA = LDA()
        self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
        self.modelRF = RandomForestClassifier()
        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
        models = [self.modelKNN, self.modelLDA, self.modelSVM, self.modelRF]
        df_results = pd.DataFrame()
        for model, model_label in zip(models, model_labels):
            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
            print(model_label + " " + transform_label + " " + str(acc))
            df_results = pd.concat([df_results,
                                    pd.DataFrame([[transform_label, model_label, acc]])])
        return df_results


if __name__ == '__main__':
    Transformer()
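    # Usage sketch: a minimal end-to-end run on synthetic placeholder data (random
    # features, three dummy classes) to illustrate the intended call order. It does
    # not use any data from the original experiments and assumes only that the
    # local `nmftools` module is importable.
    rng = np.random.RandomState(0)
    X = rng.rand(60, 10)
    y = np.repeat(['a', 'b', 'c'], 20)
    X_train, X_test = X[::2], X[1::2]
    Y_train, Y_test = y[::2], y[1::2]

    transformer = Transformer()
    transformer.fit_lda_data(X_train, Y_train, n_components=2)
    train_feats = transformer.transform_lda_data(X_train)
    test_feats = transformer.transform_lda_data(X_test)
    results = transformer.classify(train_feats['lda'], Y_train,
                                   test_feats['lda'], Y_test,
                                   transform_label='lda')
    print(results)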