annotate scripts/util_feature_learning.py @ 30:e8084526f7e5 branch-tests

additional test functions
author Maria Panteli <m.x.panteli@gmail.com>
date Wed, 13 Sep 2017 19:57:49 +0100
parents 8e897e82af51
children 635028c5be34
rev   line source
m@9 1 # -*- coding: utf-8 -*-
m@9 2 """
m@9 3 Created on Mon Apr 3 15:14:40 2017
m@9 4
m@9 5 @author: mariapanteli
m@9 6 """
m@9 7
m@9 8
m@9 9 import numpy as np
m@9 10 import pandas as pd
m@9 11 from sklearn.preprocessing import LabelBinarizer
m@9 12 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
m@9 13 from sklearn.decomposition.pca import PCA
m@9 14 from sklearn.decomposition import NMF
m@9 15 from sklearn.preprocessing import scale
m@9 16 from sklearn.neighbors import KNeighborsClassifier
m@9 17 from sklearn import svm
m@9 18 from sklearn import metrics
m@9 19 from sklearn.ensemble import RandomForestClassifier
m@9 20 from sklearn.preprocessing import normalize
m@9 21 from numpy.linalg import pinv
m@9 22
m@9 23 import nmftools
m@9 24
m@9 25
class Transformer:
    """Feature-learning utilities: dimensionality reduction and classification.

    Wraps four projection methods (PCA, LDA, NMF and semi-supervised NMF)
    plus four classifiers (KNN, LDA, SVM, Random Forest).  The ``fit_*``
    methods train the transformers and store them on the instance; the
    ``transform_*`` methods then project held-out data with the stored
    transformers.  ``classify`` trains and scores all four classifiers.
    """

    def __init__(self):
        # Fitted transformers -- None until a fit_* method is called.
        self.pca_transformer = None
        self.lda_transformer = None
        self.nmf_transformer = None
        # SSNMF activation matrix H; its pseudo-inverse projects test data.
        self.ssnmf_H = None
        # Fitted classifiers -- populated by classify().
        self.modelKNN = None
        self.modelLDA = None
        self.modelSVM = None
        self.modelRF = None

    def ssnmf_fit(self, data, labels, npc=None):
        """Fit semi-supervised NMF on `data` guided by class `labels`.

        Parameters
        ----------
        data : array (n_samples, n_features)
            Non-negative training data.
        labels : array-like (n_samples,)
            Class labels; binarized into the supervision matrix F.
        npc : int or None
            Number of components R passed to nmftools.ssnmf.

        Returns
        -------
        (G, W, H, rec_err) : factor matrices and the Frobenius-norm
            reconstruction error of the combined model F*G + W*H.
        """
        binarizer = LabelBinarizer()
        F_class = binarizer.fit_transform(labels)
        F, G, W, H, cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
        # Reconstruction combines the supervised (F.G) and unsupervised (W.H) parts.
        ssWH = np.dot(F, G) + np.dot(W, H)
        rec_err = np.linalg.norm(data - ssWH)
        return G, W, H, rec_err

    def _fit_pca_lda(self, X_train, Y_train, n_components=None, pca_only=False):
        """Fit PCA (and unless `pca_only`, LDA) on already-scaled data.

        Shared implementation for fit_lda_data and fit_data; stores the
        fitted transformers on the instance.
        """
        print("training with PCA transform...")
        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
        print("variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_)))
        if pca_only:
            # Caller only wanted the PCA transformer.
            return
        print("training with LDA transform...")
        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
        print("variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_)))

    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Standardize features, then fit PCA and (optionally) LDA.

        Unlike fit_data, NMF/SSNMF are not trained here.
        """
        X_train = scale(X_train, axis=0)
        self._fit_pca_lda(X_train, Y_train, n_components=n_components,
                          pca_only=pca_only)

    def transform_lda_data(self, X_test):
        """Project standardized test data with the fitted PCA and LDA.

        Returns a dict keyed by transform name; 'nmf'/'ssnmf' entries are
        empty lists because fit_lda_data does not train those models.
        """
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        # NOTE(review): LDA is applied to the scaled features directly,
        # not to the PCA output -- mirrors the training in _fit_pca_lda.
        lda_testdata = self.lda_transformer.transform(X_test)
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': [],
                            'ssnmf': []}
        return transformed_data

    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Standardize features and fit PCA, LDA, NMF and SSNMF.

        n_components defaults to the full feature dimensionality.
        With pca_only=True only the PCA transformer is fitted.
        """
        if n_components is None:
            n_components = X_train.shape[1]
        X_train = scale(X_train, axis=0)
        # PCA and LDA share their implementation with fit_lda_data.
        self._fit_pca_lda(X_train, Y_train, n_components=n_components,
                          pca_only=pca_only)
        if pca_only:
            return
        # NMF requires non-negative input: shift by the global minimum, then
        # row-normalize.
        print("training with NMF transform...")
        norm_traindata = normalize(X_train - np.min(X_train))
        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
        print("reconstruction error " + str(np.sum(self.nmf_transformer.reconstruction_err_)))
        # Semi-supervised NMF on the same non-negative data.
        print("training with SSNMF transform...")
        G, W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata, Y_train,
                                                     npc=n_components)
        print("reconstruction error " + str(rec_err))

    def transform_data(self, X_test):
        """Project standardized test data with all fitted transformers.

        Returns a dict with the raw (scaled) data and its PCA, LDA, NMF
        and SSNMF projections.
        """
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        # Same non-negativity shift + normalization as in fit_data.
        norm_testdata = normalize(X_test - np.min(X_test))
        nmf_testdata = self.nmf_transformer.transform(norm_testdata)
        # SSNMF projection: right-multiply by the pseudo-inverse of H.
        ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
        transformed_data = {'none': X_test, 'pca': pca_testdata,
                            'lda': lda_testdata,
                            'nmf': nmf_testdata,
                            'ssnmf': ssnmf_testdata}
        return transformed_data

    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
        """Train `model` (default: LDA) and score it on the test set.

        Returns (accuracy, predictions); accuracy is the weighted F1
        score, chosen because the classes are imbalanced.
        """
        if model is None:
            model = LDA()
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')  # for imbalanced classes
        return accuracy, predictions

    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Train and evaluate KNN, LDA, SVM and Random Forest classifiers.

        Prints each model's weighted-F1 score tagged with
        `transform_label` and returns a DataFrame of [model_label, score]
        rows.
        """
        self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        self.modelLDA = LDA()
        self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
        self.modelRF = RandomForestClassifier()
        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
        models = [self.modelKNN, self.modelLDA, self.modelSVM, self.modelRF]
        frames = []
        for model, model_label in zip(models, model_labels):
            acc, _ = self.classification_accuracy(X_train, Y_train,
                                                  X_test, Y_test, model=model)
            print(model_label + " " + transform_label + " " + str(acc))
            frames.append(pd.DataFrame([[model_label, acc]]))
        # pd.concat replaces the deprecated DataFrame.append and keeps the
        # original per-row zero index.
        df_results = pd.concat(frames)
        return df_results
m@9 138
m@9 139
if __name__ == '__main__':
    # Smoke test: constructing a Transformer must succeed with all
    # transformer/model slots left unfitted.
    transformer = Transformer()