diff scripts/util_feature_learning.py @ 9:c4841876a8ff branch-tests

adding notebooks and trying to explain classifier coefficients
author Maria Panteli <m.x.panteli@gmail.com>
date Mon, 11 Sep 2017 19:06:40 +0100
parents
children 8e897e82af51
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/util_feature_learning.py	Mon Sep 11 19:06:40 2017 +0100
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Apr  3 15:14:40 2017
+
+@author: mariapanteli
+"""
+
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelBinarizer, scale, normalize
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
+from sklearn.decomposition import PCA, NMF
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn import svm
+from sklearn import metrics
+from sklearn.ensemble import RandomForestClassifier
+from numpy.linalg import pinv
+
+import nmftools
+
+
+class Transformer:
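+    """Feature-learning helpers: fit PCA, LDA, NMF and semi-supervised NMF
+    transforms, and benchmark KNN, LDA, SVM and random forest classifiers."""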
+    def __init__(self):
+        self.pca_transformer = None
+        self.lda_transformer = None
+        self.nmf_transformer = None
+        self.ssnmf_H = None
+        self.modelKNN = None
+        self.modelLDA = None
+        self.modelSVM = None
+        self.modelRF = None
+        #self.df_results = None
+        
+        
+    def ssnmf_fit(self, data, labels, npc=None):
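+        # Semi-supervised NMF: approximate data as F.G + W.H, where F is the
+        # one-hot (binarized) label matrix held fixed, so G carries the
+        # class-driven basis and W, H model the residual structure.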
+        binarizer = LabelBinarizer()
+        F_class = binarizer.fit_transform(labels)
+        F, G, W, H, cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
+        ssWH = np.dot(F, G) + np.dot(W, H)
+        rec_err = np.linalg.norm(data - ssWH)
+        return G, W, H, rec_err
+    
+    
+    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
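+        # PCA/LDA-only variant of fit_data: skips the NMF and SSNMF stages.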
+        X_train = scale(X_train, axis=0)
+        # then pca
+        print "training with PCA transform..."
+        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
+        print "variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_))
+        if pca_only:
+            # return pca transformer only
+            return
+        # then lda
+        print "training with LDA transform..."
+        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
+        print "variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_))
+        
+    
+    def transform_lda_data(self, X_test):
+        X_test = scale(X_test, axis=0)
+        print "transform test data..."
+        pca_testdata = self.pca_transformer.transform(X_test)
+        lda_testdata = self.lda_transformer.transform(X_test)
+        #norm_testdata = normalize(X_test - np.min(X_test))
+        #nmf_testdata = self.nmf_transformer.transform(norm_testdata)
+        #ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
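+        # NMF/SSNMF slots stay empty in this variant; use transform_data instead.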
+        transformed_data = {'none': X_test, 'pca': pca_testdata, 
+                                            'lda': lda_testdata,
+                                            'nmf': [],
+                                            'ssnmf': []}
+        return transformed_data
+    
+    
+    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
+        if n_components is None:
+            n_components = X_train.shape[1]
+        X_train = scale(X_train, axis=0)
+        # then pca
+        print "training with PCA transform..."
+        self.pca_transformer = PCA(n_components=n_components).fit(X_train)
+        print "variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_))
+        if pca_only:
+            # return pca transformer only
+            return
+        # then lda
+        print "training with LDA transform..."
+        self.lda_transformer = LDA(n_components=n_components).fit(X_train, Y_train)
+        print "variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_))
+        # then nmf
+        print "training with NMF transform..."
+        norm_traindata = normalize(X_train - np.min(X_train))
+        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
+        print "reconstruction error " + str(np.sum(self.nmf_transformer.reconstruction_err_))
+        # then ssnmf
+        print "training with SSNMF transform..."
+        G, W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata, Y_train, npc=n_components)
+        print "reconstruction error " + str(rec_err)
+    
+    
+    def transform_data(self, X_test):
+        X_test = scale(X_test, axis=0)
+        print "transform test data..."
+        pca_testdata = self.pca_transformer.transform(X_test)
+        lda_testdata = self.lda_transformer.transform(X_test)
+        norm_testdata = normalize(X_test - np.min(X_test))
+        nmf_testdata = self.nmf_transformer.transform(norm_testdata)
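+        # Project onto the learned SSNMF basis via the Moore-Penrose
+        # pseudoinverse: the least-squares solution A of X ~= A.H.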
+        ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
+        transformed_data = {'none': X_test, 'pca': pca_testdata, 
+                                            'lda': lda_testdata, 
+                                            'nmf': nmf_testdata, 
+                                            'ssnmf': ssnmf_testdata}
+        return transformed_data
+    
+    
+    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
+        if model is None:
+            model = LDA()
+        model.fit(X_train, Y_train)
+        predictions = model.predict(X_test)
+        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')  # weighted F1, more robust than raw accuracy for imbalanced classes
+        return accuracy, predictions
+        
+    
+    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
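+        # Benchmark four off-the-shelf classifiers on one feature transform
+        # and collect their weighted-F1 scores in a DataFrame.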
+        modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
+        modelLDA = LDA()
+        modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
+        modelRF = RandomForestClassifier()
+        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
+        models = [modelKNN, modelLDA, modelSVM, modelRF]        
+        df_results = pd.DataFrame()        
+        for model, model_label in zip(models, model_labels):
+            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
+            print model_label + " " + transform_label + " " + str(acc)
+            df_results = df_results.append(pd.DataFrame([[model_label, acc]], columns=['model', 'accuracy']))
+        #self.df_results = df_results
+        return df_results
+
+
+    def classify_and_save(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
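+        # Same benchmark as classify, but keeps the fitted models on self
+        # so they can be inspected (e.g. classifier coefficients) later.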
+        self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
+        self.modelLDA = LDA()
+        self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
+        self.modelRF = RandomForestClassifier()
+        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
+        models = [self.modelKNN, self.modelLDA, self.modelSVM, self.modelRF]
+        df_results = pd.DataFrame()        
+        for model, model_label in zip(models, model_labels):
+            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
+            print model_label + " " + transform_label + " " + str(acc)
+            df_results = df_results.append(pd.DataFrame([[model_label, acc]], columns=['model', 'accuracy']))
+        #self.df_results = df_results
+        return df_results
+
+
+if __name__ == '__main__':
+    Transformer()
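+    # Minimal usage sketch on synthetic data (illustrative only: the random
+    # features, the three hypothetical class labels and n_components=2 are
+    # assumptions, not project values). Uses the PCA/LDA-only path so the
+    # local nmftools module is not required.
+    rng = np.random.RandomState(0)
+    X = rng.rand(60, 10)
+    y = rng.choice(['class_a', 'class_b', 'class_c'], size=60)
+    transformer = Transformer()
+    transformer.fit_lda_data(X[:40], y[:40], n_components=2)
+    transformed = transformer.transform_lda_data(X[40:])
+    results = transformer.classify(X[:40], y[:40], X[40:], y[40:],
+                                   transform_label="none")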
\ No newline at end of file