# -*- coding: utf-8 -*-
"""
Created on Mon Apr 3 15:14:40 2017

@author: mariapanteli

Feature transformation (PCA, LDA, NMF, semi-supervised NMF) and
classification helpers built on scikit-learn.
"""


import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Public import path; the private ``sklearn.decomposition.pca`` module is not
# available in current scikit-learn releases.
from sklearn.decomposition import PCA
from sklearn.decomposition import NMF
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize
from numpy.linalg import pinv

import nmftools


class Transformer(object):
    """Fit PCA/LDA/NMF/SSNMF transforms on training data and apply them to test data.

    Attributes set by the ``fit_*`` methods:
        pca_transformer, lda_transformer, nmf_transformer -- fitted sklearn models.
        ssnmf_H -- the ``H`` factor returned by ``nmftools.ssnmf``.
    Attributes set by ``classify``:
        modelKNN, modelLDA, modelSVM, modelRF -- the fitted classifiers.
    """

    def __init__(self):
        self.pca_transformer = None
        self.lda_transformer = None
        self.nmf_transformer = None
        self.ssnmf_H = None
        self.modelKNN = None
        self.modelLDA = None
        self.modelSVM = None
        self.modelRF = None

    @staticmethod
    def _clamp_lda_components(n_components, X, Y):
        """Return an ``n_components`` value that LDA accepts for (X, Y).

        LDA can produce at most ``min(n_features, n_classes - 1)`` components.
        Older scikit-learn releases silently clipped larger requests, newer
        ones raise; clamping here keeps the result identical on both.
        ``None`` is passed through unchanged (let LDA choose).
        """
        if n_components is None:
            return None
        n_classes = len(np.unique(Y))
        upper = min(X.shape[1], max(n_classes - 1, 1))
        return min(n_components, upper)

    def _fit_pca_lda(self, X_scaled, Y_train, n_components, pca_only):
        """Fit the PCA transformer and, unless ``pca_only``, the LDA transformer."""
        print("training with PCA transform...")
        self.pca_transformer = PCA(n_components=n_components).fit(X_scaled)
        print("variance explained " + str(np.sum(self.pca_transformer.explained_variance_ratio_)))
        if pca_only:
            # only the PCA transformer was requested
            return
        print("training with LDA transform...")
        lda_components = self._clamp_lda_components(n_components, X_scaled, Y_train)
        self.lda_transformer = LDA(n_components=lda_components).fit(X_scaled, Y_train)
        print("variance explained " + str(np.sum(self.lda_transformer.explained_variance_ratio_)))

    def ssnmf_fit(self, data, labels, npc=None):
        """Run semi-supervised NMF on ``data`` with one-hot encoded ``labels``.

        Labels are binarised with ``LabelBinarizer`` and handed to
        ``nmftools.ssnmf`` as ``F`` (200 iterations, ``R=npc``).

        Returns:
            (G, W, H, rec_err) where ``rec_err`` is the norm of
            ``data - (F.G + W.H)`` computed from the returned factors.
        """
        F_class = LabelBinarizer().fit_transform(labels)
        F, G, W, H, _cost = nmftools.ssnmf(data, R=npc, F=F_class, n_iter=200)
        reconstruction = np.dot(F, G) + np.dot(W, H)
        rec_err = np.linalg.norm(data - reconstruction)
        return G, W, H, rec_err

    def fit_lda_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Standardise ``X_train`` column-wise, then fit PCA and (optionally) LDA.

        Args:
            X_train: training feature matrix.
            Y_train: training labels.
            n_components: number of components for PCA and LDA; for LDA the
                value is clamped to what the data supports. ``None`` lets
                each model choose.
            pca_only: if True, stop after fitting PCA.
        """
        X_train = scale(X_train, axis=0)
        self._fit_pca_lda(X_train, Y_train, n_components, pca_only)

    def transform_lda_data(self, X_test):
        """Apply the fitted PCA and LDA transforms to ``X_test``.

        Returns:
            dict with keys 'none' (standardised input), 'pca', 'lda', and
            empty lists for 'nmf' and 'ssnmf'.
        """
        # NOTE(review): the test set is standardised with its own mean/std,
        # not the training statistics -- confirm this is intended.
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        return {'none': X_test, 'pca': pca_testdata,
                'lda': lda_testdata,
                'nmf': [],
                'ssnmf': []}

    def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
        """Standardise ``X_train`` and fit PCA, LDA, NMF and SSNMF transforms.

        Args:
            X_train: training feature matrix.
            Y_train: training labels.
            n_components: number of components for every transform; defaults
                to the number of features. For LDA the value is clamped to
                what the data supports.
            pca_only: if True, stop after fitting PCA.
        """
        if n_components is None:
            n_components = X_train.shape[1]
        X_train = scale(X_train, axis=0)
        self._fit_pca_lda(X_train, Y_train, n_components, pca_only)
        if pca_only:
            return
        print("training with NMF transform...")
        # Shift by the global minimum so every entry is non-negative before
        # normalising; this matrix feeds both NMF and SSNMF.
        norm_traindata = normalize(X_train - np.min(X_train))
        self.nmf_transformer = NMF(n_components=n_components).fit(norm_traindata)
        print("reconstruction error " + str(np.sum(self.nmf_transformer.reconstruction_err_)))
        print("training with SSNMF transform...")
        _G, _W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata, Y_train, npc=n_components)
        print("reconstruction error " + str(rec_err))

    def transform_data(self, X_test):
        """Apply all fitted transforms (PCA, LDA, NMF, SSNMF) to ``X_test``.

        The SSNMF projection is ``norm_testdata . pinv(ssnmf_H)``.

        Returns:
            dict with keys 'none' (standardised input), 'pca', 'lda', 'nmf'
            and 'ssnmf'.
        """
        # NOTE(review): standardisation and the min-shift below use test-set
        # statistics rather than the training ones -- confirm this is intended.
        X_test = scale(X_test, axis=0)
        print("transform test data...")
        pca_testdata = self.pca_transformer.transform(X_test)
        lda_testdata = self.lda_transformer.transform(X_test)
        norm_testdata = normalize(X_test - np.min(X_test))
        nmf_testdata = self.nmf_transformer.transform(norm_testdata)
        ssnmf_testdata = np.dot(norm_testdata, pinv(self.ssnmf_H))
        return {'none': X_test, 'pca': pca_testdata,
                'lda': lda_testdata,
                'nmf': nmf_testdata,
                'ssnmf': ssnmf_testdata}

    def classification_accuracy(self, X_train, Y_train, X_test, Y_test, model=None):
        """Fit ``model`` on the training split and score it on the test split.

        Args:
            model: any estimator with ``fit``/``predict``; defaults to a
                fresh LDA classifier.

        Returns:
            (score, predictions) where ``score`` is the weighted F1 score
            (used instead of plain accuracy to cope with imbalanced classes).
        """
        if model is None:
            model = LDA()
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        accuracy = metrics.f1_score(Y_test, predictions, average='weighted')
        return accuracy, predictions

    def classify(self, X_train, Y_train, X_test, Y_test, transform_label=" "):
        """Train KNN, LDA, SVM and random-forest classifiers and collect scores.

        Each model is stored on the instance, fitted on the training split
        and scored via ``classification_accuracy``; the score is printed.

        Returns:
            DataFrame with one row per model and columns
            (transform_label, model_label, score); every row keeps index 0,
            as produced by concatenating single-row frames.
        """
        self.modelKNN = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        self.modelLDA = LDA()
        self.modelSVM = svm.SVC(kernel='rbf', gamma=0.1)
        self.modelRF = RandomForestClassifier()
        model_labels = ['KNN', 'LDA', 'SVM', 'RF']
        models = [self.modelKNN, self.modelLDA, self.modelSVM, self.modelRF]
        rows = []
        for model, model_label in zip(models, model_labels):
            acc, _ = self.classification_accuracy(X_train, Y_train, X_test, Y_test, model=model)
            print(model_label + " " + transform_label + " " + str(acc))
            rows.append(pd.DataFrame([[transform_label, model_label, acc]]))
        # DataFrame.append was removed in pandas 2.0; one concat over the
        # collected single-row frames yields the same result.
        return pd.concat(rows)


if __name__ == '__main__':
    Transformer()