changeset 8:b177d57ac0bd

adding function to traverse down linkages file and provide classification for each part of the tree
author DaveM
date Fri, 10 Feb 2017 08:28:22 +0000
parents cf00dc8be4f7
children 699c769b76da
files misc/SFXFeatureSelection.m phase2/aglomCluster.m phase2/rfFeatureSelection.m phase2/traceLinkageToBinary.m phase2/traverseDownOneStep.m
diffstat 5 files changed, 118 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/misc/SFXFeatureSelection.m	Thu Feb 09 22:04:54 2017 +0000
+++ b/misc/SFXFeatureSelection.m	Fri Feb 10 08:28:22 2017 +0000
@@ -1,7 +1,8 @@
 tic
 rng(1945,'twister')
 options = statset('UseParallel', true);
-b = TreeBagger(500, DataTrain63, LabelsTrain,'OOBVarImp','On', 'SampleWithReplacement', 'Off', 'InBagFraction', 0.632, 'Options', options);
+b = TreeBagger(50, data, labels,'OOBVarImp','On', 'SampleWithReplacement', 'Off', 'FBoot', 0.632, 'Options', options);
+% b = TreeBagger(500, DataTrain63, LabelsTrain,'OOBVarImp','On', 'SampleWithReplacement', 'Off', 'InBagFraction', 0.632, 'Options', options);
 toc
 figure
 plot(oobError(b))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phase2/aglomCluster.m	Fri Feb 10 08:28:22 2017 +0000
@@ -0,0 +1,62 @@
+function aglomCluster(data, clusterMethod, distanceMetric, numClusters)
+%% aglomCluster(data, clusterMethod, distanceMetric, numClusters)
+% Runs agglomerative (hierarchical) clustering on a data set and plots the
+% resulting dendrogram. Nothing is returned; the figure is the only output.
+%
+% data - matrix with one observation per row and one feature per column.
+%
+% clusterMethod - linkage method handed to linkage():
+%     * 'average'     Unweighted average distance (UPGMA)
+%     * 'centroid'    Centroid distance (UPGMC), appropriate for Euclidean
+%                     distances only
+%     * 'complete'    Furthest distance
+%     * 'median'      Weighted center of mass distance (WPGMC),
+%                     appropriate for Euclidean distances only
+%     * 'single'      Shortest distance
+%     * 'ward'        Inner squared distance (minimum variance algorithm),
+%                     appropriate for Euclidean distances only (default)
+%     * 'weighted'    Weighted average distance (WPGMA)
+%
+% distanceMetric - metric handed to pdist():
+%     * 'euclidean'   Euclidean distance (default)
+%     * 'seuclidean'  Standardized Euclidean distance; each coordinate
+%                     difference is divided by the matching element of
+%                     S = nanstd(X)
+%     * 'cityblock'   City block metric
+%     * 'minkowski'   Minkowski distance (default exponent 2)
+%     * 'chebychev'   Chebychev distance (maximum coordinate difference)
+%     * 'mahalanobis' Mahalanobis distance using the sample covariance of
+%                     X as computed by nancov
+%     * 'cosine'      One minus the cosine of the included angle between
+%                     points (treated as vectors)
+%     * 'correlation' One minus the sample correlation between points
+%                     (treated as sequences of values)
+%     * 'spearman'    One minus the sample Spearman's rank correlation
+%                     between observations
+%     * 'hamming'     Hamming distance: the percentage of coordinates
+%                     that differ
+%     * 'jaccard'     One minus the Jaccard coefficient: the percentage of
+%                     nonzero coordinates that differ
+%
+% numClusters - maximum number of leaf nodes drawn by dendrogram(). The
+%     default of 0 asks dendrogram() for the complete tree.
+
+% Fill in defaults for any trailing arguments the caller left out.
+if nargin < 4
+    numClusters = 0;
+end
+if nargin < 3
+    distanceMetric = 'euclidean';
+end
+if nargin < 2
+    clusterMethod = 'ward';
+end
+
+% Condensed pairwise distances -> linkage tree -> dendrogram plot.
+pairDistances = pdist(data, distanceMetric);
+linkTree = linkage(pairDistances, clusterMethod);
+
+% The second output of dendrogram() is requested but not used afterwards.
+[~, leafNodes] = dendrogram(linkTree, numClusters);
+
+end
\ No newline at end of file
--- a/phase2/rfFeatureSelection.m	Thu Feb 09 22:04:54 2017 +0000
+++ b/phase2/rfFeatureSelection.m	Fri Feb 10 08:28:22 2017 +0000
@@ -1,5 +1,5 @@
 function featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
-% rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees)
+%% rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
 %
 % using random forests to perform feature selection for a given data set
 % data has size (x,y), where x is the number of labels and y, the number of
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phase2/traceLinkageToBinary.m	Fri Feb 10 08:28:22 2017 +0000
@@ -0,0 +1,25 @@
+function classList = traceLinkageToBinary(linkList, rowIndex)
+%% classList = traceLinkageToBinary(linkList, rowIndex)
+% Splits the data at one merge of a linkage tree into a two-class labelling.
+% Leaves under column 1 of linkList(rowIndex,:) get class 1, leaves under
+% column 2 get class 2, and every point outside that branch gets class 0.
+% linkList - (m-1)-by-3 output of linkage() for m data points
+% rowIndex - the row of linkList on which to split the data
+% classList - m-by-1 vector of class labels (0, 1 or 2)
+
+listSize = size(linkList,1)+1;
+classList = zeros(listSize,1);
+
+for side = 1:2
+    child = linkList(rowIndex,side);
+    if(child > listSize)
+        % merged cluster: collect every leaf underneath it
+        members = traverseDownOneStep(linkList,[],child);
+    else
+        % ids 1..listSize are original data points, i.e. already leaves
+        members = child;
+    end
+    classList(members) = side;
+end
+end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phase2/traverseDownOneStep.m	Fri Feb 10 08:28:22 2017 +0000
@@ -0,0 +1,28 @@
+function leaf = traverseDownOneStep(linkList,leaf,row)
+
+%% leaf = traverseDownOneStep(linkList,leaf,row)
+% Recursively collects the original data points (leaves) that lie under one
+% node of a linkage tree and appends them to the running leaf list.
+%
+% linkList - (m-1)-by-3 output of linkage() for m data points
+% leaf - column vector of leaves found so far ([] on the first call)
+% row - node id to expand: 1..m is an original data point, m+k is the
+%       cluster that was formed in row k of linkList
+%
+% Returns leaf with the new leaves appended, column 1 before column 2.
+
+listSize = size(linkList,1)+1;
+if(row <= listSize)
+    % Base case: the node is itself a leaf, so record it and stop. Without
+    % this, a leaf id was misread as a row number of linkList.
+    leaf = cat(1,leaf,row);
+    return
+end
+
+% Cluster ids are offset by the number of data points; undo the offset to
+% get the linkList row, then descend into both children (column 1 first).
+row = row-listSize;
+leaf = traverseDownOneStep(linkList,leaf,linkList(row,1));
+leaf = traverseDownOneStep(linkList,leaf,linkList(row,2));
+
+end
\ No newline at end of file