view phase2/rfFeatureSelection.m @ 6:54446ca7e6cb

onePass and cut methods both working for random forest feature selection
author DaveM
date Thu, 09 Feb 2017 21:43:20 +0000
parents 7848d183c7ab
children cf00dc8be4f7
line wrap: on
line source
function featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
% featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector)
%
% Uses a random forest (TreeBagger) to rank features by out-of-bag permuted
% variable importance and returns the column indices of the selected
% features, most important first.
%
% Inputs:
%   data          - matrix of size (x,y), where x is the number of
%                   observations (one per label) and y the number of features.
%   labels        - the labels for the data; length(labels) must equal
%                   size(data,1).
%   numFeatures   - number of features to return (default 5).
%   iterMethod    - how the feature set is cut down (default 'onePass'):
%       * 'onePass' ranks the features once and returns the top
%         numFeatures of them.
%       * 'cutX' iteratively removes the bottom X percent of features (at
%         least one per iteration) and re-runs the random forest on the
%         remaining set until numFeatures features are left.
%       * 'oobErr' is intended to use the out-of-bag error; it is not
%         implemented yet and falls back to 'onePass' with a warning.
%       * 'featureDeltaErr' is intended to use the feature importance
%         prediction error; it is not implemented yet and falls back to
%         'onePass' with a warning. The OOBPermutedVarDeltaError property
%         is a numeric array of size 1-by-Nvars containing a measure of
%         importance for each predictor variable (feature). For any
%         variable, the measure is the increase in prediction error if the
%         values of that variable are permuted across the out-of-bag
%         observations. This measure is computed for every tree, then
%         averaged over the entire ensemble and divided by the standard
%         deviation over the entire ensemble.
%       Any other value falls back to 'onePass' with a warning.
%   numTrees      - number of trees grown in each forest (default 200).
%   featureVector - column indices of data to consider (default: all
%                   columns). Used internally by the 'cutX' recursion.
%
% Output:
%   featureVector - indices into the columns of data. If the candidate set
%                   already holds numFeatures or fewer entries it is
%                   returned unchanged and no forest is trained.

% Argument-count check must come first: the size check below reads labels.
if(nargin < 2)
    error('must pass data and labels into function')
end
if(length(labels) ~= size(data,1))
    error('labels and data do not match up');
end

if(nargin < 3)
    numFeatures = 5;
end
if(nargin < 4)
    iterMethod = 'onePass';
end
if(nargin < 5)
    numTrees = 200;
end
% featureVector is the sixth argument, so it needs its own threshold;
% otherwise a five-argument call would leave it undefined.
if(nargin < 6)
    featureVector = 1:size(data,2);
end

if(length(featureVector) > numFeatures)
    options = statset('UseParallel', true);
    % Each tree is grown on a 63.2% subsample drawn without replacement,
    % with out-of-bag variable importance enabled.
    b = TreeBagger(numTrees, data(:,featureVector), labels,'OOBVarImp','On',...
        'SampleWithReplacement', 'Off','FBoot', 0.632,'Options', options);
    % Only the ordering is needed here; the importance scores themselves
    % would be required by a future 'featureDeltaErr' implementation.
    [~,I] = sort(b.OOBPermutedVarDeltaError,'descend');
    % Map positions within the sub-matrix back to original column indices.
    featureVector = featureVector(I);

    if(strcmp(iterMethod,'onePass'))
        disp('onePass')
        featureVector = featureVector(1:numFeatures);
    elseif(strncmp(iterMethod,'cut',3))
        % strncmp is safe for method names shorter than three characters.
        disp(iterMethod)
        cutPercentage = str2double(iterMethod(4:end));
        % Always remove at least one feature so the recursion terminates.
        cutSize = max(floor(length(featureVector)*cutPercentage/100),1);
        % Never cut below the requested number of features.
        if(length(featureVector) - cutSize < numFeatures)
            cutSize = length(featureVector) - numFeatures;
        end
        featureVector = featureVector(1:end-cutSize);
        featureVector = rfFeatureSelection(data, labels, numFeatures, iterMethod, numTrees, featureVector);
    elseif(strcmp(iterMethod,'oobErr'))
        warning('This method has not been implemented yet, using onePass to return results')
        featureVector = featureVector(1:numFeatures);
    elseif(strcmp(iterMethod,'featureDeltaErr'))
        warning('This method has not been implemented yet, using onePass to return results')
        featureVector = featureVector(1:numFeatures);
    else
        % Unknown method: honour the numFeatures contract instead of
        % silently returning every feature.
        warning('Unrecognised iterMethod, using onePass to return results')
        featureVector = featureVector(1:numFeatures);
    end
end
end