Mercurial > hg > sfx-subgrouping
diff misc/RandomForestEMFeatureSelection.m @ 2:985cd163ba54
adding old matlab data some datasets
author | DaveM |
---|---|
date | Thu, 09 Feb 2017 16:48:03 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/misc/RandomForestEMFeatureSelection.m Thu Feb 09 16:48:03 2017 +0000 @@ -0,0 +1,80 @@ +clearvars; +load('AdobeStratified.mat'); +morefeatures = true; +idxvar = (1:1450); +count = 1; +featuredata = struct('IdxVar', [], 'FeatureNamesRanked', {}, 'FeatureImportance', [], 'OOBError', [], 'LastOOBError', [], 'EMClusters', [], 'AIC', [], 'PreviousAIC', []); + +while(morefeatures) + DataTrain = DataTrain(:, idxvar); + FeatureNames = FeatureNames(idxvar); + idxvar = (1:length(FeatureNames)); + fprintf('\n Growing a Random Forest of 200 trees using %i features\n',length(idxvar)) + + rng(1945,'twister') + tic + options = statset('UseParallel', true); + b = TreeBagger(200, DataTrain, LabelsTrain,'OOBVarImp','On', 'SampleWithReplacement', 'Off', 'FBoot', 0.632, 'Options', options); + toc + + oobErr = oobError(b); + LastoobErr = oobErr(end); + + fprintf('\n The cumulative OOB Error at 200 trees is %f\n', LastoobErr); + + Indices = crossvalind('Kfold', size(DataTrain, 1), 10); + + AICInitial = 1e16; + AICNext = -1e16; + AICAvg = zeros(10, 1); + NumClusters = 1; + + while(AICNext <= AICInitial) + + if(NumClusters ~= 1) + AICInitial = AICNext; + end + NumClusters = NumClusters + 1; + + fprintf('\n Performing EM using 10 fold CV and %i clusters and %i features\n', NumClusters, length(idxvar)) + + for i = 1:10 + + emidx = (Indices == i); emidx = ~emidx; + + EMDataTrain = DataTrain(emidx, :); + GMModelCV = fitgmdist(EMDataTrain, NumClusters, 'RegularizationValue', 1e-5); + AICAvg(i) = GMModelCV.AIC; + end + + AICNext = mean(AICAvg); + fprintf('The average AIC was %f\n', AICNext); + end + + FI = b.OOBPermutedVarDeltaError; + + [FI,I]=sort(FI, 'descend'); + idxvar = idxvar(I); + FeatureNamesRanked = FeatureNames(I); + + featuredata(count).IdxVar = idxvar; + featuredata(count).FeatureNamesRanked = FeatureNamesRanked; + featuredata(count).FeatureImportance = FI; + featuredata(count).OOBError = oobErr; + featuredata(count).LastOOBError = LastoobErr; + featuredata(count).EMClusters = NumClusters; + featuredata(count).AIC = AICNext; + featuredata(count).PreviousAIC = AICInitial; + + idxRemove = round((length(idxvar) / 100)* 1); + fprintf('\n %i features will be removed.\n', idxRemove) + idxRemove = (length(idxvar) - idxRemove); + idxvar = idxvar(1:idxRemove); + count = count + 1; + + save('Results1Percent.mat', 'featuredata'); + + if(length(idxvar) == 2) + morefeatures = false; + end +end \ No newline at end of file