function CPD = learn_params(CPD, fam, data, ns, cnodes, varargin)
% LEARN_PARAMS Construct classification/regression tree given complete data
% CPD = learn_params(CPD, fam, data, ns, cnodes)
%
% fam(i) is the node id of the i-th node in the family of nodes; the self node is the last one
% data(i,m) is the value of node i in case m (can be a cell array).
% ns(i) is the node size for the i-th node in the whole bnet
% cnodes(i) is the node id for the i-th continuous node in the whole bnet
%
% The following optional arguments can be specified in the form of name/value pairs:
% stop_cases: for early stopping (pruning). A node is not split if it has fewer than stop_cases cases. Default is 0.
% min_gain: for early stopping (pruning).
%   For discrete output: a node is not split when the gain of the best split is less than min_gain. Default is 0.
%   For continuous (cts) output: a node is not split when the gain of the best split is less than min_gain*score(root)
%   (we denote this cts_min_gain). Default is 0.006.
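%
% A minimal usage sketch (illustrative only; the family, node sizes and data
% below are hypothetical, not part of this file):
%   fam = [2 3 1];              % parent nodes 2 and 3, self node 1 last
%   ns = [3 2 4]; cnodes = [];  % all nodes discrete in this example
%   CPD = learn_params(CPD, fam, data, ns, cnodes, 'stop_cases', 5, 'min_gain', 0.01);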
% %%%%%%%%%%%%%%%%%%% Structure definition of dtree_CPD.tree %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% tree.num_node is the last position in the tree.nodes array for adding new nodes;
%   it is not always the same as the number of nodes in the tree, because some positions in the
%   tree.nodes array can be marked unused (e.g. in tree pruning)
% tree.nodes is the array of nodes in the tree plus some unused nodes.
% tree.nodes(1) is the root of the tree.
%
% Below are the attributes of each node:
% tree.nodes(i).used;             % flag that this node is used (0 means not used; it can be removed from the tree to save memory)
% tree.nodes(i).is_leaf;          % 1 means this node is a leaf, 0 means it is not a leaf
% tree.nodes(i).children;         % children(i) is the index in the tree.nodes array of the i-th child node
% tree.nodes(i).split_id;         % the attribute id used to split this node
% tree.nodes(i).split_threshhold; % the threshold used to split this node on a continuous attribute
% %%%%% attributes specific to classification trees (discrete output)
% tree.nodes(i).probs             % probs(i) is the probability of the i-th value of the class node
%                                 % For three output classes, probs = [0.9 0.1 0.0] means the probability of
%                                 % class 1 is 0.9, of class 2 is 0.1, and of class 3 is 0.0.
% %%%%% attributes specific to regression trees (continuous output)
% tree.nodes(i).mean              % mean output value for this node
% tree.nodes(i).std               % standard deviation of the output values in this node
%
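% For example, a classification leaf over three classes might hold the
% following (illustrative values only):
%   tree.nodes(k).is_leaf = 1;
%   tree.nodes(k).children = [];
%   tree.nodes(k).probs = [0.9 0.1 0.0];   % P(class=1)=0.9, P(class=2)=0.1, P(class=3)=0
%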
% Author: yimin.zhang@intel.com
% Last updated: Jan. 19, 2002

% Want list:
% (1) more efficient handling of cts attributes: get the values of the cts attributes once (at the beginning of the build_tree function), then use bi_search to find the threshold
% (2) pruning of classification trees using Pessimistic Error Pruning
% (3) bi_search for strings (used to transform data to BNT format)

global tree %tree must be global so that it can be accessed in the recursive splitting function
global cts_min_gain
tree=[]; % clear the tree
tree.num_node=0;
cts_min_gain=0;

stop_cases=0;
min_gain=0;

args = varargin;
nargs = length(args);
if (nargs>0)
  if isstr(args{1})
    for i=1:2:nargs
      switch args{i},
        case 'stop_cases', stop_cases = args{i+1};
        case 'min_gain', min_gain = args{i+1};
      end
    end
  else
    error(['error in input parameters']);
  end
end

if iscell(data)
  local_data = cell2num(data(fam,:));
else
  local_data = data(fam, :);
end
%counts = compute_counts(local_data, CPD.sizes);
%CPD.CPT = mk_stochastic(counts + CPD.prior); % bug fix 11/5/01
node_types = zeros(1,size(ns,2)); %all nodes are discrete
node_types(cnodes)=1;
%make the data BNT compliant (values for discrete nodes are from 1 to n, where n is the node size)
%trans_data=transform_data(local_data,'tmp.dat',[]); %here no cts nodes

build_dtree (CPD, local_data, ns(fam), node_types(fam),stop_cases,min_gain);
%CPD.tree=copy_tree(tree);
CPD.tree=tree; %copy the constructed tree into the CPD


function new_tree = copy_tree(tree)
% copy the tree to new_tree
new_tree.num_node=tree.num_node;
new_tree.root = tree.root;
for i=1:tree.num_node
  new_tree.nodes(i)=tree.nodes(i);
end


function build_dtree (CPD, fam_ev, node_sizes, node_types,stop_cases,min_gain)
global tree
global cts_min_gain

tree.num_node=0; %the current number of nodes in the tree
tree.root=1;

T = 1:size(fam_ev,2) ; %all cases
candidate_attrs = 1:(size(node_sizes,2)-1); %all attributes
node_id=1; %the root node
lastnode=size(node_sizes,2); %the last element among all nodes is the dependent variable (category node)
num_cat=node_sizes(lastnode);

% get the minimum gain for cts output (used to stop splitting)
if (node_types(size(fam_ev,1))==1) %cts output
  N = size(fam_ev,2);
  output_id = size(fam_ev,1);
  cases_T = fam_ev(output_id,:); %get all the output values for cases T
  std_T = std(cases_T);
  avg_y_T = mean(cases_T);
  sqr_T = cases_T - avg_y_T;
  cts_min_gain = min_gain*(sum(sqr_T.*sqr_T)/N); % min_gain * R(root), where R(root) = 1/N * SUM((y-avg_y)^2)
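  % e.g. (illustrative numbers): with min_gain = 0.006 and R(root) = 2.5,
  % splitting stops once the best least-squares gain at a node drops below 0.015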
end

split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases,min_gain, T, candidate_attrs, num_cat);



% pruning methods
% (1) Restrictions on minimum node size: a node is not split if it has fewer than k cases.
% (2) Thresholds on impurity: a threshold is imposed on the splitting test score. The threshold can be
%     imposed on a local goodness measure (the gain_ratio of a node) or on global goodness.
% (3) Minimum Error Pruning (MEP) (no pruning set needed)
%     Prune if static error <= backed-up error
%     Static error at node v: e(v) = (Nc + 1)/(N+k) (Laplace estimate, equal prior for each class)
%       where N is the number of all examples, Nc is the number of majority-class examples, and k is the number of classes
%     Backed-up error at node v: (Ti is the root of the i-th subtree)
%       E(T) = Sum_1_to_n(pi*e(Ti))
% (4) Pessimistic Error Pruning (PEP), used in Quinlan's C4.5 (no pruning set needed; efficient because pruning is top-down)
%     Probability of error (apparent error rate)
%       q = (N-Nc+0.5)/N
%       where N = #examples, Nc = #examples in the majority class
%     Error of a node v (if pruned): q(v) = (Nv - Nc,v + 0.5)/Nv
%     Error of a subtree: q(T) = Sum_of_l_leaves(Nl - Nc,l + 0.5)/Sum_of_l_leaves(Nl)
%     Prune if q(v) <= q(T)
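%     A small worked example of the PEP test (illustrative numbers):
%       node v has Nv = 20 cases, 14 of them in the majority class, so
%         q(v) = (20 - 14 + 0.5)/20 = 0.325
%       its subtree has two leaves with (Nl, Nc,l) = (12, 11) and (8, 6), so
%         q(T) = ((12-11+0.5) + (8-6+0.5)) / (12+8) = 4/20 = 0.2
%       q(v) > q(T), so this subtree would be kept (prune only if q(v) <= q(T))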
%
% Implementation status:
% (1)(2) have been implemented as input parameters of learn_params.
% (4) is implemented in this function
function pruning(fam_ev,node_sizes,node_types)
% PRUNING prune the constructed tree using PEP
% pruning(fam_ev,node_sizes,node_types)
%
% fam_ev(i,j) is the value of attribute i in the j-th training case (for the whole tree); the last row is the class label (self_ev)
% node_sizes(i) is the node size for the i-th node in the family
% node_types(i) is the node type for the i-th node in the family, 0 for a discrete node, 1 for a continuous node
% the global parameter 'tree' stores the input tree and the pruned tree


function split_T = split_cases(fam_ev,node_sizes,node_types,T,node_i, threshhold)
% SPLIT_CASES split the cases T according to the values of node_i in the family
% split_T = split_cases(fam_ev,node_sizes,node_types,T,node_i,threshhold)
%
% fam_ev(i,j) is the value of attribute i in the j-th training case (for the whole tree); the last row is the class label (self_ev)
% node_sizes(i) is the node size for the i-th node in the family
% node_types(i) is the node type for the i-th node in the family, 0 for a discrete node, 1 for a continuous node
% node_i is the attribute on which we split
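%
% Illustrative example (hypothetical values): if node_i is a discrete attribute
% with node_sizes(node_i)=3 and the cases T=[1 2 3 4] take the values [2 1 2 3]
% for node_i, then split_T = { [2], [1 3], [4] }.
% For a continuous attribute, split_T{1} holds the cases with value <= threshhold
% and split_T{2} the cases with value > threshhold.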

if (node_types(node_i)==0) %discrete attribute
  %init the subsets of T
  split_T = cell(1,node_sizes(node_i)); %T will be separated into |node_size of i| subsets according to the different values of node i
  for i=1:node_sizes(node_i) % here we assume that the values of an attribute are 1:node_size
    split_T{i}=zeros(1,0);
  end

  size_t = size(T,2);
  for i=1:size_t
    case_id = T(i);
    %put this case into one subset of split_T according to its value for node_i
    value = fam_ev(node_i,case_id);
    pos = size(split_T{value},2)+1;
    split_T{value}(pos)=case_id; % here we assume the values of an attribute are 1:node_size
  end
else %continuous attribute
  %init the subsets of T
  split_T = cell(1,2); %T will be separated into 2 subsets (<=threshhold) (>threshhold)
  for i=1:2
    split_T{i}=zeros(1,0);
  end

  size_t = size(T,2);
  for i=1:size_t
    case_id = T(i);
    %put this case into one subset of split_T according to its value for node_i
    value = fam_ev(node_i,case_id);
    subset_num=1;
    if (value>threshhold)
      subset_num=2;
    end
    pos = size(split_T{subset_num},2)+1;
    split_T{subset_num}(pos)=case_id;
  end
end


function new_node = split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases, min_gain, T, candidate_attrs, num_cat)
% SPLIT_DTREE Split the tree node containing cases T (T is just a set of indices into the family evidence).
% new_node = split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases, min_gain, T, candidate_attrs, num_cat)
%
% fam_ev(i,j) is the value of attribute i in the j-th training case (for the whole tree); the last row is the class label (self_ev)
% node_sizes{i} is the node size for the i-th node in the family
% node_types{i} is the node type for the i-th node in the family, 0 for a discrete node, 1 for a continuous node
% stop_cases is the threshold on the number of cases below which splitting stops
% min_gain is the minimum gain needed to split a node
% T(i) is the index of the i-th case in the current decision tree node; we need to split it further
% candidate_attrs(i) is the node id of the i-th attribute that still needs to be considered as a split attribute
%%%%% node_id is the index of the current node considered for a split
% num_cat is the number of output categories for the decision tree
% output:
% new_node is the new node created
global tree
global cts_min_gain

size_fam = size(fam_ev,1); %size of the family
output_type = node_types(size_fam); %the type of output for the tree (0 is discrete, 1 is continuous)
size_attrs = size(candidate_attrs,2); %number of candidate attributes
size_t = size(T,2); %number of training cases in this tree node

%(1) computeFrequencyForEachClass(T)
if (output_type==0) %discrete output
  class_freqs = zeros(1,num_cat);
  for i=1:size_t
    case_id = T(i);
    case_class = fam_ev(size_fam,case_id); %get the class label for this case
    class_freqs(case_class)=class_freqs(case_class)+1;
  end
else %cts output
  N = size(fam_ev,2);
  cases_T = fam_ev(size(fam_ev,1),T); %get the output values for cases T
  std_T = std(cases_T);
end

%(2) if OneClass (for discrete output) or same output value (for cts output) or a class with #examples < stop_cases
%      return a leaf;
%    create a decision node N;

% get the majority class in this node
if (output_type == 0)
  top1_class = 0; %the class with the largest number of cases
  top1_class_cases = 0; %the number of cases in top1_class
  [top1_class_cases,top1_class]=max(class_freqs);
end

if (size_t==0) %impossible
  new_node=-1;
  fprintf('Fatal error: please contact the author. \n');
  return;
end

% stop splitting if needed
%   for discrete output: only one class is present
%   for cts output: all output values in the cases are the same
%   too few cases
if ( (output_type==0 & top1_class_cases == size_t) | (output_type==1 & std_T == 0) | (size_t < stop_cases))
  %create one new leaf node
  tree.num_node=tree.num_node+1;
  tree.nodes(tree.num_node).used=1; %flag this node as used (0 means not used; it will be removed from the tree at the end to save memory)
  tree.nodes(tree.num_node).is_leaf=1;
  tree.nodes(tree.num_node).children=[];
  tree.nodes(tree.num_node).split_id=0; %the attribute (parent) id used to split this tree node
  tree.nodes(tree.num_node).split_threshhold=0;
  if (output_type==0)
    tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of the class node

    % tree.nodes(tree.num_node).probs=zeros(1,num_cat); %the prob for each value of the class node
    % tree.nodes(tree.num_node).probs(top1_class)=1; %use the majority class of the parent node, e.g. for a binary class
    %   where the majority is class 2, the CPT is [0 1]
    %   we may need to use a prior to do smoothing, to get [0.001 0.999]
    tree.nodes(tree.num_node).error.self_error=1-top1_class_cases/size_t; %the classification error in this tree node when using the default class
    tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t; %the total classification error in this tree node and its subtree
    tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;
    fprintf('Create leaf node(onecla) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );
  else
    avg_y_T = mean(cases_T);
    tree.nodes(tree.num_node).mean = avg_y_T;
    tree.nodes(tree.num_node).std = std_T;
    fprintf('Create leaf node(samevalue) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);
  end
  new_node = tree.num_node;
  return;
end

%create one new node
tree.num_node=tree.num_node+1;
tree.nodes(tree.num_node).used=1; %flag this node as used (0 means not used; it will be removed from the tree at the end to save memory)
tree.nodes(tree.num_node).is_leaf=1;
tree.nodes(tree.num_node).children=[];
tree.nodes(tree.num_node).split_id=0;
tree.nodes(tree.num_node).split_threshhold=0;
if (output_type==0)
  tree.nodes(tree.num_node).error.self_error=1-top1_class_cases/size_t;
  tree.nodes(tree.num_node).error.all_error=0;
  tree.nodes(tree.num_node).error.all_error_num=0;
else
  avg_y_T = mean(cases_T);
  tree.nodes(tree.num_node).mean = avg_y_T;
  tree.nodes(tree.num_node).std = std_T;
end
new_node = tree.num_node;

%Stop splitting if no attributes are left in this node
if (size_attrs==0)
  if (output_type==0)
    tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of the class node
    tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t;
    tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;
    fprintf('Create leaf node(noattr) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );
  else
    fprintf('Create leaf node(noattr) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);
  end
  return;
end


%(3) for each attribute A
%      ComputeGain(A);
max_gain=0; %the max gain score (information gain or gain ratio for discrete output, the least-squares gain for cts output)
best_attr=0; %the attribute with the max_gain
best_split = []; %the split of T according to the value of best_attr
cur_best_threshhold = 0; %the threshold used to split a continuous attribute
best_threshhold=0;

% compute Info(T) (for discrete output)
if (output_type == 0)
  class_split_T = split_cases(fam_ev,node_sizes,node_types,T,size(fam_ev,1),0); %split cases according to class
  info_T = compute_info (fam_ev, T, class_split_T);
else % compute R(T) (for cts output)
  % N = size(fam_ev,2);
  % cases_T = fam_ev(size(fam_ev,1),T); %get the output values for cases T
  % std_T = std(cases_T);
  % avg_y_T = mean(cases_T);
  sqr_T = cases_T - avg_y_T;
  R_T = sum(sqr_T.*sqr_T)/N; % get R(T) = 1/N * SUM((y-avg_y)^2)
  info_T = R_T;
end

for i=1:(size_fam-1)
  if (myismember(i,candidate_attrs)) %if this attribute is still in the candidate attribute set
    if (node_types(i)==0) %discrete attribute
      split_T = split_cases(fam_ev,node_sizes,node_types,T,i,0); %split cases according to the value of attribute i
      % For cts output, we compute the least-squares gain.
      % For discrete output, we compute the gain ratio.
      cur_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,0,output_type); %gain ratio
    else %cts attribute
      %get the values of this attribute
      ev = fam_ev(:,T);
      values = ev(i,:);
      sort_v = sort(values);
      %remove the duplicate values in sort_v
      v_set = unique(sort_v);
      best_gain = 0;
      best_threshhold = 0;
      best_split1 = [];

      %find the best split for this cts attribute
      % see "Quinlan 96: Improved Use of Continuous Attributes in C4.5"
      for j=1:(size(v_set,2)-1)
        mid_v = (v_set(j)+v_set(j+1))/2;
        split_T = split_cases(fam_ev,node_sizes,node_types,T,i,mid_v); %split cases according to the value of attribute i (<=mid_v)
        % For cts output, we compute the least-squares gain.
        % For discrete output, we follow Quinlan 96: use information gain instead of gain ratio to select the threshold.
        cur_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,1,output_type);
        %if (i==6)
        %  fprintf('gain %8.5f threshhold %6.3f spliting %d\n', cur_gain, mid_v, size(split_T{1},2));
        %end

        if (best_gain < cur_gain)
          best_gain = cur_gain;
          best_threshhold = mid_v;
          %best_split1 = split_T; %here we would need to copy the array, which is not good!!! (maybe we can compute it after we get best_attr)
        end
      end
      %recalculate the gain_ratio of the best_threshhold
      split_T = split_cases(fam_ev,node_sizes,node_types,T,i,best_threshhold);
      best_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,0,output_type); %gain_ratio
      if (output_type==0) %for discrete output
        cur_gain = best_gain-log2(size(v_set,2)-1)/size_t; % Quinlan 96: use gain_ratio-log2(N-1)/|D| as the gain of this attr
      else %for cts output
        cur_gain = best_gain;
      end
    end

    if (max_gain < cur_gain)
      max_gain = cur_gain;
      best_attr = i;
      cur_best_threshhold=best_threshhold; %save the threshold
      %best_split = split_T; %here we would need to copy the array, which is not good!!! So we recalculate it below, after the best attribute is chosen
    end
  end
end

% stop splitting if the gain is too small
if (max_gain==0 | (output_type==0 & max_gain < min_gain) | (output_type==1 & max_gain < cts_min_gain))
  if (output_type==0)
    tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of the class node
    tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t;
    tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;
    fprintf('Create leaf node(nogain) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );
  else
    fprintf('Create leaf node(nogain) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);
  end
  return;
end

%get the split of the cases according to the best split attribute
if (node_types(best_attr)==0) %discrete attribute
  best_split = split_cases(fam_ev,node_sizes,node_types,T,best_attr,0);
else
  best_split = split_cases(fam_ev,node_sizes,node_types,T,best_attr,cur_best_threshhold);
end

%(4) best_attr = AttributeWithBestGain;
%(5) if best_attr is continuous ???? why is this needed? maybe the value in the decision tree must appear in the data
%      find the threshold among all cases that is <= max_V
%      change the split of T
tree.nodes(tree.num_node).split_id=best_attr;
tree.nodes(tree.num_node).split_threshhold=cur_best_threshhold; %for cts attributes only

%note: the threshold readjustment below is a linear search, so it is slow. A better method is described in the paper "Efficient C4.5"
%if (output_type==0)
if (node_types(best_attr)==1) %is a continuous attribute
  %find the value that approximates best_threshhold from below (the largest value <= best_threshhold)
  best_value=0;
  for i=1:size(fam_ev,2) %note: we need to search over all cases for the whole tree, not just the cases in this node
    val = fam_ev(best_attr,i);
    if (val <= cur_best_threshhold & val > best_value) %val is closer to best_threshhold
      best_value=val;
    end
  end
  tree.nodes(tree.num_node).split_threshhold=best_value; %for cts attributes only
end
%end

if (output_type == 0)
  fprintf('Create node %d split at %d gain %8.4f Th %d. Class %d Cases %d Error %d \n',tree.num_node, best_attr, max_gain, tree.nodes(tree.num_node).split_threshhold, top1_class, size_t, size_t - top1_class_cases );
else
  fprintf('Create node %d split at %d gain %8.4f Th %d. Mean %8.4f Cases %d\n',tree.num_node, best_attr, max_gain, tree.nodes(tree.num_node).split_threshhold, avg_y_T, size_t );
end

%(6) Foreach T' in the split_T
%      if T' is Empty
%        Child of node_id is a leaf
%      else
%        Child of node_id = split_dtree (T')
tree.nodes(new_node).is_leaf=0; %because this node will be split, it is no longer a leaf
for i=1:size(best_split,2)
  if (size(best_split{i},2)==0) %T(i) is empty
    %create one new leaf node
    tree.num_node=tree.num_node+1;
    tree.nodes(tree.num_node).used=1; %flag this node as used (0 means not used; it will be removed from the tree at the end to save memory)
    tree.nodes(tree.num_node).is_leaf=1;
    tree.nodes(tree.num_node).children=[];
    tree.nodes(tree.num_node).split_id=0;
    tree.nodes(tree.num_node).split_threshhold=0;
    if (output_type == 0)
      tree.nodes(tree.num_node).probs=zeros(1,num_cat); %the prob for each value of the class node
      tree.nodes(tree.num_node).probs(top1_class)=1; %use the majority class of the parent node, e.g. for a binary class
      %  where the majority is class 2, the CPT is [0 1]
      %  we may need to use a prior to do smoothing, to get [0.001 0.999]
      tree.nodes(tree.num_node).error.self_error=0;
      tree.nodes(tree.num_node).error.all_error=0;
      tree.nodes(tree.num_node).error.all_error_num=0;
    else
      tree.nodes(tree.num_node).mean = avg_y_T; %just use the parent node's mean value
      tree.nodes(tree.num_node).std = std_T;
    end
    %add the new leaf node to its parent
    num_children=size(tree.nodes(new_node).children,2);
    tree.nodes(new_node).children(num_children+1)=tree.num_node;
    if (output_type==0)
      fprintf('Create leaf node(nullset) %d. %d-th child of Father %d Class %d\n',tree.num_node, i, new_node, top1_class );
    else
      fprintf('Create leaf node(nullset) %d. %d-th child of Father %d \n',tree.num_node, i, new_node );
    end

  else
    if (node_types(best_attr)==0) % if the attr is discrete, it should be removed from the candidate set
      new_candidate_attrs = mysetdiff(candidate_attrs,[best_attr]);
    else
      new_candidate_attrs = candidate_attrs;
    end
    new_sub_node = split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases, min_gain, best_split{i}, new_candidate_attrs, num_cat);
    %tree.nodes(parent_id).error.all_error += tree.nodes(new_sub_node).error.all_error;
    fprintf('Add subtree node %d to %d. #nodes %d\n',new_sub_node,new_node, tree.num_node );

    % tree.nodes(new_node).error.all_error_num = tree.nodes(new_node).error.all_error_num + tree.nodes(new_sub_node).error.all_error_num;
    %add the new subtree node to its parent
    num_children=size(tree.nodes(new_node).children,2);
    tree.nodes(new_node).children(num_children+1)=new_sub_node;
  end
end

%(7) Compute the errors of N, for pruning
%    get the total error for the subtree
if (output_type==0)
  tree.nodes(new_node).error.all_error=tree.nodes(new_node).error.all_error_num/size_t;
end
%pruning could be done here, but doing it here is not so efficient because it is bottom-up
%if tree.nodes()
%after pruning, we need to update all_error to self_error

%(8) Return N


%(1) For discrete output, we use GainRatio, defined as
%      GainRatio(X,T) = Gain(X,T) / SplitInfo(X,T)
%    where
%      Gain(X,T) = Info(T) - Info(X,T)
%      Info(X,T) = Sum for i from 1 to n of ( |Ti|/|T| * Info(Ti) )
%
%    SplitInfo(D,T) is the information due to the split of T on the basis
%    of the value of the categorical attribute D. Thus SplitInfo(D,T) is
%      I(|T1|/|T|, |T2|/|T|, .., |Tm|/|T|)
%    where {T1, T2, .. Tm} is the partition of T induced by the value of D.
%
%    Definition of Info(Ti):
%    If a set T of records is partitioned into disjoint exhaustive classes C1, C2, .., Ck on the basis of the
%    value of the categorical attribute, then the information needed to identify the class of an element of T
%    is Info(T) = I(P), where P is the probability distribution of the partition (C1, C2, .., Ck):
%      P = (|C1|/|T|, |C2|/|T|, ..., |Ck|/|T|)
%    Here I(P) is defined as
%      I(P) = -(p1*log(p1) + p2*log(p2) + .. + pn*log(pn))
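%    For example, a 50/50 two-class split P = (0.5, 0.5) gives
%      I(P) = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit,
%    while a pure node P = (1, 0) gives I(P) = 0.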
%
%(2) For continuous output (regression tree), we use the least-squares score (adapted from Leo Breiman's book "Classification and regression trees", page 231).
%    The original supports only binary splits; we extend it to permit multi-way splits.
%
%      Delta_R = R(T) - Sum over all child nodes Ti of R(Ti)
%    where
%      R(Ti) = 1/N * Sum over all cases i in node Ti of (yi - avg_y(Ti))^2
%    Here N is the number of all training cases used to construct the regression tree and
%    avg_y(Ti) is the average value of the output variable over the cases in node Ti.
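%
%    A small worked example (illustrative numbers): with N = 4 training cases whose
%    output values at node T are y = [1 1 3 3],
%      R(T) = ((1-2)^2 + (1-2)^2 + (3-2)^2 + (3-2)^2)/4 = 1
%    If a split separates them into T1 = {1,1} and T2 = {3,3}, then
%      R(T1) = R(T2) = 0, so Delta_R = 1 - 0 = 1 (the largest possible reduction).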

function gain_score = compute_gain (fam_ev, node_sizes, node_types, T, info_T, attr_id, split_T, score_type, output_type)
% COMPUTE_GAIN Compute the score for the split of cases T using attribute attr_id
% gain_score = compute_gain (fam_ev, node_sizes, node_types, T, info_T, attr_id, split_T, score_type, output_type)
%
% fam_ev(i,j) is the value of attribute i in the j-th training case; the last row is the class label (self_ev)
% T(i) is the index of the i-th case in the current decision tree node; we need to split it further
% attr_id is the index of the current node considered for a split
% split_T{i} is the i-th subset in the partition of cases T according to the value of attribute attr_id
% score_type: 0 for gain ratio, 1 for information gain (only applies to discrete output)
% node_size(i) is the node size of the i-th node in the family
% output_type: 0 means discrete output, 1 means continuous output.
gain_score=0;
% *********** for DISCRETE output *******************************************************
if (output_type == 0)
  % compute Info(T)
  total_cnt = size(T,2);
  if (total_cnt==0)
    return;
  end;
  %class_split_T = split_cases(fam_ev,node_sizes,node_types,T,size(fam_ev,1),0); %split cases according to class
  %info_T = compute_info (fam_ev, T, class_split_T);

  % compute Info(X,T)
  num_class = size(split_T,2);
  subset_sizes = zeros(1,num_class);
  info_ti = zeros(1,num_class);
  for i=1:num_class
    subset_sizes(i)=size(split_T{i},2);
    if (subset_sizes(i)~=0)
      class_split_Ti = split_cases(fam_ev,node_sizes,node_types,split_T{i},size(fam_ev,1),0); %split cases according to class
      info_ti(i) = compute_info(fam_ev, split_T{i}, class_split_Ti);
    end
  end
  ti_ratios = subset_sizes/total_cnt; %get the |Ti|/|T|
  info_X_T = sum(ti_ratios.*info_ti);

  %get Gain(X,T)
  gain_X_T = info_T - info_X_T;

  if (score_type == 1) %information gain
    gain_score=gain_X_T;
    return;
  end
  %compute the SplitInfo(X,T) (is this also for cts attrs, which split into only two subsets?)
  splitinfo_T = compute_info (fam_ev, T, split_T);
  if (splitinfo_T~=0)
    gain_score = gain_X_T/splitinfo_T;
  end

% ************ for continuous output **************************************************
else
  N = size(fam_ev,2);

  % compute R(Ti)
  num_class = size(split_T,2);
  R_Ti = zeros(1,num_class);
  for i=1:num_class
    if (size(split_T{i},2)~=0)
      cases_T = fam_ev(size(fam_ev,1),split_T{i});
      avg_y_T = mean(cases_T);
      sqr_T = cases_T - avg_y_T;
      R_Ti(i) = sum(sqr_T.*sqr_T)/N; % get R(Ti) = 1/N * SUM((y-avg_y)^2)
    end
  end
  %delta_R = R(T) - SUM(R(Ti))
  gain_score = info_T - sum(R_Ti);

end


% Definition of Info(Ti):
% If a set T of records is partitioned into disjoint exhaustive classes C1, C2, .., Ck on the basis of the
% value of the categorical attribute, then the information needed to identify the class of an element of T
% is Info(T) = I(P), where P is the probability distribution of the partition (C1, C2, .., Ck):
%   P = (|C1|/|T|, |C2|/|T|, ..., |Ck|/|T|)
% Here I(P) is defined as
%   I(P) = -(p1*log(p1) + p2*log(p2) + .. + pn*log(pn))
function info = compute_info (fam_ev, T, split_T)
% COMPUTE_INFO compute the information for the split of T into split_T
% info = compute_info (fam_ev, T, split_T)

total_cnt = size(T,2);
num_class = size(split_T,2);
subset_sizes = zeros(1,num_class);
probs = zeros(1,num_class);
log_probs = zeros(1,num_class);
for i=1:num_class
  subset_sizes(i)=size(split_T{i},2);
end

probs = subset_sizes/total_cnt;
%log_probs = log2(probs); % if probs(i)=0, then log2(probs(i)) would be -Inf
for i=1:size(probs,2)
  if (probs(i)~=0)
    log_probs(i)=log2(probs(i));
  end
end

info = sum(-(probs.*log_probs));