camir-aes2014: toolboxes/FullBNT-1.0.7/bnt/CPDs/@tree_CPD/learn

annotate toolboxes/FullBNT-1.0.7/bnt/CPDs/@tree_CPD/learn_params.m @ 0:e9a9cd732c1e tip

first hg version after svn

author	wolffd
date	Tue, 10 Feb 2015 15:05:51 +0000
parents
children

rev	line source
wolffd@0	1 function CPD = learn_params(CPD, fam, data, ns, cnodes, varargin)
wolffd@0	2 % LEARN_PARAMS Construct classification/regression tree given complete data
wolffd@0	3 % CPD = learn_params(CPD, fam, data, ns, cnodes)
wolffd@0	4 %
wolffd@0	5 % fam(i) is the node id of the i-th node in the family of nodes, self node is the last one
wolffd@0	6 % data(i,m) is the value of node i in case m (can be cell array).
wolffd@0	7 % ns(i) is the node size for the i-th node in the whold bnet
wolffd@0	8 % cnodes(i) is the node id for the i-th continuous node in the whole bnet
wolffd@0	9 %
wolffd@0	10 % The following optional arguments can be specified in the form of name/value pairs:
wolffd@0	11 % stop_cases: for early stop (pruning). A node is not split if it has less than k cases. default is 0.
wolffd@0	12 % min_gain: for early stop (pruning).
wolffd@0	13 % For discrete output: A node is not split when the gain of best split is less than min_gain. default is 0.
wolffd@0	14 % For continuous (cts) outpt: A node is not split when the gain of best split is less than min_gain*score(root)
wolffd@0	15 % (we denote it cts_min_gain). default is 0.006
wolffd@0	16 % %%%%%%%%%%%%%%%%%%%Struction definition of dtree_CPD.tree%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
wolffd@0	17 % tree.num_node the last position in tree.nodes array for adding new nodes,
wolffd@0	18 % it is not always same to number of nodes in a tree, because some position in the
wolffd@0	19 % tree.nodes array can be set to unused (e.g. in tree pruning)
wolffd@0	20 % tree.nodes is the array of nodes in the tree plus some unused nodes.
wolffd@0	21 % tree.nodes(1) is the root for the tree.
wolffd@0	22 %
wolffd@0	23 % Below is the attributes for each node
wolffd@0	24 % tree.nodes(i).used; % flag this node is used (0 means node not used, it can be removed from tree to save memory)
wolffd@0	25 % tree.nodes(i).is_leaf; % if 1 means this node is a leaf, if 0 not a leaf.
wolffd@0	26 % tree.nodes(i).children; % children(i) is the node number in tree.nodes array for the i-th child node
wolffd@0	27 % tree.nodes(i).split_id; % the attribute id used to split this node
wolffd@0	28 % tree.nodes(i).split_threshhold; % the threshhold for continuous attribute to split this node
wolffd@0	29 % %%%%%attributes specially for classification tree (discrete output)
wolffd@0	30 % tree.nodes(i).probs % probs(i) is the prob for i-th value of class node
wolffd@0	31 % % For three output class, the probs = [0.9 0.1 0.0] means the probability of
wolffd@0	32 % % class 1 is 0.9, for class 2 is 0.1, for class 3 is 0.0.
wolffd@0	33 % %%%%%attributes specially for regression tree (continuous output)
wolffd@0	34 % tree.nodes(i).mean % mean output value for this node
wolffd@0	35 % tree.nodes(i).std % standard deviation for output values in this node
wolffd@0	36 %
wolffd@0	37 % Author: yimin.zhang@intel.com
wolffd@0	38 % Last updated: Jan. 19, 2002
wolffd@0	39
wolffd@0	40 % Want list:
wolffd@0	41 % (1) more efficient for cts attributes: get the values of cts attributes at first (the begining of build_tree function), then doing bi_search in finding threshhold
wolffd@0	42 % (2) pruning classification tree using Pessimistic Error Pruning
wolffd@0	43 % (3) bi_search for strings (used for transform data to BNT format)
wolffd@0	44
wolffd@0	45 global tree %tree must be global so that it can be accessed in recursive slitting function
wolffd@0	46 global cts_min_gain
wolffd@0	47 tree=[]; % clear the tree
wolffd@0	48 tree.num_node=0;
wolffd@0	49 cts_min_gain=0;
wolffd@0	50
wolffd@0	51 stop_cases=0;
wolffd@0	52 min_gain=0;
wolffd@0	53
wolffd@0	54 args = varargin;
wolffd@0	55 nargs = length(args);
wolffd@0	56 if (nargs>0)
wolffd@0	57 if isstr(args{1})
wolffd@0	58 for i=1:2:nargs
wolffd@0	59 switch args{i},
wolffd@0	60 case 'stop_cases', stop_cases = args{i+1};
wolffd@0	61 case 'min_gain', min_gain = args{i+1};
wolffd@0	62 end
wolffd@0	63 end
wolffd@0	64 else
wolffd@0	65 error(['error in input parameters']);
wolffd@0	66 end
wolffd@0	67 end
wolffd@0	68
wolffd@0	69 if iscell(data)
wolffd@0	70 local_data = cell2num(data(fam,:));
wolffd@0	71 else
wolffd@0	72 local_data = data(fam, :);
wolffd@0	73 end
wolffd@0	74 %counts = compute_counts(local_data, CPD.sizes);
wolffd@0	75 %CPD.CPT = mk_stochastic(counts + CPD.prior); % bug fix 11/5/01
wolffd@0	76 node_types = zeros(1,size(ns,2)); %all nodes are disrete
wolffd@0	77 node_types(cnodes)=1;
wolffd@0	78 %make the data be BNT compliant (values for discrete nodes are from 1-n, here n is the node size)
wolffd@0	79 %trans_data=transform_data(local_data,'tmp.dat',[]); %here no cts nodes
wolffd@0	80
wolffd@0	81 build_dtree (CPD, local_data, ns(fam), node_types(fam),stop_cases,min_gain);
wolffd@0	82 %CPD.tree=copy_tree(tree);
wolffd@0	83 CPD.tree=tree; %copy the tree constructed to CPD
wolffd@0	84
wolffd@0	85
wolffd@0	86 function new_tree = copy_tree(tree)
wolffd@0	87 % copy the tree to new_tree
wolffd@0	88 new_tree.num_node=tree.num_node;
wolffd@0	89 new_tree.root = tree.root;
wolffd@0	90 for i=1:tree.num_node
wolffd@0	91 new_tree.nodes(i)=tree.nodes(i);
wolffd@0	92 end
wolffd@0	93
wolffd@0	94
wolffd@0	95 function build_dtree (CPD, fam_ev, node_sizes, node_types,stop_cases,min_gain)
wolffd@0	96 global tree
wolffd@0	97 global cts_min_gain
wolffd@0	98
wolffd@0	99 tree.num_node=0; %the current number of nodes in the tree
wolffd@0	100 tree.root=1;
wolffd@0	101
wolffd@0	102 T = 1:size(fam_ev,2) ; %all cases
wolffd@0	103 candidate_attrs = 1:(size(node_sizes,2)-1); %all attributes
wolffd@0	104 node_id=1; %the root node
wolffd@0	105 lastnode=size(node_sizes,2); %the last element in all nodes is the dependent variable (category node)
wolffd@0	106 num_cat=node_sizes(lastnode);
wolffd@0	107
wolffd@0	108 % get minimum gain for cts output (used in stop splitting)
wolffd@0	109 if (node_types(size(fam_ev,1))==1) %cts output
wolffd@0	110 N = size(fam_ev,2);
wolffd@0	111 output_id = size(fam_ev,1);
wolffd@0	112 cases_T = fam_ev(output_id,:); %get all the output value for cases T
wolffd@0	113 std_T = std(cases_T);
wolffd@0	114 avg_y_T = mean(cases_T);
wolffd@0	115 sqr_T = cases_T - avg_y_T;
wolffd@0	116 cts_min_gain = min_gain(sum(sqr_T.sqr_T)/N); % min_gain * (R(root) = 1/N * SUM(y-avg_y)^2)
wolffd@0	117 end
wolffd@0	118
wolffd@0	119 split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases,min_gain, T, candidate_attrs, num_cat);
wolffd@0	120
wolffd@0	121
wolffd@0	122
wolffd@0	123 % pruning method
wolffd@0	124 % (1) Restrictions on minimum node size: A node is not split if it has smaller than k cases.
wolffd@0	125 % (2) Threshholds on impurity: a threshhold is imposed on the splitting test score. Threshhold can be
wolffd@0	126 % imposed on local goodness measure (the gain_ratio of a node) or global goodness.
wolffd@0	127 % (3) Mininum Error Pruning (MEP), (no need pruning set)
wolffd@0	128 % Prune if static error<=backed-up error
wolffd@0	129 % Static error at node v: e(v) = (Nc + 1)/(N+k) (laplace estimate, prior for each class equal)
wolffd@0	130 % here N is # of all examples, Nc is # of majority class examples, k is number of classes
wolffd@0	131 % Backed-up error at node v: (Ti is the i-th subtree root)
wolffd@0	132 % E(T) = Sum_1_to_n(pi*e(Ti))
wolffd@0	133 % (4) Pessimistic Error Pruning (PEP), used in Quilan C4.5 (no need pruning set, efficient because of pruning top-down)
wolffd@0	134 % Probability of error (apparent error rate)
wolffd@0	135 % q = (N-Nc+0.5)/N
wolffd@0	136 % where N=#examples, Nc=#examples in majority class
wolffd@0	137 % Error of a node v (if pruned) q(v)= (Nv- Nc,v + 0.5)/Nv
wolffd@0	138 % Error of a subtree q(T)= Sum_of_l_leaves(Nl - Nc,l + 0.5)/Sum_of_l_leaves(Nl)
wolffd@0	139 % Prune if q(v)<=q(T)
wolffd@0	140 %
wolffd@0	141 % Implementation statuts:
wolffd@0	142 % (1)(2) has been implemented as the input parameters of learn_params.
wolffd@0	143 % (4) is implemented in this function
wolffd@0	144 function pruning(fam_ev,node_sizes,node_types)
wolffd@0	145 % PRUNING prune the constructed tree using PEP
wolffd@0	146 % pruning(fam_ev,node_sizes,node_types)
wolffd@0	147 %
wolffd@0	148 % fam_ev(i,j) is the value of attribute i in j-th training cases (for whole tree), the last row is for the class label (self_ev)
wolffd@0	149 % node_sizes(i) is the node size for the i-th node in the family
wolffd@0	150 % node_types(i) is the node type for the i-th node in the family, 0 for disrete node, 1 for continous node
wolffd@0	151 % the global parameter 'tree' is for storing the input tree and the pruned tree
wolffd@0	152
wolffd@0	153
wolffd@0	154 function split_T = split_cases(fam_ev,node_sizes,node_types,T,node_i, threshhold)
wolffd@0	155 % SPLIT_CASES split the cases T according to values of node_i in the family
wolffd@0	156 % split_T = split_cases(fam_ev,node_sizes,node_types,T,node_i)
wolffd@0	157 %
wolffd@0	158 % fam_ev(i,j) is the value of attribute i in j-th training cases (for whole tree), the last row is for the class label (self_ev)
wolffd@0	159 % node_sizes(i) is the node size for the i-th node in the family
wolffd@0	160 % node_types(i) is the node type for the i-th node in the family, 0 for disrete node, 1 for continous node
wolffd@0	161 % node_i is the attribute we need to split
wolffd@0	162
wolffd@0	163 if (node_types(node_i)==0) %discrete attribute
wolffd@0	164 %init the subsets of T
wolffd@0	165 split_T = cell(1,node_sizes(node_i)); %T will be separated into \|node_size of i\| subsets according to different values of node i
wolffd@0	166 for i=1:node_sizes(node_i) % here we assume that the value of an attribute is 1:node_size
wolffd@0	167 split_T{i}=zeros(1,0);
wolffd@0	168 end
wolffd@0	169
wolffd@0	170 size_t = size(T,2);
wolffd@0	171 for i=1:size_t
wolffd@0	172 case_id = T(i);
wolffd@0	173 %put this case into one subset of split_T according to its value for node_i
wolffd@0	174 value = fam_ev(node_i,case_id);
wolffd@0	175 pos = size(split_T{value},2)+1;
wolffd@0	176 split_T{value}(pos)=case_id; % here assumes the value of an attribute is 1:node_size
wolffd@0	177 end
wolffd@0	178 else %continuous attribute
wolffd@0	179 %init the subsets of T
wolffd@0	180 split_T = cell(1,2); %T will be separated into 2 subsets (<=threshhold) (>threshhold)
wolffd@0	181 for i=1:2
wolffd@0	182 split_T{i}=zeros(1,0);
wolffd@0	183 end
wolffd@0	184
wolffd@0	185 size_t = size(T,2);
wolffd@0	186 for i=1:size_t
wolffd@0	187 case_id = T(i);
wolffd@0	188 %put this case into one subset of split_T according to its value for node_i
wolffd@0	189 value = fam_ev(node_i,case_id);
wolffd@0	190 subset_num=1;
wolffd@0	191 if (value>threshhold)
wolffd@0	192 subset_num=2;
wolffd@0	193 end
wolffd@0	194 pos = size(split_T{subset_num},2)+1;
wolffd@0	195 split_T{subset_num}(pos)=case_id;
wolffd@0	196 end
wolffd@0	197 end
wolffd@0	198
wolffd@0	199
wolffd@0	200
wolffd@0	201 function new_node = split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases, min_gain, T, candidate_attrs, num_cat)
wolffd@0	202 % SPLIT_TREE Split the tree at node node_id with cases T (actually it is just indexes to family evidences).
wolffd@0	203 % new_node = split_dtree (fam_ev, node_sizes, node_types, T, node_id, num_cat, method)
wolffd@0	204 %
wolffd@0	205 % fam_ev(i,j) is the value of attribute i in j-th training cases (for whole tree), the last row is for the class label (self_ev)
wolffd@0	206 % node_sizes{i} is the node size for the i-th node in the family
wolffd@0	207 % node_types{i} is the node type for the i-th node in the family, 0 for disrete node, 1 for continous node
wolffd@0	208 % stop_cases is the threshold of number of cases to stop slitting
wolffd@0	209 % min_gain is the minimum gain need to split a node
wolffd@0	210 % T(i) is the index of i-th cases in current decision tree node, we need split it further
wolffd@0	211 % candidate_attrs(i) the node id for the i-th attribute that still need to be considered as split attribute
wolffd@0	212 %%%%% node_id is the index of current node considered for a split
wolffd@0	213 % num_cat is the number of output categories for the decision tree
wolffd@0	214 % output:
wolffd@0	215 % new_node is the new node created
wolffd@0	216 global tree
wolffd@0	217 global cts_min_gain
wolffd@0	218
wolffd@0	219 size_fam = size(fam_ev,1); %number of family size
wolffd@0	220 output_type = node_types(size_fam); %the type of output for the tree (0 is discrete, 1 is continuous)
wolffd@0	221 size_attrs = size(candidate_attrs,2); %number of candidate attributes
wolffd@0	222 size_t = size(T,2); %number of training cases in this tree node
wolffd@0	223
wolffd@0	224 %(1)computeFrequenceyForEachClass(T)
wolffd@0	225 if (output_type==0) %discrete output
wolffd@0	226 class_freqs = zeros(1,num_cat);
wolffd@0	227 for i=1:size_t
wolffd@0	228 case_id = T(i);
wolffd@0	229 case_class = fam_ev(size_fam,case_id); %get the class label for this case
wolffd@0	230 class_freqs(case_class)=class_freqs(case_class)+1;
wolffd@0	231 end
wolffd@0	232 else %cts output
wolffd@0	233 N = size(fam_ev,2);
wolffd@0	234 cases_T = fam_ev(size(fam_ev,1),T); %get the output value for cases T
wolffd@0	235 std_T = std(cases_T);
wolffd@0	236 end
wolffd@0	237
wolffd@0	238 %(2) if OneClass (for discrete output) or same output value (for cts output) or Class With #examples < stop_cases
wolffd@0	239 % return a leaf;
wolffd@0	240 % create a decision node N;
wolffd@0	241
wolffd@0	242 % get majority class in this node
wolffd@0	243 if (output_type == 0)
wolffd@0	244 top1_class = 0; %the class with the largest number of cases
wolffd@0	245 top1_class_cases = 0; %the number of cases in top1_class
wolffd@0	246 [top1_class_cases,top1_class]=max(class_freqs);
wolffd@0	247 end
wolffd@0	248
wolffd@0	249 if (size_t==0) %impossble
wolffd@0	250 new_node=-1;
wolffd@0	251 fprintf('Fatal error: please contact the author. \n');
wolffd@0	252 return;
wolffd@0	253 end
wolffd@0	254
wolffd@0	255 % stop splitting if needed
wolffd@0	256 %for discrete output: one class
wolffd@0	257 %for cts output, all output value in cases are same
wolffd@0	258 %cases too little
wolffd@0	259 if ( (output_type==0 & top1_class_cases == size_t) \| (output_type==1 & std_T == 0) \| (size_t < stop_cases))
wolffd@0	260 %create one new leaf node
wolffd@0	261 tree.num_node=tree.num_node+1;
wolffd@0	262 tree.nodes(tree.num_node).used=1; %flag this node is used (0 means node not used, it will be removed from tree at last to save memory)
wolffd@0	263 tree.nodes(tree.num_node).is_leaf=1;
wolffd@0	264 tree.nodes(tree.num_node).children=[];
wolffd@0	265 tree.nodes(tree.num_node).split_id=0; %the attribute(parent) id to split this tree node
wolffd@0	266 tree.nodes(tree.num_node).split_threshhold=0;
wolffd@0	267 if (output_type==0)
wolffd@0	268 tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of class node
wolffd@0	269
wolffd@0	270 % tree.nodes(tree.num_node).probs=zeros(1,num_cat); %the prob for each value of class node
wolffd@0	271 % tree.nodes(tree.num_node).probs(top1_class)=1; %use the majority class of parent node, like for binary class,
wolffd@0	272 %and majority is class 2, then the CPT is [0 1]
wolffd@0	273 %we may need to use prior to do smoothing, to get [0.001 0.999]
wolffd@0	274 tree.nodes(tree.num_node).error.self_error=1-top1_class_cases/size_t; %the classfication error in this tree node when use default class
wolffd@0	275 tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t; %no total classfication error in this tree node and its subtree
wolffd@0	276 tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;
wolffd@0	277 fprintf('Create leaf node(onecla) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );
wolffd@0	278 else
wolffd@0	279 avg_y_T = mean(cases_T);
wolffd@0	280 tree.nodes(tree.num_node).mean = avg_y_T;
wolffd@0	281 tree.nodes(tree.num_node).std = std_T;
wolffd@0	282 fprintf('Create leaf node(samevalue) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);
wolffd@0	283 end
wolffd@0	284 new_node = tree.num_node;
wolffd@0	285 return;
wolffd@0	286 end
wolffd@0	287
wolffd@0	288 %create one new node
wolffd@0	289 tree.num_node=tree.num_node+1;
wolffd@0	290 tree.nodes(tree.num_node).used=1; %flag this node is used (0 means node not used, it will be removed from tree at last to save memory)
wolffd@0	291 tree.nodes(tree.num_node).is_leaf=1;
wolffd@0	292 tree.nodes(tree.num_node).children=[];
wolffd@0	293 tree.nodes(tree.num_node).split_id=0;
wolffd@0	294 tree.nodes(tree.num_node).split_threshhold=0;
wolffd@0	295 if (output_type==0)
wolffd@0	296 tree.nodes(tree.num_node).error.self_error=1-top1_class_cases/size_t;
wolffd@0	297 tree.nodes(tree.num_node).error.all_error=0;
wolffd@0	298 tree.nodes(tree.num_node).error.all_error_num=0;
wolffd@0	299 else
wolffd@0	300 avg_y_T = mean(cases_T);
wolffd@0	301 tree.nodes(tree.num_node).mean = avg_y_T;
wolffd@0	302 tree.nodes(tree.num_node).std = std_T;
wolffd@0	303 end
wolffd@0	304 new_node = tree.num_node;
wolffd@0	305
wolffd@0	306 %Stop splitting if no attributes left in this node
wolffd@0	307 if (size_attrs==0)
wolffd@0	308 if (output_type==0)
wolffd@0	309 tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of class node
wolffd@0	310 tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t;
wolffd@0	311 tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;
wolffd@0	312 fprintf('Create leaf node(noattr) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );
wolffd@0	313 else
wolffd@0	314 fprintf('Create leaf node(noattr) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);
wolffd@0	315 end
wolffd@0	316 return;
wolffd@0	317 end
wolffd@0	318
wolffd@0	319
wolffd@0	320 %(3) for each attribute A
wolffd@0	321 % ComputeGain(A);
wolffd@0	322 max_gain=0; %the max gain score (for discrete information gain or gain ration, for cts node the R(T))
wolffd@0	323 best_attr=0; %the attribute with the max_gain
wolffd@0	324 best_split = []; %the split of T according to the value of best_attr
wolffd@0	325 cur_best_threshhold = 0; %the threshhold for split continuous attribute
wolffd@0	326 best_threshhold=0;
wolffd@0	327
wolffd@0	328 % compute Info(T) (for discrete output)
wolffd@0	329 if (output_type == 0)
wolffd@0	330 class_split_T = split_cases(fam_ev,node_sizes,node_types,T,size(fam_ev,1),0); %split cases according to class
wolffd@0	331 info_T = compute_info (fam_ev, T, class_split_T);
wolffd@0	332 else % compute R(T) (for cts output)
wolffd@0	333 % N = size(fam_ev,2);
wolffd@0	334 % cases_T = fam_ev(size(fam_ev,1),T); %get the output value for cases T
wolffd@0	335 % std_T = std(cases_T);
wolffd@0	336 % avg_y_T = mean(cases_T);
wolffd@0	337 sqr_T = cases_T - avg_y_T;
wolffd@0	338 R_T = sum(sqr_T.sqr_T)/N; % get R(T) = 1/N SUM(y-avg_y)^2
wolffd@0	339 info_T = R_T;
wolffd@0	340 end
wolffd@0	341
wolffd@0	342 for i=1:(size_fam-1)
wolffd@0	343 if (myismember(i,candidate_attrs)) %if this attribute still in the candidate attribute set
wolffd@0	344 if (node_types(i)==0) %discrete attibute
wolffd@0	345 split_T = split_cases(fam_ev,node_sizes,node_types,T,i,0); %split cases according to value of attribute i
wolffd@0	346 % For cts output, we compute the least square gain.
wolffd@0	347 % For discrete output, we compute gain ratio
wolffd@0	348 cur_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,0,output_type); %gain ratio
wolffd@0	349 else %cts attribute
wolffd@0	350 %get the values of this attribute
wolffd@0	351 ev = fam_ev(:,T);
wolffd@0	352 values = ev(i,:);
wolffd@0	353 sort_v = sort(values);
wolffd@0	354 %remove the duplicate values in sort_v
wolffd@0	355 v_set = unique(sort_v);
wolffd@0	356 best_gain = 0;
wolffd@0	357 best_threshhold = 0;
wolffd@0	358 best_split1 = [];
wolffd@0	359
wolffd@0	360 %find the best split for this cts attribute
wolffd@0	361 % see "Quilan 96: Improved Use of Continuous Attributes in C4.5"
wolffd@0	362 for j=1:(size(v_set,2)-1)
wolffd@0	363 mid_v = (v_set(j)+v_set(j+1))/2;
wolffd@0	364 split_T = split_cases(fam_ev,node_sizes,node_types,T,i,mid_v); %split cases according to value of attribute i (<=mid_v)
wolffd@0	365 % For cts output, we compute the least square gain.
wolffd@0	366 % For discrete output, we use Quilan 96: use information gain instead of gain ratio to select threshhold
wolffd@0	367 cur_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,1,output_type);
wolffd@0	368 %if (i==6)
wolffd@0	369 % fprintf('gain %8.5f threshhold %6.3f spliting %d\n', cur_gain, mid_v, size(split_T{1},2));
wolffd@0	370 %end
wolffd@0	371
wolffd@0	372 if (best_gain < cur_gain)
wolffd@0	373 best_gain = cur_gain;
wolffd@0	374 best_threshhold = mid_v;
wolffd@0	375 %best_split1 = split_T; %here we need to copy array, not good!!! (maybe we can compute after we get best_attr
wolffd@0	376 end
wolffd@0	377 end
wolffd@0	378 %recalculate the gain_ratio of the best_threshhold
wolffd@0	379 split_T = split_cases(fam_ev,node_sizes,node_types,T,i,best_threshhold);
wolffd@0	380 best_gain = compute_gain(fam_ev,node_sizes,node_types,T,info_T,i,split_T,0,output_type); %gain_ratio
wolffd@0	381 if (output_type==0) %for discrete output
wolffd@0	382 cur_gain = best_gain-log2(size(v_set,2)-1)/size_t; % Quilan 96: use the gain_ratio-log2(N-1)/\|D\| as the gain of this attr
wolffd@0	383 else %for cts output
wolffd@0	384 cur_gain = best_gain;
wolffd@0	385 end
wolffd@0	386 end
wolffd@0	387
wolffd@0	388 if (max_gain < cur_gain)
wolffd@0	389 max_gain = cur_gain;
wolffd@0	390 best_attr = i;
wolffd@0	391 cur_best_threshhold=best_threshhold; %save the threshhold
wolffd@0	392 %best_split = split_T; %here we need to copy array, not good!!! So we will recalculate in below line 313
wolffd@0	393 end
wolffd@0	394 end
wolffd@0	395 end
wolffd@0	396
wolffd@0	397 % stop splitting if gain is too small
wolffd@0	398 if (max_gain==0 \| (output_type==0 & max_gain < min_gain) \| (output_type==1 & max_gain < cts_min_gain))
wolffd@0	399 if (output_type==0)
wolffd@0	400 tree.nodes(tree.num_node).probs=class_freqs/size_t; %the prob for each value of class node
wolffd@0	401 tree.nodes(tree.num_node).error.all_error=1-top1_class_cases/size_t;
wolffd@0	402 tree.nodes(tree.num_node).error.all_error_num=size_t - top1_class_cases;
wolffd@0	403 fprintf('Create leaf node(nogain) %d. Class %d Cases %d Error %d \n',tree.num_node, top1_class, size_t, size_t - top1_class_cases );
wolffd@0	404 else
wolffd@0	405 fprintf('Create leaf node(nogain) %d. Mean %8.4f Std %8.4f Cases %d \n',tree.num_node, avg_y_T, std_T, size_t);
wolffd@0	406 end
wolffd@0	407 return;
wolffd@0	408 end
wolffd@0	409
wolffd@0	410 %get the split of cases according to the best split attribute
wolffd@0	411 if (node_types(best_attr)==0) %discrete attibute
wolffd@0	412 best_split = split_cases(fam_ev,node_sizes,node_types,T,best_attr,0);
wolffd@0	413 else
wolffd@0	414 best_split = split_cases(fam_ev,node_sizes,node_types,T,best_attr,cur_best_threshhold);
wolffd@0	415 end
wolffd@0	416
wolffd@0	417 %(4) best_attr = AttributeWithBestGain;
wolffd@0	418 %(5) if best_attr is continuous ???? why need this? maybe the value in the decision tree must appeared in data
wolffd@0	419 % find threshhold in all cases that <= max_V
wolffd@0	420 % change the split of T
wolffd@0	421 tree.nodes(tree.num_node).split_id=best_attr;
wolffd@0	422 tree.nodes(tree.num_node).split_threshhold=cur_best_threshhold; %for cts attribute only
wolffd@0	423
wolffd@0	424 %note: below threshhold rejust is linera search, so it is slow. A better method is described in paper "Efficient C4.5"
wolffd@0	425 %if (output_type==0)
wolffd@0	426 if (node_types(best_attr)==1) %is a continuous attribute
wolffd@0	427 %find the value that approximate best_threshhold from below (the largest that <= best_threshhold)
wolffd@0	428 best_value=0;
wolffd@0	429 for i=1:size(fam_ev,2) %note: need to search in all cases for all tree, not just in cases for this node
wolffd@0	430 val = fam_ev(best_attr,i);
wolffd@0	431 if (val <= cur_best_threshhold & val > best_value) %val is more clear to best_threshhold
wolffd@0	432 best_value=val;
wolffd@0	433 end
wolffd@0	434 end
wolffd@0	435 tree.nodes(tree.num_node).split_threshhold=best_value; %for cts attribute only
wolffd@0	436 end
wolffd@0	437 %end
wolffd@0	438
wolffd@0	439 if (output_type == 0)
wolffd@0	440 fprintf('Create node %d split at %d gain %8.4f Th %d. Class %d Cases %d Error %d \n',tree.num_node, best_attr, max_gain, tree.nodes(tree.num_node).split_threshhold, top1_class, size_t, size_t - top1_class_cases );
wolffd@0	441 else
wolffd@0	442 fprintf('Create node %d split at %d gain %8.4f Th %d. Mean %8.4f Cases %d\n',tree.num_node, best_attr, max_gain, tree.nodes(tree.num_node).split_threshhold, avg_y_T, size_t );
wolffd@0	443 end
wolffd@0	444
wolffd@0	445 %(6) Foreach T' in the split_T
wolffd@0	446 % if T' is Empty
wolffd@0	447 % Child of node_id is a leaf
wolffd@0	448 % else
wolffd@0	449 % Child of node_id = split_tree (T')
wolffd@0	450 tree.nodes(new_node).is_leaf=0; %because this node will be split, it is not leaf now
wolffd@0	451 for i=1:size(best_split,2)
wolffd@0	452 if (size(best_split{i},2)==0) %T(i) is empty
wolffd@0	453 %create one new leaf node
wolffd@0	454 tree.num_node=tree.num_node+1;
wolffd@0	455 tree.nodes(tree.num_node).used=1; %flag this node is used (0 means node not used, it will be removed from tree at last to save memory)
wolffd@0	456 tree.nodes(tree.num_node).is_leaf=1;
wolffd@0	457 tree.nodes(tree.num_node).children=[];
wolffd@0	458 tree.nodes(tree.num_node).split_id=0;
wolffd@0	459 tree.nodes(tree.num_node).split_threshhold=0;
wolffd@0	460 if (output_type == 0)
wolffd@0	461 tree.nodes(tree.num_node).probs=zeros(1,num_cat); %the prob for each value of class node
wolffd@0	462 tree.nodes(tree.num_node).probs(top1_class)=1; %use the majority class of parent node, like for binary class,
wolffd@0	463 %and majority is class 2, then the CPT is [0 1]
wolffd@0	464 %we may need to use prior to do smoothing, to get [0.001 0.999]
wolffd@0	465 tree.nodes(tree.num_node).error.self_error=0;
wolffd@0	466 tree.nodes(tree.num_node).error.all_error=0;
wolffd@0	467 tree.nodes(tree.num_node).error.all_error_num=0;
wolffd@0	468 else
wolffd@0	469 tree.nodes(tree.num_node).mean = avg_y_T; %just use parent node's mean value
wolffd@0	470 tree.nodes(tree.num_node).std = std_T;
wolffd@0	471 end
wolffd@0	472 %add the new leaf node to parents
wolffd@0	473 num_children=size(tree.nodes(new_node).children,2);
wolffd@0	474 tree.nodes(new_node).children(num_children+1)=tree.num_node;
wolffd@0	475 if (output_type==0)
wolffd@0	476 fprintf('Create leaf node(nullset) %d. %d-th child of Father %d Class %d\n',tree.num_node, i, new_node, top1_class );
wolffd@0	477 else
wolffd@0	478 fprintf('Create leaf node(nullset) %d. %d-th child of Father %d \n',tree.num_node, i, new_node );
wolffd@0	479 end
wolffd@0	480
wolffd@0	481 else
wolffd@0	482 if (node_types(best_attr)==0) % if attr is discrete, it should be removed from the candidate set
wolffd@0	483 new_candidate_attrs = mysetdiff(candidate_attrs,[best_attr]);
wolffd@0	484 else
wolffd@0	485 new_candidate_attrs = candidate_attrs;
wolffd@0	486 end
wolffd@0	487 new_sub_node = split_dtree (CPD, fam_ev, node_sizes, node_types, stop_cases, min_gain, best_split{i}, new_candidate_attrs, num_cat);
wolffd@0	488 %tree.nodes(parent_id).error.all_error += tree.nodes(new_sub_node).error.all_error;
wolffd@0	489 fprintf('Add subtree node %d to %d. #nodes %d\n',new_sub_node,new_node, tree.num_node );
wolffd@0	490
wolffd@0	491 % tree.nodes(new_node).error.all_error_num = tree.nodes(new_node).error.all_error_num + tree.nodes(new_sub_node).error.all_error_num;
wolffd@0	492 %add the new leaf node to parents
wolffd@0	493 num_children=size(tree.nodes(new_node).children,2);
wolffd@0	494 tree.nodes(new_node).children(num_children+1)=new_sub_node;
wolffd@0	495 end
wolffd@0	496 end
wolffd@0	497
wolffd@0	498 %(7) Compute errors of N; for doing pruning
wolffd@0	499 % get the total error for the subtree
wolffd@0	500 if (output_type==0)
wolffd@0	501 tree.nodes(new_node).error.all_error=tree.nodes(new_node).error.all_error_num/size_t;
wolffd@0	502 end
wolffd@0	503 %doing pruning, but doing here is not so efficient, because it is bottom up.
wolffd@0	504 %if tree.nodes()
wolffd@0	505 %after doing pruning, need to update the all_error to self_error
wolffd@0	506
wolffd@0	507 %(8) Return N
wolffd@0	508
wolffd@0	509
wolffd@0	510
wolffd@0	511
wolffd@0	512 %(1) For discrete output, we use GainRatio defined as below
wolffd@0	513 % Gain(X,T)
wolffd@0	514 % GainRatio(X,T) = ----------
wolffd@0	515 % SplitInfo(X,T)
wolffd@0	516 % where
wolffd@0	517 % Gain(X,T) = Info(T) - Info(X,T)
wolffd@0	518 % \|Ti\|
wolffd@0	519 % Info(X,T) = Sum for i from 1 to n of ( ---- * Info(Ti))
wolffd@0	520 % \|T\|
wolffd@0	521
wolffd@0	522 % SplitInfo(D,T) is the information due to the split of T on the basis
wolffd@0	523 % of the value of the categorical attribute D. Thus SplitInfo(D,T) is
wolffd@0	524 % I(\|T1\|/\|T\|, \|T2\|/\|T\|, .., \|Tm\|/\|T\|)
wolffd@0	525 % where {T1, T2, .. Tm} is the partition of T induced by the value of D.
wolffd@0	526
wolffd@0	527 % Definition of Info(Ti)
wolffd@0	528 % If a set T of records is partitioned into disjoint exhaustive classes C1, C2, .., Ck on the basis of the
wolffd@0	529 % value of the categorical attribute, then the information needed to identify the class of an element of T
wolffd@0	530 % is Info(T) = I(P), where P is the probability distribution of the partition (C1, C2, .., Ck):
wolffd@0	531 % P = (\|C1\|/\|T\|, \|C2\|/\|T\|, ..., \|Ck\|/\|T\|)
wolffd@0	532 % Here I(P) is defined as
wolffd@0	533 % I(P) = -(p1log(p1) + p2log(p2) + .. + pn*log(pn))
wolffd@0	534 %
wolffd@0	535 %(2) For continuous output (regression tree), we use least squares score (adapted from Leo Breiman's book "Classification and regression trees", page 231
wolffd@0	536 % The original support only binary split, we further extend it to permit multiple-child split
wolffd@0	537 %
wolffd@0	538 % Delta_R = R(T) - Sum for all childe nodes Ti (R(Ti))
wolffd@0	539 % Where R(Ti)= 1/N * Sum for all cases i in node Ti ((yi - avg_y(Ti))^2)
wolffd@0	540 % here N is the number of all training cases for construct the regression tree
wolffd@0	541 % avg_y(Ti) is the average value for output variable for the cases in node Ti
wolffd@0	542
wolffd@0	543 function gain_score = compute_gain (fam_ev, node_sizes, node_types, T, info_T, attr_id, split_T, score_type, output_type)
wolffd@0	544 % COMPUTE_GAIN Compute the score for the split of cases T using attribute attr_id
wolffd@0	545 % gain_score = compute_gain (fam_ev, T, attr_id, node_size, method)
wolffd@0	546 %
wolffd@0	547 % fam_ev(i,j) is the value of attribute i in j-th training cases, the last row is for the class label (self_ev)
wolffd@0	548 % T(i) is the index of i-th cases in current decision tree node, we need split it further
wolffd@0	549 % attr_id is the index of current node considered for a split
wolffd@0	550 % split_T{i} is the i_th subset in partition of cases T according to the value of attribute attr_id
wolffd@0	551 % score_type if 0, is gain ratio, 1 is information gain (only apply to discrete output)
wolffd@0	552 % node_size(i) the node size of i-th node in the family
wolffd@0	553 % output_type: 0 means discrete output, 1 means continuous output.
wolffd@0	554 gain_score=0;
wolffd@0	555 % *********for DISCRETE output*****************************************************
wolffd@0	556 if (output_type == 0)
wolffd@0	557 % compute Info(T)
wolffd@0	558 total_cnt = size(T,2);
wolffd@0	559 if (total_cnt==0)
wolffd@0	560 return;
wolffd@0	561 end;
wolffd@0	562 %class_split_T = split_cases(fam_ev,node_sizes,node_types,T,size(fam_ev,1),0); %split cases according to class
wolffd@0	563 %info_T = compute_info (fam_ev, T, class_split_T);
wolffd@0	564
wolffd@0	565 % compute Info(X,T)
wolffd@0	566 num_class = size(split_T,2);
wolffd@0	567 subset_sizes = zeros(1,num_class);
wolffd@0	568 info_ti = zeros(1,num_class);
wolffd@0	569 for i=1:num_class
wolffd@0	570 subset_sizes(i)=size(split_T{i},2);
wolffd@0	571 if (subset_sizes(i)~=0)
wolffd@0	572 class_split_Ti = split_cases(fam_ev,node_sizes,node_types,split_T{i},size(fam_ev,1),0); %split cases according to class
wolffd@0	573 info_ti(i) = compute_info(fam_ev, split_T{i}, class_split_Ti);
wolffd@0	574 end
wolffd@0	575 end
wolffd@0	576 ti_ratios = subset_sizes/total_cnt; %get the \|Ti\|/\|T\|
wolffd@0	577 info_X_T = sum(ti_ratios.*info_ti);
wolffd@0	578
wolffd@0	579 %get Gain(X,T)
wolffd@0	580 gain_X_T = info_T - info_X_T;
wolffd@0	581
wolffd@0	582 if (score_type == 1) %information gain
wolffd@0	583 gain_score=gain_X_T;
wolffd@0	584 return;
wolffd@0	585 end
wolffd@0	586 %compute the SplitInfo(X,T) //is this also for cts attr, only split into two subsets
wolffd@0	587 splitinfo_T = compute_info (fam_ev, T, split_T);
wolffd@0	588 if (splitinfo_T~=0)
wolffd@0	589 gain_score = gain_X_T/splitinfo_T;
wolffd@0	590 end
wolffd@0	591
wolffd@0	592 % **********for continuous output************************************************
wolffd@0	593 else
wolffd@0	594 N = size(fam_ev,2);
wolffd@0	595
wolffd@0	596 % compute R(Ti)
wolffd@0	597 num_class = size(split_T,2);
wolffd@0	598 R_Ti = zeros(1,num_class);
wolffd@0	599 for i=1:num_class
wolffd@0	600 if (size(split_T{i},2)~=0)
wolffd@0	601 cases_T = fam_ev(size(fam_ev,1),split_T{i});
wolffd@0	602 avg_y_T = mean(cases_T);
wolffd@0	603 sqr_T = cases_T - avg_y_T;
wolffd@0	604 R_Ti(i) = sum(sqr_T.sqr_T)/N; % get R(Ti) = 1/N SUM(y-avg_y)^2
wolffd@0	605 end
wolffd@0	606 end
wolffd@0	607 %delta_R = R(T) - SUM(R(Ti))
wolffd@0	608 gain_score = info_T - sum(R_Ti);
wolffd@0	609
wolffd@0	610 end
wolffd@0	611
wolffd@0	612
wolffd@0	613 % Definition of Info(Ti)
wolffd@0	614 % If a set T of records is partitioned into disjoint exhaustive classes C1, C2, .., Ck on the basis of the
wolffd@0	615 % value of the categorical attribute, then the information needed to identify the class of an element of T
wolffd@0	616 % is Info(T) = I(P), where P is the probability distribution of the partition (C1, C2, .., Ck):
wolffd@0	617 % P = (\|C1\|/\|T\|, \|C2\|/\|T\|, ..., \|Ck\|/\|T\|)
wolffd@0	618 % Here I(P) is defined as
wolffd@0	619 % I(P) = -(p1log(p1) + p2log(p2) + .. + pn*log(pn))
wolffd@0	620 function info = compute_info (fam_ev, T, split_T)
wolffd@0	621 % COMPUTE_INFO compute the information for the split of T into split_T
wolffd@0	622 % info = compute_info (fam_ev, T, split_T)
wolffd@0	623
wolffd@0	624 total_cnt = size(T,2);
wolffd@0	625 num_class = size(split_T,2);
wolffd@0	626 subset_sizes = zeros(1,num_class);
wolffd@0	627 probs = zeros(1,num_class);
wolffd@0	628 log_probs = zeros(1,num_class);
wolffd@0	629 for i=1:num_class
wolffd@0	630 subset_sizes(i)=size(split_T{i},2);
wolffd@0	631 end
wolffd@0	632
wolffd@0	633 probs = subset_sizes/total_cnt;
wolffd@0	634 %log_probs = log2(probs); % if probs(i)=0, the log2(probs(i)) will be Inf
wolffd@0	635 for i=1:size(probs,2)
wolffd@0	636 if (probs(i)~=0)
wolffd@0	637 log_probs(i)=log2(probs(i));
wolffd@0	638 end
wolffd@0	639 end
wolffd@0	640
wolffd@0	641 info = sum(-(probs.*log_probs));
wolffd@0	642

Mercurial > hg > camir-aes2014

annotate toolboxes/FullBNT-1.0.7/bnt/CPDs/@tree_CPD/learn_params.m @ 0:e9a9cd732c1e tip