diff toolboxes/FullBNT-1.0.7/bnt/CPDs/@softmax_CPD/softmax_CPD.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/toolboxes/FullBNT-1.0.7/bnt/CPDs/@softmax_CPD/softmax_CPD.m	Tue Feb 10 15:05:51 2015 +0000
@@ -0,0 +1,187 @@
+function CPD = softmax_CPD(bnet, self, varargin)
+% SOFTMAX_CPD Make a softmax (multinomial logit) CPD
+%
+% To define this CPD precisely, let W be an (m x n) weight matrix with rows W(1,:),...,W(m,:)
+% => we can define the following vector-valued function:
+%    
+%                                   softmax: R^n |--> R^m  
+%                  softmax(z,i-th) = exp(W(i,:)*z) / sum_k(exp(W(k,:)*z))      
+%
+% (this constructor prepends a 1 to z to introduce an offset term (= bias, intercept)).
+% Now call the continuous (cts) and always observed (obs) parents X,
+% the discrete parents (if any) Q, and this node Y. The discrete parent(s) are used just to index
+% the parameter matrices (c.f., conditional Gaussian nodes); that is:
+%                 prob(Y=i | X=x, Q=j) = softmax(x, i-th | j)
+% where '|j' means that we are using the j-th (m x n) parameter matrix W(:,:,j).
+% If there are no discrete parents, this is a regular softmax node.
+% If Y is binary, this is a logistic (sigmoid) function.
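+%
+% For example (a small numeric check of the definition above): with m = n = 2,
+% W = [1 0; 0 1] and z = [2 1]', softmax(z,1st) = exp(2)/(exp(2)+exp(1)) ~= 0.73
+% and softmax(z,2nd) = exp(1)/(exp(2)+exp(1)) ~= 0.27.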
+%
+% CPD = softmax_CPD(bnet, node_num, ...) will create a softmax CPD with random parameters,
+% where node_num is the number of a node in this equivalence class.
+%
+% The following optional arguments can be specified in the form of name/value pairs:
+% [default value in brackets]
+% (Let ns(i) be the size of node i, and abbreviate X = ns(X), Y = ns(Y), Q1 = ns(dps(1)), Q2 = ns(dps(2)), ...,
+% where dps are the discrete parents; if there are no discrete parents, we set Q1 = 1.)
+%
+% discrete - the discrete parents that we want to treat like the cts ones [ [] ]. 
+%            This can be used to define a sigmoid belief network - see the references below.             
+%            For example, suppose that Y has one cts parent X and two discrete ones, Q and C1, where:
+%            -> Q is binary (1/2) and used just to index the parameters of 'self'
+%            -> C1 is ternary (1/2/3) and treated as a cts node <=> its values enter the linear 
+%               part of the softmax function
+%            then:
+%                     prob(Y | X=x, Q=q, C1=c1) = softmax(W(:,:,q)' * y)
+%            where y = [1 | delta(C1,1) delta(C1,2) delta(C1,3) | x(:)']' and delta(Y,a) = indicator(Y=a)
+%            (see the second example below).
+% weights - (w(:,j,a,b,...) - w(:,j',a,b,...)) is proportional to the decision boundary
+%           between classes j,j' given Q1=a,Q2=b,... [ randn(X,Y,Q1,Q2,...) ]
+% offset  - (b(j,a,b,...) - b(j',a,b,...)) is the offset to the decision boundary
+%           between classes j,j' given Q1=a,Q2=b,... [ randn(Y,Q1,Q2,...) ]
+%
+% e.g., CPD = softmax_CPD(bnet, i, 'offset', zeros(ns(i),1));
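+% e.g., CPD = softmax_CPD(bnet, i, 'discrete', [Q]), where Q is the (hypothetical)
+%       number of a discrete parent of i to be treated as cts, as described above;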
+%
+% The following fields control the behavior of the M step, which uses 
+% a weighted version of Iteratively Reweighted Least Squares (WIRLS) if dps_as_cps=[], or
+% a weighted scaled conjugate gradient (SCG) otherwise, both as implemented in Netlab and modified by Pierpaolo Brutti.
+%
+% clamped     - 'yes' means don't adjust params during learning ['no']
+% max_iter    - the maximum number of steps to take [10]
+% verbose     - 'yes' means print the LL at each step of IRLS ['no']
+% wthresh     - convergence threshold for weights [1e-2]
+% llthresh    - convergence threshold for log likelihood [1e-2]
+% approx_hess - 'yes' means approximate the Hessian for speed ['no']
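+%
+% e.g., CPD = softmax_CPD(bnet, i, 'max_iter', 50, 'verbose', 'yes');   % a sketch of the
+%       learning options above; 50 iterations is an illustrative value, not a recommendation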
+%
+% For backwards compatibility with BNT2, you can also specify the parameters in the following order
+%   softmax_CPD(bnet, self, w, b, clamped, max_iter, verbose, wthresh, llthresh, approx_hess)
+%
+% REFERENCE
+% For details on sigmoid belief nets, see:
+% - Neal (1992). Connectionist learning of belief networks. Artificial Intelligence, 56, 71-113.
+% - Saul, Jaakkola, Jordan (1996). Mean field theory for sigmoid belief networks. Journal of Artificial Intelligence Research, 4, pp. 61-76.
+%
+% For details on the M step, see:
+% - K. Chen, L. Xu, H. Chi (1999). Improved learning algorithms for mixtures of experts in multiclass 
+%       classification. Neural Networks 12, pp. 1229-1252.
+% - M.I. Jordan, R.A. Jacobs (1994). Hierarchical Mixtures of Experts and the EM algorithm. 
+%       Neural Computation 6, pp. 181-214.
+% - S.R. Waterhouse, A.J. Robinson (1994). Classification Using Hierarchical Mixtures of Experts. In Proc. IEEE
+%       Workshop on Neural Networks for Signal Processing IV, pp. 177-186.
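+%
+% EXAMPLE
+% A minimal usage sketch (assumes BNT is on the matlab path; the node numbering
+% and sizes below are illustrative only): a cts node X of size 3 with a binary
+% discrete child Y:
+%   dag = zeros(2); dag(1,2) = 1;
+%   ns  = [3 2];
+%   bnet = mk_bnet(dag, ns, 'discrete', 2);  % node 2 (Y) is discrete
+%   bnet.CPD{1} = gaussian_CPD(bnet, 1);
+%   bnet.CPD{2} = softmax_CPD(bnet, 2, 'max_iter', 20);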
+
+if nargin==0
+  % This occurs if we are trying to load an object from a file.
+  CPD = init_fields;
+  CPD = class(CPD, 'softmax_CPD', discrete_CPD(0, []));
+  return;
+elseif isa(bnet, 'softmax_CPD')
+  % This might occur if we are copying an object.
+  CPD = bnet;
+  return;
+end
+CPD = init_fields;
+
+assert(myismember(self, bnet.dnodes));
+ns = bnet.node_sizes;
+ps = parents(bnet.dag, self);
+dps = myintersect(ps, bnet.dnodes);
+cps = myintersect(ps, bnet.cnodes);
+
+clamped = 0;
+CPD = class(CPD, 'softmax_CPD', discrete_CPD(clamped, ns([ps self])));
+
+dps_as_cpssz = 0;
+dps_as_cps = [];
+% determine if any discrete parents are to be treated as cts
+if nargin >= 3 & isstr(varargin{1}) % might have passed in 'discrete'
+  for i=1:2:length(varargin)
+    if strcmp(varargin{i}, 'discrete')
+      dps_as_cps = varargin{i+1};
+      assert(myismember(dps_as_cps, dps));
+      dps = mysetdiff(dps, dps_as_cps);         % remove the dps treated as cts
+      CPD.dps_as_cps.ndx = find_equiv_posns(dps_as_cps, ps);
+      CPD.dps_as_cps.separator = [0 cumsum(ns(dps_as_cps(1:end-1)))]; % offsets of the concatenated dps_as_cps dimensions
+      dps_as_cpssz = sum(ns(dps_as_cps));
+      break;
+    end
+  end
+end
+assert(~isempty(union(cps, dps_as_cps)));   % there must be at least one cts or dps_as_cps parent
+self_size = ns(self); 
+cpsz = sum(ns(cps));  
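+% number of joint configurations of the 'pure' discrete parents (one GLM each)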
+glimsz = prod(ns(dps));
+CPD.dpndx = find_equiv_posns(dps, ps);  % contains only the indices of the 'pure' dps
+CPD.cpndx = find_equiv_posns(cps, ps);
+
+CPD.self  = self;
+CPD.solo  = (length(ns)<=2);
+CPD.sizes = bnet.node_sizes([ps self]);
+
+% set default params
+CPD.max_iter = 10;
+CPD.verbose = 0;
+CPD.wthresh = 1e-2;
+CPD.llthresh = 1e-2;
+CPD.approx_hess = 0;
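+% one Netlab GLM with softmax outputs per discrete-parent configuration; its
+% inputs are the cts parents plus the 1-of-K encoded dps_as_cps parents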
+CPD.glim = cell(1,glimsz);
+for i=1:glimsz
+  CPD.glim{i} = glm(dps_as_cpssz + cpsz, self_size, 'softmax');
+end
+
+if nargin >= 3
+  args = varargin;
+  nargs = length(args);
+  if ~isstr(args{1})
+    %   softmax_CPD(bnet, self, w, b, clamped, max_iter, verbose, wthresh, llthresh, approx_hess)
+    if nargs >= 1 & ~isempty(args{1}), CPD = set_fields(CPD, 'weights', args{1}); end
+    if nargs >= 2 & ~isempty(args{2}), CPD = set_fields(CPD, 'offset', args{2});  end
+    if nargs >= 3 & ~isempty(args{3}), CPD = set_clamped(CPD, args{3});           end
+    if nargs >= 4 & ~isempty(args{4}), CPD.max_iter    = args{4}; end
+    if nargs >= 5 & ~isempty(args{5}), CPD.verbose     = args{5}; end
+    if nargs >= 6 & ~isempty(args{6}), CPD.wthresh     = args{6}; end
+    if nargs >= 7 & ~isempty(args{7}), CPD.llthresh    = args{7}; end
+    if nargs >= 8 & ~isempty(args{8}), CPD.approx_hess = args{8}; end
+  else
+    CPD = set_fields(CPD, args{:});
+  end
+end
+
+% sufficient statistics
+% Since the softmax CPD is not in the exponential family, we must store all the raw data.
+CPD.parent_vals = [];         % X(l,:) = value of cts parents in l'th example
+CPD.self_vals = [];           % Y(l,:) = value of self in l'th example
+
+CPD.eso_weights=[];           % weights used by the WIRLS algorithm
+
+% For BIC
+CPD.nsamples = 0;   
+if ~adjustable_CPD(CPD)
+   CPD.nparams = 0;
+else
+   [W, b] = extract_params(CPD);
+   CPD.nparams = numel(W) + numel(b);
+end
+
+%%%%%%%%%%%
+
+function CPD = init_fields()
+% This ensures we define the fields in the same order 
+% no matter whether we load an object from a file,
+% or create it from scratch. (Matlab requires this.)
+
+CPD.glim = {};
+CPD.self = [];
+CPD.solo = [];
+CPD.max_iter = [];
+CPD.verbose = [];
+CPD.wthresh = [];
+CPD.llthresh = [];
+CPD.approx_hess = [];
+CPD.sizes = [];
+CPD.parent_vals = [];
+CPD.eso_weights=[];
+CPD.self_vals = [];
+CPD.nsamples = [];
+CPD.nparams = [];
+CPD.dpndx = [];
+CPD.cpndx = [];
+CPD.dps_as_cps.ndx = [];
+CPD.dps_as_cps.separator = [];