function [beta,post,lli] = logistK(x,y,w,beta)
% [beta,post,lli] = logistK(x,y,w,beta)
%
% k-class logistic regression with optional sample weights
%
% k = number of classes
% n = number of samples
% d = dimensionality of samples
%
% INPUT
%   x       dxn matrix of n input column vectors
%   y       kxn vector of class assignments
%   [w]     1xn vector of sample weights
%   [beta]  dxk matrix of model coefficients
%
% OUTPUT
%   beta    dxk matrix of fitted model coefficients
%           (beta(:,k) are fixed at 0)
%   post    kxn matrix of fitted class posteriors
%   lli     log likelihood
%
% Let p(i,j) = exp(beta(:,j)'*x(:,i)),
% Class j posterior for observation i is:
%   post(j,i) = p(i,j) / (p(i,1) + ... p(i,k))
%
% See also logistK_eval.
%
% David Martin
% May 3, 2002

% Copyright (C) 2002 David R. Martin
%
% This program is free software; you can redistribute it and/or
% modify it under the terms of the GNU General Public License as
% published by the Free Software Foundation; either version 2 of the
% License, or (at your option) any later version.
%
% This program is distributed in the hope that it will be useful, but
% WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
% General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program; if not, write to the Free Software
% Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
% 02111-1307, USA, or see http://www.gnu.org/copyleft/gpl.html.

% TODO - this code would be faster if x were transposed

error(nargchk(2,4,nargin));

debug = 0;
if debug>0,
  h=figure(1);
  set(h,'DoubleBuffer','on');
end

% get sizes
[d,nx] = size(x);
[k,ny] = size(y);

% check sizes
if k < 2,
  error('Input y must encode at least 2 classes.');
end
if nx ~= ny,
  error('Inputs x,y not the same length.');
end

n = nx;

% make sure class assignments have unit L1-norm
% BUG FIX: the original tested `if abs(1-sumy) > eps` on the 1xn vector
% sumy; a vector `if` in MATLAB is true only when ALL elements are
% nonzero, so normalization only ran when every sample deviated.
% `any` triggers it when at least one column needs renormalizing.
sumy = sum(y,1);
if any(abs(1-sumy) > eps),
  for i = 1:k, y(i,:) = y(i,:) ./ sumy; end
end
clear sumy;

% if sample weights weren't specified, set them to 1
if nargin < 3,
  w = ones(1,n);
end

% normalize sample weights so max is 1
w = w / max(w);

% if starting beta wasn't specified, initialize randomly
if nargin < 4,
  beta = 1e-3*rand(d,k);
  beta(:,k) = 0;  % fix beta for class k at zero
else
  if sum(beta(:,k)) ~= 0,
    error('beta(:,k) ~= 0');
  end
end

stepsize = 1;
minstepsize = 1e-2;

post = computePost(beta,x);
lli = computeLogLik(post,y,w);

for iter = 1:100,
  %disp(sprintf('  logist iter=%d lli=%g',iter,lli));
  vis(x,y,beta,lli,d,k,iter,debug);

  % gradient and hessian
  [g,h] = derivs(post,x,y,w);

  % make sure Hessian is well conditioned
  if rcond(h) < eps,
    % condition with Levenberg-Marquardt method: scale up the diagonal
    % until the Hessian becomes numerically invertible
    for i = -16:16,
      h2 = h .* ((1 + 10^i)*eye(size(h)) + (1-eye(size(h))));
      if rcond(h2) > eps, break, end
    end
    if rcond(h2) < eps,
      warning(['Stopped at iteration ' num2str(iter) ...
               ' because Hessian can''t be conditioned']);
      break
    end
    h = h2;
  end

  % save lli before update
  lli_prev = lli;

  % Newton-Raphson with step-size halving
  while stepsize >= minstepsize,
    % Newton-Raphson update step
    step = stepsize * (h \ g);
    beta2 = beta;
    beta2(:,1:k-1) = beta2(:,1:k-1) - reshape(step,d,k-1);

    % get the new log likelihood
    post2 = computePost(beta2,x);
    lli2 = computeLogLik(post2,y,w);

    % if the log likelihood increased, then stop
    if lli2 > lli,
      post = post2; lli = lli2; beta = beta2;
      break
    end

    % otherwise, reduce step size by half
    stepsize = 0.5 * stepsize;
  end

  % stop if the average log likelihood has gotten small enough
  if 1-exp(lli/n) < 1e-2, break, end

  % stop if the log likelihood changed by a small enough fraction
  dlli = (lli_prev-lli) / lli;
  if abs(dlli) < 1e-3, break, end

  % stop if the step size has gotten too small
  % BUG FIX: the original read `brea`, an undefined identifier that
  % raised an error the first time step halving exhausted the step size
  if stepsize < minstepsize, break, end

  % stop if the log likelihood has decreased; this shouldn't happen
  if lli < lli_prev,
    warning(['Stopped at iteration ' num2str(iter) ...
             ' because the log likelihood decreased from ' ...
             num2str(lli_prev) ' to ' num2str(lli) '.' ...
             ' This may be a bug.']);
    break
  end
end

if debug>0,
  vis(x,y,beta,lli,d,k,iter,2);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% class posteriors
% post(j,i) = exp(beta(:,j)'*x(:,i)) / sum_m exp(beta(:,m)'*x(:,i)),
% computed via differences of the linear scores for numerical stability
% (avoids overflowing exp for large scores).
function post = computePost(beta,x)
[d,n] = size(x);
[d,k] = size(beta);
post = zeros(k,n);
bx = zeros(k,n);
for j = 1:k,
  bx(j,:) = beta(:,j)'*x;
end
for j = 1:k,
  post(j,:) = 1 ./ sum(exp(bx - repmat(bx(j,:),k,1)),1);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% log likelihood
% Weighted cross-entropy; eps guards log(0) for vanishing posteriors.
function lli = computeLogLik(post,y,w)
[k,n] = size(post);
lli = 0;
for j = 1:k,
  lli = lli + sum(w.*y(j,:).*log(post(j,:)+eps));
end
if isnan(lli),
  error('lli is nan');
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% gradient and hessian
%% These are computed in what seems a verbose manner, but it is
%% done this way to use minimal memory.  x should be transposed
%% to make it faster.
% g is the d*(k-1)x1 stacked gradient of the log likelihood w.r.t.
% beta(:,1:k-1); h is the corresponding d*(k-1) square Hessian.
function [g,h] = derivs(post,x,y,w)

[k,n] = size(post);
[d,n] = size(x);

% first derivative of likelihood w.r.t. beta
g = zeros(d,k-1);
for j = 1:k-1,
  wyp = w .* (y(j,:) - post(j,:));
  for ii = 1:d,
    g(ii,j) = x(ii,:) * wyp';
  end
end
g = reshape(g,d*(k-1),1);

% hessian of likelihood w.r.t. beta
h = zeros(d*(k-1),d*(k-1));
for i = 1:k-1,  % diagonal blocks: -sum_n w p_i (1-p_i) x x'
  wt = w .* post(i,:) .* (1 - post(i,:));
  hii = zeros(d,d);
  for a = 1:d,
    wxa = wt .* x(a,:);
    for b = a:d,
      hii_ab = wxa * x(b,:)';
      hii(a,b) = hii_ab;
      hii(b,a) = hii_ab;
    end
  end
  h( (i-1)*d+1 : i*d , (i-1)*d+1 : i*d ) = -hii;
end
for i = 1:k-1,  % off-diagonal blocks: sum_n w p_i p_j x x'
  for j = i+1:k-1,
    wt = w .* post(j,:) .* post(i,:);
    hij = zeros(d,d);
    for a = 1:d,
      wxa = wt .* x(a,:);
      for b = a:d,
        hij_ab = wxa * x(b,:)';
        hij(a,b) = hij_ab;
        hij(b,a) = hij_ab;
      end
    end
    h( (i-1)*d+1 : i*d , (j-1)*d+1 : j*d ) = hij;
    h( (j-1)*d+1 : j*d , (i-1)*d+1 : i*d ) = hij;
  end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% debug/visualization
% debug<=0: no-op; debug==1: print progress; debug>=2: additionally plot
% the decision regions (only for d==3 homogeneous 2-D data, k<=10).
function vis (x,y,beta,lli,d,k,iter,debug)

if debug<=0, return, end

disp(['iter=' num2str(iter) ' lli=' num2str(lli)]);
if debug<=1, return, end

if d~=3 | k>10, return, end

figure(1);
res = 100;
r = abs(max(max(x)));
dom = linspace(-r,r,res);
[px,py] = meshgrid(dom,dom);
xx = px(:); yy = py(:);
points = [xx' ; yy' ; ones(1,res*res)];
func = zeros(k,res*res);
for j = 1:k,
  func(j,:) = exp(beta(:,j)'*points);
end
[mval,ind] = max(func,[],1);
hold off;
im = reshape(ind,res,res);
imagesc(xx,yy,im);
hold on;
syms = {'w.' 'wx' 'w+' 'wo' 'w*' 'ws' 'wd' 'wv' 'w^' 'w<'};
for j = 1:k,
  [mval,ind] = max(y,[],1);
  ind = find(ind==j);
  plot(x(1,ind),x(2,ind),syms{j});
end
pause(0.1);

% eof