function [W, Xi, Diagnostics] = mlr_solver(C, Margins, W, K)
% [W, Xi, D] = mlr_solver(C, Margins, W, X)
%
% Projected (sub)gradient solver for the 1-slack metric learning
% objective:   min_W  REG(W, X)  +  C * max_r max(0, XiR(r))
% where each XiR(r) is the slack of constraint r (direction PsiR{r},
% target margin Margins(r)).
%
% C >= 0 Slack trade-off parameter
% Margins = array of mean margin values, one per active constraint
% W = initial value for W (warm start)
% X = data matrix (or kernel)
%
% W (output) = the learned metric
% Xi = 1-slack (violation of the most-active constraint, clamped at 0)
% D = diagnostics struct: objective trace 'f', 'num_steps',
% and 'stop_criteria' ('MAXSTEPS' | 'CONVERGENCE' | 'BACKTRACK')
%
% Relies on globals set by the caller:
% REG(W,X,grad) / LOSS(W,Psi,m,grad) — objective terms; the final
% flag selects value (0) vs. gradient (1), as used below.
% FEASIBLE(W) — projection onto the feasible (PSD) set.
% PsiR — cell array of constraint gradient directions.
% PsiClock — staleness counters; reset to 0 for the constraint
% that is active this iteration.
% NOTE(review): dbprint is assumed to be a project-level leveled
% debug printer — confirm against the rest of the package.

global DEBUG REG FEASIBLE LOSS;

%%%
% Initialize the gradient directions for each constraint
%
global PsiR;
global PsiClock;

numConstraints = length(PsiR);

%%%
% Some optimization details

% Armijo rule number (sufficient-decrease tolerance)
armijo = 1e-5;

% Initial learning rate
lambda0 = 1e-6;

% Increase/decrease after each iteration
% (golden-ratio-derived factors: cube root of phi up, 1/phi down)
lambdaup = ((1+sqrt(5))/2)^(1/3);
lambdadown = ((1+sqrt(5))/2)^(-1);

% Maximum steps to take
maxsteps = 1e4;

% Size of convergence window (compare f against its value `frame` steps ago)
frame = 10;

% Convergence threshold (on log-ratio of objective over the window)
convthresh = 1e-5;

% Maximum number of backtracks before giving up at the current iterate
maxbackcount = 100;


Diagnostics = struct( 'f', [], ...
                      'num_steps', [], ...
                      'stop_criteria', []);

% Repeat until convergence:
%   1) Calculate f
%   2) Take a gradient step
%   3) Project W back onto PSD

%%%
% Initialze
%

f = inf;
dfdW = zeros(size(W));          % current descent direction (zero until first accepted step)
lambda = lambda0;
F = Inf * ones(1,maxsteps+1);   % objective history, F(k+1) = f at accepted step k
XiR = zeros(numConstraints,1);  % per-constraint slack values


stepcount = -1;                 % accepted-step counter (first acceptance brings it to 0)
backcount = 0;                  % consecutive-backtrack counter
done = 0;


while 1
    % Remember the previous iterate so a failed step can be undone.
    fold = f;
    Wold = W;

    %%%
    % Count constraint violations and build the gradient
    dbprint(3, 'Computing gradient');

    %%%
    % Calculate constraint violations
    %
    XiR(:) = 0;
    for R = numConstraints:-1:1
        XiR(R) = LOSS(W, PsiR{R}, Margins(R), 0);
    end

    %%%
    % Find the most active constraint (largest slack), clamp at 0
    %
    [Xi, mgrad] = max(XiR);
    Xi = max(Xi, 0);

    % Mark the active constraint as freshly used (resets its staleness clock).
    PsiClock(mgrad) = 0;

    %%%
    % Evaluate f
    %

    f = C * max(Xi, 0) ...
        + REG(W, K, 0);

    %%%
    % Test for convergence
    %
    objDiff = fold - f;

    % Armijo-style acceptance: require sufficient decrease relative to
    % the step size and squared gradient norm.
    if objDiff > armijo * lambda * (dfdW(:)' * dfdW(:))

        stepcount = stepcount + 1;

        F(stepcount+1) = f;

        % Windowed convergence measure: log of the objective ratio over
        % the last `frame` accepted steps (inf until the window fills).
        sdiff = inf;
        if stepcount >= frame;
            sdiff = log(F(stepcount+1-frame) / f);
        end

        if stepcount >= maxsteps
            done = 1;
            stopcriteria = 'MAXSTEPS';
        elseif sdiff <= convthresh
            done = 1;
            stopcriteria = 'CONVERGENCE';
        else
            %%%
            % If it's positive, add the corresponding gradient
            % (refresh the descent direction from the currently
            % most-active constraint plus the regularizer gradient)
            dfdW = C * LOSS(W, PsiR{mgrad}, Margins(mgrad), 1) ...
                + REG(W, K, 1);
        end

        % Step accepted: grow the learning rate and reset backtracking.
        dbprint(3, 'Lambda up!');
        Wold = W;
        lambda = lambdaup * lambda;
        backcount = 0;

    else
        % Backtracking time, drop the learning rate
        if backcount >= maxbackcount
            % Too many consecutive backtracks: restore the last good
            % iterate and stop.
            W = Wold;
            f = fold;
            done = 1;

            stopcriteria = 'BACKTRACK';
        else
            dbprint(3, 'Lambda down!');
            lambda = lambdadown * lambda;
            backcount = backcount+1;
        end
    end

    %%%
    % Take a gradient step
    %
    W = W - lambda * dfdW;

    %%%
    % Project back onto the feasible set
    %

    dbprint(3, 'Projecting onto feasible set');
    W = FEASIBLE(W);
    if done
        break;
    end;

end

% F(1) is the Inf sentinel; report only the accepted-step objectives.
Diagnostics.f = F(2:(stepcount+1))';
Diagnostics.stop_criteria = stopcriteria;
Diagnostics.num_steps = stepcount;

dbprint(2, '\t%s after %d steps.\n', stopcriteria, stepcount);
end