function [W, Xi, Diagnostics] = mlr_solver(C, Margins, W, K)
% [W, Xi, D] = mlr_solver(C, Margins, W, K)
%
%   C       >= 0    Slack trade-off parameter
%   Margins =       array of mean margin values
%   W       =       initial value for W
%   K       =       data matrix (or kernel)
%
%   W (output)  =   the learned metric
%   Xi          =   1-slack
%   D           =   diagnostics
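%
% Usage sketch (illustrative; not part of the original help text): the caller
% is expected to populate the globals REG, LOSS, FEASIBLE, PsiR and PsiClock
% before invoking this solver.  The handle and variable names below are
% hypothetical; only the call signatures are taken from the code in this file.
%
%   global REG LOSS FEASIBLE PsiR PsiClock;
%   REG      = @my_regularizer;   % REG(W, K, gradFlag): value (0) or gradient (1)
%   LOSS     = @my_loss;          % LOSS(W, Psi, margin, gradFlag)
%   FEASIBLE = @my_projection;    % FEASIBLE(W): project W onto the feasible (PSD) set
%   PsiR     = {Psi1, Psi2};      % cached constraint directions
%   PsiClock = zeros(1, length(PsiR));
%
%   [W, Xi, D] = mlr_solver(1.0, Margins, W0, K);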

global DEBUG REG FEASIBLE LOSS;

%%%
% Initialize the gradient directions for each constraint
%
global PsiR;
global PsiClock;

numConstraints = length(PsiR);

%%%
% Some optimization details

% Armijo rule number
armijo = 1e-5;

% Initial learning rate
lambda0 = 1e-4;

% Increase/decrease after each iteration
lambdaup   = ((1+sqrt(5))/2)^(1/3);
lambdadown = ((1+sqrt(5))/2)^(-1);
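% (Both factors are powers of the golden ratio (1+sqrt(5))/2: the step size
% grows by phi^(1/3) after an accepted step and shrinks by phi^(-1) when
% backtracking.)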

% Maximum steps to take
maxsteps = 1e4;

% Size of convergence window
frame = 10;

% Convergence threshold
convthresh = 1e-5;

% Maximum number of backtracks
maxbackcount = 100;


Diagnostics = struct(   'f',             [], ...
                        'num_steps',     [], ...
                        'stop_criteria', []);

% Repeat until convergence:
%   1) Calculate f
%   2) Take a gradient step
%   3) Project W back onto PSD

%%%
% Initialize
%

f      = inf;
dfdW   = zeros(size(W));
lambda = lambda0;
F      = Inf * ones(1, maxsteps+1);
XiR    = zeros(numConstraints, 1);


stepcount = -1;
backcount = 0;
done      = 0;


while 1
    fold = f;
    Wold = W;

    %%%
    % Count constraint violations and build the gradient
    dbprint(3, 'Computing gradient');

    %%%
    % Calculate constraint violations
    %
    XiR(:) = 0;
    for R = numConstraints:-1:1
        XiR(R) = LOSS(W, PsiR{R}, Margins(R), 0);
    end

    %%%
    % Find the most active constraint
    %
    [Xi, mgrad] = max(XiR);
    Xi = max(Xi, 0);

    PsiClock(mgrad) = 0;

    %%%
    % Evaluate f
    %

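    % Current objective: hinge slack on the most violated constraint plus the
    % regularizer, i.e. f(W) = C * max(Xi, 0) + REG(W, K, 0).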
    f = C * max(Xi, 0) ...
        + REG(W, K, 0);

    %%%
    % Test for convergence
    %
    objDiff = fold - f;

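    % Accept the step only if the objective decreased by at least
    % armijo * lambda * ||dfdW||^2 (an Armijo-style sufficient-decrease test);
    % otherwise backtrack and shrink the learning rate.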
    if objDiff > armijo * lambda * (dfdW(:)' * dfdW(:))

        stepcount = stepcount + 1;

        F(stepcount+1) = f;

        sdiff = inf;
        if stepcount >= frame
            sdiff = log(F(stepcount+1-frame) / f);
        end

        if stepcount >= maxsteps
            done = 1;
            stopcriteria = 'MAXSTEPS';
        elseif sdiff <= convthresh
            done = 1;
            stopcriteria = 'CONVERGENCE';
        else
            %%%
            % Not yet converged: recompute the gradient from the most
            % violated constraint
            dfdW = C * LOSS(W, PsiR{mgrad}, Margins(mgrad), 1) ...
                 + REG(W, K, 1);
        end

        dbprint(3, 'Lambda up!');
        Wold = W;
        lambda = lambdaup * lambda;
        backcount = 0;

    else
        % Backtracking time, drop the learning rate
        if backcount >= maxbackcount
            W = Wold;
            f = fold;
            done = 1;

            stopcriteria = 'BACKTRACK';
        else
            dbprint(3, 'Lambda down!');
            lambda = lambdadown * lambda;
            backcount = backcount + 1;
        end
    end

    %%%
    % Take a gradient step
    %
    W = W - lambda * dfdW;

    %%%
    % Project back onto the feasible set
    %

    dbprint(3, 'Projecting onto feasible set');
    W = FEASIBLE(W);

    if done
        break;
    end

end

Diagnostics.f             = F(2:(stepcount+1))';
Diagnostics.stop_criteria = stopcriteria;
Diagnostics.num_steps     = stepcount;

dbprint(1, '\t%s after %d steps.\n', stopcriteria, stepcount);
end