Mercurial repository: camir-aes2014
File: toolboxes/SVM-light/src/svm_learn.c @ changeset 0:e9a9cd732c1e (tip)
Commit message: first hg version after svn
Author: wolffd
Date: Tue, 10 Feb 2015 15:05:51 +0000
/***********************************************************************/
/*                                                                     */
/*   svm_learn.c                                                       */
/*                                                                     */
/*   Learning module of Support Vector Machine.                       */
/*                                                                     */
/*   Author: Thorsten Joachims                                         */
/*   Date: 02.07.02                                                    */
/*                                                                     */
/*   Copyright (c) 2002  Thorsten Joachims - All rights reserved       */
/*                                                                     */
/*   This software is available for non-commercial use only. It must   */
/*   not be modified and distributed without prior permission of the   */
/*   author. The author is not responsible for implications from the   */
/*   use of this software.                                             */
/*                                                                     */
/***********************************************************************/


# include "svm_common.h"
# include "svm_learn.h"


/* interface to QP-solver */
double *optimize_qp(QP *, double *, long, double *, LEARN_PARM *);

/*---------------------------------------------------------------------------*/

/* Learns an SVM classification model based on the training data in
   docs/label. The resulting model is returned in the structure
   model. */

void svm_learn_classification(DOC **docs, double *class, long int
                              totdoc, long int totwords,
                              LEARN_PARM *learn_parm,
                              KERNEL_PARM *kernel_parm,
                              KERNEL_CACHE *kernel_cache,
                              MODEL *model,
                              double *alpha)
     /* docs:         Training vectors (x-part) */
     /* class:        Training labels (y-part, zero if test example for
                      transduction) */
     /* totdoc:       Number of examples in docs/label */
     /* totwords:     Number of features (i.e. highest feature index) */
     /* learn_parm:   Learning parameters */
     /* kernel_parm:  Kernel parameters */
     /* kernel_cache: Initialized Cache of size totdoc, if using a kernel.
                      NULL if linear. */
     /* model:        Returns learning result (assumed empty before called) */
     /* alpha:        Start values for the alpha variables or NULL
                      pointer. The new alpha values are returned after
                      optimization if not NULL. Array must be of size totdoc. */
{
  long *inconsistent,i,*label;
  long inconsistentnum;
  long misclassified,upsupvecnum;
  double loss,model_length,example_length;
  double maxdiff,*lin,*a,*c;
  long runtime_start,runtime_end;
  long iterations;
  long *unlabeled,transduction;
  long heldout;
  long loo_count=0,loo_count_pos=0,loo_count_neg=0,trainpos=0,trainneg=0;
  long loocomputed=0,runtime_start_loo=0,runtime_start_xa=0;
  double heldout_c=0,r_delta_sq=0,r_delta,r_delta_avg;
  long *index,*index2dnum;
  double *weights;
  CFLOAT *aicache;    /* buffer to keep one row of hessian */

  double *xi_fullset; /* buffer for storing xi on full sample in loo */
  double *a_fullset;  /* buffer for storing alpha on full sample in loo */
  TIMING timing_profile;
  SHRINK_STATE shrink_state;

  runtime_start=get_runtime();
  timing_profile.time_kernel=0;
  timing_profile.time_opti=0;
  timing_profile.time_shrink=0;
  timing_profile.time_update=0;
  timing_profile.time_model=0;
  timing_profile.time_check=0;
  timing_profile.time_select=0;
  kernel_cache_statistic=0;

  learn_parm->totwords=totwords;

  /* make sure -n value is reasonable */
  if((learn_parm->svm_newvarsinqp < 2)
     || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
    learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
  }

  init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

  label = (long *)my_malloc(sizeof(long)*totdoc);
  inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
  unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
  c = (double *)my_malloc(sizeof(double)*totdoc);
  a = (double *)my_malloc(sizeof(double)*totdoc);
  a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  lin = (double *)my_malloc(sizeof(double)*totdoc);
  learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

  model->at_upper_bound=0;
  model->b=0;
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->sv_num=1;
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;
  inconsistentnum=0;
  transduction=0;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;

  r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
  if(learn_parm->svm_c == 0.0) {  /* default value for C */
    learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
    if(verbosity>=1)
      printf("Setting default regularization parameter C=%.4f\n",
             learn_parm->svm_c);
  }
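  /* Annotation (added for exposition, not in the original SVM-light
     source): with C = 1/R^2, where R = r_delta_avg is the average
     example norm estimated above, each alpha_i is bounded by 1/R^2, so
     a single example can contribute at most roughly
     alpha_i*K(x_i,x_i) ~ C*R^2 = 1 to a decision value. This makes the
     default C invariant to a rescaling of the feature values. */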
136 | |
137 learn_parm->eps=-1.0; /* equivalent regression epsilon for | |
138 classification */ | |
139 | |
140 for(i=0;i<totdoc;i++) { /* various inits */ | |
141 docs[i]->docnum=i; | |
142 inconsistent[i]=0; | |
143 a[i]=0; | |
144 lin[i]=0; | |
145 c[i]=0.0; | |
146 unlabeled[i]=0; | |
147 if(class[i] == 0) { | |
148 unlabeled[i]=1; | |
149 label[i]=0; | |
150 transduction=1; | |
151 } | |
152 if(class[i] > 0) { | |
153 learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio* | |
154 docs[i]->costfactor; | |
155 label[i]=1; | |
156 trainpos++; | |
157 } | |
158 else if(class[i] < 0) { | |
159 learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor; | |
160 label[i]=-1; | |
161 trainneg++; | |
162 } | |
163 else { | |
164 learn_parm->svm_cost[i]=0; | |
165 } | |
166 } | |
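  /* Annotation (added for exposition, not in the original source): the
     upper bound on alpha_i is asymmetric: positive examples get
     C*costratio*costfactor_i, negative ones C*costfactor_i. costratio
     corresponds to SVM-light's -j option and allows penalizing errors
     on the two classes differently, e.g. for unbalanced data. */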
  if(verbosity>=2) {
    printf("%ld positive, %ld negative, and %ld unlabeled examples.\n",trainpos,trainneg,totdoc-trainpos-trainneg); fflush(stdout);
  }

  /* caching makes no sense for linear kernel */
  if(kernel_parm->kernel_type == LINEAR) {
    kernel_cache = NULL;
  }

  /* compute starting state for initial alpha values */
  if(alpha) {
    if(verbosity>=1) {
      printf("Computing starting state..."); fflush(stdout);
    }
    index = (long *)my_malloc(sizeof(long)*totdoc);
    index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    weights=(double *)my_malloc(sizeof(double)*(totwords+1));
    aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
    for(i=0;i<totdoc;i++) {  /* create full index and clip alphas */
      index[i]=1;
      alpha[i]=fabs(alpha[i]);
      if(alpha[i]<0) alpha[i]=0;
      if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
    }
    if(kernel_parm->kernel_type != LINEAR) {
      for(i=0;i<totdoc;i++)    /* fill kernel cache with unbounded SV */
        if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
           && (kernel_cache_space_available(kernel_cache)))
          cache_kernel_row(kernel_cache,docs,i,kernel_parm);
      for(i=0;i<totdoc;i++)    /* fill rest of kernel cache with bounded SV */
        if((alpha[i]==learn_parm->svm_cost[i])
           && (kernel_cache_space_available(kernel_cache)))
          cache_kernel_row(kernel_cache,docs,i,kernel_parm);
    }
    (void)compute_index(index,totdoc,index2dnum);
    update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
                            totwords,kernel_parm,kernel_cache,lin,aicache,
                            weights);
    (void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
                              learn_parm,index2dnum,index2dnum,model);
    for(i=0;i<totdoc;i++) {    /* copy initial alphas */
      a[i]=alpha[i];
    }
    free(index);
    free(index2dnum);
    free(weights);
    free(aicache);
    if(verbosity>=1) {
      printf("done.\n"); fflush(stdout);
    }
  }

  if(transduction) {
    learn_parm->svm_iter_to_shrink=99999999;
    if(verbosity >= 1)
      printf("\nDeactivating Shrinking due to an incompatibility with the transductive \nlearner in the current version.\n\n");
  }

  if(transduction && learn_parm->compute_loo) {
    learn_parm->compute_loo=0;
    if(verbosity >= 1)
      printf("\nCannot compute leave-one-out estimates for transductive learner.\n\n");
  }

  if(learn_parm->remove_inconsistent && learn_parm->compute_loo) {
    learn_parm->compute_loo=0;
    printf("\nCannot compute leave-one-out estimates when removing inconsistent examples.\n\n");
  }

  if(learn_parm->compute_loo && ((trainpos == 1) || (trainneg == 1))) {
    learn_parm->compute_loo=0;
    printf("\nCannot compute leave-one-out with only one example in one class.\n\n");
  }


  if(verbosity==1) {
    printf("Optimizing"); fflush(stdout);
  }

  /* train the svm */
  iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                                     kernel_parm,kernel_cache,&shrink_state,model,
                                     inconsistent,unlabeled,a,lin,
                                     c,&timing_profile,
                                     &maxdiff,(long)-1,
                                     (long)1);

  if(verbosity>=1) {
    if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

    misclassified=0;
    for(i=0;(i<totdoc);i++) { /* get final statistic */
      if((lin[i]-model->b)*(double)label[i] <= 0.0)
        misclassified++;
    }

    printf("Optimization finished (%ld misclassified, maxdiff=%.5f).\n",
           misclassified,maxdiff);

    runtime_end=get_runtime();
    if(verbosity>=2) {
      printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
             ((float)runtime_end-(float)runtime_start)/100.0,
             (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
    }
    else {
      printf("Runtime in cpu-seconds: %.2f\n",
             (runtime_end-runtime_start)/100.0);
    }

    if(learn_parm->remove_inconsistent) {
      inconsistentnum=0;
      for(i=0;i<totdoc;i++)
        if(inconsistent[i])
          inconsistentnum++;
      printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
             model->sv_num-1,inconsistentnum);
    }
    else {
      upsupvecnum=0;
      for(i=1;i<model->sv_num;i++) {
        if(fabs(model->alpha[i]) >=
           (learn_parm->svm_cost[(model->supvec[i])->docnum]-
            learn_parm->epsilon_a))
          upsupvecnum++;
      }
      printf("Number of SV: %ld (including %ld at upper bound)\n",
             model->sv_num-1,upsupvecnum);
    }

    if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
      loss=0;
      model_length=0;
      for(i=0;i<totdoc;i++) {
        if((lin[i]-model->b)*(double)label[i] < 1.0-learn_parm->epsilon_crit)
          loss+=1.0-(lin[i]-model->b)*(double)label[i];
        model_length+=a[i]*label[i]*lin[i];
      }
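      /* Annotation (added for exposition, not in the original source):
         since lin[i] = sum_j a[j]*label[j]*K(x_i,x_j) at the solution,
         the accumulated sum_i a[i]*label[i]*lin[i] equals w*w, so its
         square root below is the weight-vector norm |w|. */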
      model_length=sqrt(model_length);
      fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
      fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
      example_length=estimate_sphere(model,kernel_parm);
      fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
              length_of_longest_document_vector(docs,totdoc,kernel_parm));
      fprintf(stdout,"Estimated VCdim of classifier: VCdim<=%.5f\n",
              estimate_margin_vcdim(model,model_length,example_length,
                                    kernel_parm));
      if((!learn_parm->remove_inconsistent) && (!transduction)) {
        runtime_start_xa=get_runtime();
        if(verbosity>=1) {
          printf("Computing XiAlpha-estimates..."); fflush(stdout);
        }
        compute_xa_estimates(model,label,unlabeled,totdoc,docs,lin,a,
                             kernel_parm,learn_parm,&(model->xa_error),
                             &(model->xa_recall),&(model->xa_precision));
        if(verbosity>=1) {
          printf("done\n");
        }
        printf("Runtime for XiAlpha-estimates in cpu-seconds: %.2f\n",
               (get_runtime()-runtime_start_xa)/100.0);

        fprintf(stdout,"XiAlpha-estimate of the error: error<=%.2f%% (rho=%.2f,depth=%ld)\n",
                model->xa_error,learn_parm->rho,learn_parm->xa_depth);
        fprintf(stdout,"XiAlpha-estimate of the recall: recall=>%.2f%% (rho=%.2f,depth=%ld)\n",
                model->xa_recall,learn_parm->rho,learn_parm->xa_depth);
        fprintf(stdout,"XiAlpha-estimate of the precision: precision=>%.2f%% (rho=%.2f,depth=%ld)\n",
                model->xa_precision,learn_parm->rho,learn_parm->xa_depth);
      }
      else if(!learn_parm->remove_inconsistent) {
        estimate_transduction_quality(model,label,unlabeled,totdoc,docs,lin);
      }
    }
    if(verbosity>=1) {
      printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
    }
  }


  /* leave-one-out testing starts now */
  if(learn_parm->compute_loo) {
    /* save results of training on full dataset for leave-one-out */
    runtime_start_loo=get_runtime();
    for(i=0;i<totdoc;i++) {
      xi_fullset[i]=1.0-((lin[i]-model->b)*(double)label[i]);
      if(xi_fullset[i]<0) xi_fullset[i]=0;
      a_fullset[i]=a[i];
    }
    if(verbosity>=1) {
      printf("Computing leave-one-out");
    }

    /* repeat this loop for every held-out example */
    for(heldout=0;(heldout<totdoc);heldout++) {
      if(learn_parm->rho*a_fullset[heldout]*r_delta_sq+xi_fullset[heldout]
         < 1.0) {
        /* guaranteed to not produce a leave-one-out error */
        if(verbosity==1) {
          printf("+"); fflush(stdout);
        }
      }
      else if(xi_fullset[heldout] > 1.0) {
        /* guaranteed to produce a leave-one-out error */
        loo_count++;
        if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++;
        if(verbosity==1) {
          printf("-"); fflush(stdout);
        }
      }
      else {
        loocomputed++;
        heldout_c=learn_parm->svm_cost[heldout];  /* set upper bound to zero */
        learn_parm->svm_cost[heldout]=0;
        /* make sure heldout example is not currently  */
        /* shrunk away. Assumes that lin is up to date! */
        shrink_state.active[heldout]=1;
        if(verbosity>=2)
          printf("\nLeave-One-Out test on example %ld\n",heldout);
        if(verbosity>=1) {
          printf("(?[%ld]",heldout); fflush(stdout);
        }

        optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                                kernel_parm,
                                kernel_cache,&shrink_state,model,inconsistent,unlabeled,
                                a,lin,c,&timing_profile,
                                &maxdiff,heldout,(long)2);

        /* printf("%.20f\n",(lin[heldout]-model->b)*(double)label[heldout]); */

        if(((lin[heldout]-model->b)*(double)label[heldout]) <= 0.0) {
          loo_count++;                            /* there was a loo-error */
          if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++;
          if(verbosity>=1) {
            printf("-)"); fflush(stdout);
          }
        }
        else {
          if(verbosity>=1) {
            printf("+)"); fflush(stdout);
          }
        }
        /* now we need to restore the original data set */
        learn_parm->svm_cost[heldout]=heldout_c; /* restore upper bound */
      }
    } /* end of leave-one-out loop */
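    /* Annotation (added for exposition, not in the original source):
       the two shortcuts above are the xi/alpha pruning rules of
       Joachims' efficient leave-one-out method. If
       rho*alpha_i*R^2 + xi_i < 1, the example provably cannot become a
       leave-one-out error; if xi_i > 1, it is counted as an error
       without retraining. Only the remaining examples (loocomputed of
       them) require an actual retraining run. */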


    if(verbosity>=1) {
      printf("\nRetrain on full problem"); fflush(stdout);
    }
    optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                            kernel_parm,
                            kernel_cache,&shrink_state,model,inconsistent,unlabeled,
                            a,lin,c,&timing_profile,
                            &maxdiff,(long)-1,(long)1);
    if(verbosity >= 1)
      printf("done.\n");


    /* after all leave-one-out computed */
    model->loo_error=100.0*loo_count/(double)totdoc;
    model->loo_recall=(1.0-(double)loo_count_pos/(double)trainpos)*100.0;
    model->loo_precision=(trainpos-loo_count_pos)/
      (double)(trainpos-loo_count_pos+loo_count_neg)*100.0;
    if(verbosity >= 1) {
      fprintf(stdout,"Leave-one-out estimate of the error: error=%.2f%%\n",
              model->loo_error);
      fprintf(stdout,"Leave-one-out estimate of the recall: recall=%.2f%%\n",
              model->loo_recall);
      fprintf(stdout,"Leave-one-out estimate of the precision: precision=%.2f%%\n",
              model->loo_precision);
      fprintf(stdout,"Actual leave-one-outs computed: %ld (rho=%.2f)\n",
              loocomputed,learn_parm->rho);
      printf("Runtime for leave-one-out in cpu-seconds: %.2f\n",
             (double)(get_runtime()-runtime_start_loo)/100.0);
    }
  }

  if(learn_parm->alphafile[0])
    write_alphas(learn_parm->alphafile,a,label,totdoc);

  shrink_state_cleanup(&shrink_state);
  free(label);
  free(inconsistent);
  free(unlabeled);
  free(c);
  free(a);
  free(a_fullset);
  free(xi_fullset);
  free(lin);
  free(learn_parm->svm_cost);
}


/* Learns an SVM regression model based on the training data in
   docs/label. The resulting model is returned in the structure
   model. */

void svm_learn_regression(DOC **docs, double *value, long int totdoc,
                          long int totwords, LEARN_PARM *learn_parm,
                          KERNEL_PARM *kernel_parm,
                          KERNEL_CACHE **kernel_cache, MODEL *model)
     /* docs:         Training vectors (x-part) */
     /* value:        Training values (y-part) */
     /* totdoc:       Number of examples in docs/label */
     /* totwords:     Number of features (i.e. highest feature index) */
     /* learn_parm:   Learning parameters */
     /* kernel_parm:  Kernel parameters */
     /* kernel_cache: Initialized Cache, if using a kernel. NULL if
                      linear. Note that it will be free'd and reassigned */
     /* model:        Returns learning result (assumed empty before called) */
{
  long *inconsistent,i,j;
  long inconsistentnum;
  long upsupvecnum;
  double loss,model_length,example_length;
  double maxdiff,*lin,*a,*c;
  long runtime_start,runtime_end;
  long iterations,kernel_cache_size;
  long *unlabeled;
  double r_delta_sq=0,r_delta,r_delta_avg;
  double *xi_fullset; /* buffer for storing xi on full sample in loo */
  double *a_fullset;  /* buffer for storing alpha on full sample in loo */
  TIMING timing_profile;
  SHRINK_STATE shrink_state;
  DOC **docs_org;
  long *label;

  /* set up regression problem in standard form */
  docs_org=docs;
  docs = (DOC **)my_malloc(sizeof(DOC *)*2*totdoc);
  label = (long *)my_malloc(sizeof(long)*2*totdoc);
  c = (double *)my_malloc(sizeof(double)*2*totdoc);
  for(i=0;i<totdoc;i++) {
    j=2*totdoc-1-i;
    docs[i]=create_example(i,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
    label[i]=+1;
    c[i]=value[i];
    docs[j]=create_example(j,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
    label[j]=-1;
    c[j]=value[i];
  }
  totdoc*=2;
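  /* Annotation (added for exposition, not in the original source): this
     is the standard reduction of epsilon-insensitive regression to a
     pair of inequality constraints per example. The copy with label +1
     enforces (w*x_i - b) >= y_i - eps - xi_i, the mirrored copy with
     label -1 enforces -(w*x_i - b) >= -y_i - eps - xi_j, so together the
     prediction must stay within the eps-tube |(w*x_i - b) - y_i| <= eps
     up to slack. The right-hand sides c[] carry the targets y_i. */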

  /* need to get a bigger kernel cache */
  if(*kernel_cache) {
    kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
    kernel_cache_cleanup(*kernel_cache);
    (*kernel_cache)=kernel_cache_init(totdoc,kernel_cache_size);
  }

  runtime_start=get_runtime();
  timing_profile.time_kernel=0;
  timing_profile.time_opti=0;
  timing_profile.time_shrink=0;
  timing_profile.time_update=0;
  timing_profile.time_model=0;
  timing_profile.time_check=0;
  timing_profile.time_select=0;
  kernel_cache_statistic=0;

  learn_parm->totwords=totwords;

  /* make sure -n value is reasonable */
  if((learn_parm->svm_newvarsinqp < 2)
     || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
    learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
  }

  init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

  inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
  unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
  a = (double *)my_malloc(sizeof(double)*totdoc);
  a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
  lin = (double *)my_malloc(sizeof(double)*totdoc);
  learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

  model->at_upper_bound=0;
  model->b=0;
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->sv_num=1;
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;
  inconsistentnum=0;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;

  r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
  if(learn_parm->svm_c == 0.0) {  /* default value for C */
    learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
    if(verbosity>=1)
      printf("Setting default regularization parameter C=%.4f\n",
             learn_parm->svm_c);
  }

  for(i=0;i<totdoc;i++) {  /* various inits */
    inconsistent[i]=0;
    a[i]=0;
    lin[i]=0;
    unlabeled[i]=0;
    if(label[i] > 0) {
      learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
        docs[i]->costfactor;
    }
    else if(label[i] < 0) {
      learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
    }
  }

  /* caching makes no sense for linear kernel */
  if((kernel_parm->kernel_type == LINEAR) && (*kernel_cache)) {
    printf("WARNING: Using a kernel cache for linear case will slow optimization down!\n");
  }

  if(verbosity==1) {
    printf("Optimizing"); fflush(stdout);
  }

  /* train the svm */
  iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
                                     kernel_parm,*kernel_cache,&shrink_state,
                                     model,inconsistent,unlabeled,a,lin,c,
                                     &timing_profile,&maxdiff,(long)-1,
                                     (long)1);

  if(verbosity>=1) {
    if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

    printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);

    runtime_end=get_runtime();
    if(verbosity>=2) {
      printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
             ((float)runtime_end-(float)runtime_start)/100.0,
             (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
    }
    else {
      printf("Runtime in cpu-seconds: %.2f\n",
             (runtime_end-runtime_start)/100.0);
    }

    if(learn_parm->remove_inconsistent) {
      inconsistentnum=0;
      for(i=0;i<totdoc;i++)
        if(inconsistent[i])
          inconsistentnum++;
      printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
             model->sv_num-1,inconsistentnum);
    }
    else {
      upsupvecnum=0;
      for(i=1;i<model->sv_num;i++) {
        if(fabs(model->alpha[i]) >=
           (learn_parm->svm_cost[(model->supvec[i])->docnum]-
            learn_parm->epsilon_a))
          upsupvecnum++;
      }
      printf("Number of SV: %ld (including %ld at upper bound)\n",
             model->sv_num-1,upsupvecnum);
    }

    if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
      loss=0;
      model_length=0;
      for(i=0;i<totdoc;i++) {
        if((lin[i]-model->b)*(double)label[i] < (-learn_parm->eps+(double)label[i]*c[i])-learn_parm->epsilon_crit)
          loss+=-learn_parm->eps+(double)label[i]*c[i]-(lin[i]-model->b)*(double)label[i];
        model_length+=a[i]*label[i]*lin[i];
      }
      model_length=sqrt(model_length);
      fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
      fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
      example_length=estimate_sphere(model,kernel_parm);
      fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
              length_of_longest_document_vector(docs,totdoc,kernel_parm));
    }
    if(verbosity>=1) {
      printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
    }
  }

  if(learn_parm->alphafile[0])
    write_alphas(learn_parm->alphafile,a,label,totdoc);

  /* this makes sure the model we return does not contain pointers to the
     temporary documents */
  for(i=1;i<model->sv_num;i++) {
    j=model->supvec[i]->docnum;
    if(j >= (totdoc/2)) {
      j=totdoc-j-1;
    }
    model->supvec[i]=docs_org[j];
  }

  shrink_state_cleanup(&shrink_state);
  for(i=0;i<totdoc;i++)
    free_example(docs[i],0);
  free(docs);
  free(label);
  free(inconsistent);
  free(unlabeled);
  free(c);
  free(a);
  free(a_fullset);
  free(xi_fullset);
  free(lin);
  free(learn_parm->svm_cost);
}
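
/* Editor's sketch (added for exposition, not part of the original
   SVM-light sources): a self-contained check of the index arithmetic
   used above. Example i gets a mirrored negative constraint at
   j = 2*totdoc-1-i, and once totdoc has been doubled, any j in the
   upper half maps back to the original example via totdoc-j-1, which is
   exactly how the support vectors are re-pointed to docs_org before
   returning. */
#include <assert.h>
static void check_regression_index_mapping(long totdoc_org)
{
  long i,j,totdoc=2*totdoc_org;
  for(i=0;i<totdoc_org;i++) {
    j=2*totdoc_org-1-i;         /* mirrored copy with label -1 */
    assert(j >= (totdoc/2));    /* always lands in the upper half */
    assert((totdoc-j-1) == i);  /* recovers the original index */
  }
}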

void svm_learn_ranking(DOC **docs, double *rankvalue, long int totdoc,
                       long int totwords, LEARN_PARM *learn_parm,
                       KERNEL_PARM *kernel_parm, KERNEL_CACHE **kernel_cache,
                       MODEL *model)
     /* docs:         Training vectors (x-part) */
     /* rankvalue:    Training target values that determine the ranking */
     /* totdoc:       Number of examples in docs/label */
     /* totwords:     Number of features (i.e. highest feature index) */
     /* learn_parm:   Learning parameters */
     /* kernel_parm:  Kernel parameters */
     /* kernel_cache: Initialized pointer to Cache of size 1*totdoc, if
                      using a kernel. NULL if linear. NOTE: Cache is
                      getting reinitialized in this function */
     /* model:        Returns learning result (assumed empty before called) */
{
  DOC **docdiff;
  long i,j,k,totpair,kernel_cache_size;
  double *target,*alpha,cost;
  long *greater,*lesser;
  MODEL *pairmodel;
  SVECTOR *flow,*fhigh;

  totpair=0;
  for(i=0;i<totdoc;i++) {
    for(j=i+1;j<totdoc;j++) {
      if((docs[i]->queryid==docs[j]->queryid) && (rankvalue[i] != rankvalue[j])) {
        totpair++;
      }
    }
  }

  printf("Constructing %ld rank constraints...",totpair); fflush(stdout);
  docdiff=(DOC **)my_malloc(sizeof(DOC *)*totpair);
  target=(double *)my_malloc(sizeof(double)*totpair);
  greater=(long *)my_malloc(sizeof(long)*totpair);
  lesser=(long *)my_malloc(sizeof(long)*totpair);

  k=0;
  for(i=0;i<totdoc;i++) {
    for(j=i+1;j<totdoc;j++) {
      if(docs[i]->queryid == docs[j]->queryid) {
        cost=(docs[i]->costfactor+docs[j]->costfactor)/2.0;
        if(rankvalue[i] > rankvalue[j]) {
          if(kernel_parm->kernel_type == LINEAR)
            docdiff[k]=create_example(k,0,0,cost,
                                      sub_ss(docs[i]->fvec,docs[j]->fvec));
          else {
            flow=copy_svector(docs[j]->fvec);
            flow->factor=-1.0;
            flow->next=NULL;
            fhigh=copy_svector(docs[i]->fvec);
            fhigh->factor=1.0;
            fhigh->next=flow;
            docdiff[k]=create_example(k,0,0,cost,fhigh);
          }
          target[k]=1;
          greater[k]=i;
          lesser[k]=j;
          k++;
        }
        else if(rankvalue[i] < rankvalue[j]) {
          if(kernel_parm->kernel_type == LINEAR)
            docdiff[k]=create_example(k,0,0,cost,
                                      sub_ss(docs[i]->fvec,docs[j]->fvec));
          else {
            flow=copy_svector(docs[j]->fvec);
            flow->factor=-1.0;
            flow->next=NULL;
            fhigh=copy_svector(docs[i]->fvec);
            fhigh->factor=1.0;
            fhigh->next=flow;
            docdiff[k]=create_example(k,0,0,cost,fhigh);
          }
          target[k]=-1;
          greater[k]=i;
          lesser[k]=j;
          k++;
        }
      }
    }
  }
  printf("done.\n"); fflush(stdout);
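
  /* Annotation (added for exposition, not in the original source): this
     is the pairwise reduction of ranking to classification. Every pair
     (i,j) from the same query with rankvalue[i] > rankvalue[j] yields
     one constraint w*(x_i - x_j) >= 1 - xi, i.e. a classification
     example on the difference vector x_i - x_j. In the linear case the
     difference is formed explicitly via sub_ss; for nonlinear kernels
     it is kept as a two-element SVECTOR list with factors +1 and -1, so
     kernel evaluations can expand the difference by bilinearity. */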

  /* need to get a bigger kernel cache */
  if(*kernel_cache) {
    kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
    kernel_cache_cleanup(*kernel_cache);
    (*kernel_cache)=kernel_cache_init(totpair,kernel_cache_size);
  }

  /* must use unbiased hyperplane on difference vectors */
  learn_parm->biased_hyperplane=0;
  pairmodel=(MODEL *)my_malloc(sizeof(MODEL));
  svm_learn_classification(docdiff,target,totpair,totwords,learn_parm,
                           kernel_parm,(*kernel_cache),pairmodel,NULL);

  /* Transfer the result into a more compact model. If you would like
     to output the original model on pairs of documents, see below. */
  alpha=(double *)my_malloc(sizeof(double)*totdoc);
  for(i=0;i<totdoc;i++) {
    alpha[i]=0;
  }
  for(i=1;i<pairmodel->sv_num;i++) {
    alpha[lesser[(pairmodel->supvec[i])->docnum]]-=pairmodel->alpha[i];
    alpha[greater[(pairmodel->supvec[i])->docnum]]+=pairmodel->alpha[i];
  }
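  /* Annotation (added for exposition, not in the original source):
     every support vector of the pair model is a difference x_g - x_l,
     so the weight vector is w = sum_k alpha_k*(x_g(k) - x_l(k))
     = sum_i beta_i*x_i, where beta_i collects +alpha_k for each pair in
     which document i is the greater element and -alpha_k for each pair
     in which it is the lesser one. The loop above computes these
     beta_i, allowing a compact model over the original documents. */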
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->sv_num=1;
  for(i=0;i<totdoc;i++) {
    if(alpha[i]) {
      model->supvec[model->sv_num]=docs[i];
      model->alpha[model->sv_num]=alpha[i];
      model->index[i]=model->sv_num;
      model->sv_num++;
    }
    else {
      model->index[i]=-1;
    }
  }
  model->at_upper_bound=0;
  model->b=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;

  free(alpha);
  free(greater);
  free(lesser);
  free(target);

  /* If you would like to output the original model on pairs of
     documents, replace the following lines with '(*model)=(*pairmodel);' */
  for(i=0;i<totpair;i++)
    free_example(docdiff[i],1);
  free(docdiff);
  free_model(pairmodel,0);
}
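

/* Editor's sketch (added for exposition, not part of the original
   SVM-light sources): the number of rank constraints generated above,
   as a plain self-contained function mirroring the totpair counting
   loop. Pairs are formed only within a query, and only when the rank
   values differ; e.g. ranks {2,1,1} within one query give 2 pairs. */
static long count_rank_pairs(const long *queryid, const double *rankvalue,
                             long n)
{
  long i,j,pairs=0;
  for(i=0;i<n;i++)
    for(j=i+1;j<n;j++)
      if((queryid[i] == queryid[j]) && (rankvalue[i] != rankvalue[j]))
        pairs++;
  return pairs;
}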


/* The following solves a freely defined and given set of
   inequalities. The optimization problem is of the following form:

       min  0.5 w*w + C sum_i C_i \xi_i
       s.t. x_i * w > rhs_i - \xi_i

   This corresponds to the -z o option. */
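
/* Annotation (added for exposition, not in the original source):
   ordinary classification appears to be the special case of this
   formulation obtained by substituting x_i = y_i*(original x_i) and
   rhs_i = 1, which recovers y_i*(w*x_i) >= 1 - xi_i for an unbiased
   hyperplane (biased_hyperplane is indeed forced to 0 below). */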

void svm_learn_optimization(DOC **docs, double *rhs, long int
                            totdoc, long int totwords,
                            LEARN_PARM *learn_parm,
                            KERNEL_PARM *kernel_parm,
                            KERNEL_CACHE *kernel_cache, MODEL *model,
                            double *alpha)
     /* docs:         Left-hand side of inequalities (x-part) */
     /* rhs:          Right-hand side of inequalities */
     /* totdoc:       Number of examples in docs/label */
     /* totwords:     Number of features (i.e. highest feature index) */
     /* learn_parm:   Learning parameters */
     /* kernel_parm:  Kernel parameters */
     /* kernel_cache: Initialized Cache of size 1*totdoc, if using a kernel.
                      NULL if linear. */
     /* model:        Returns solution as SV expansion (assumed empty before called) */
     /* alpha:        Start values for the alpha variables or NULL
                      pointer. The new alpha values are returned after
                      optimization if not NULL. Array must be of size totdoc. */
{
  long i,*label;
  long misclassified,upsupvecnum;
  double loss,model_length,example_length;
  double maxdiff,*lin,*a,*c;
  long runtime_start,runtime_end;
  long iterations,maxslackid,svsetnum;
  long *unlabeled,*inconsistent;
  double r_delta_sq=0,r_delta,r_delta_avg;
  long *index,*index2dnum;
  double *weights,*slack,*alphaslack;
  CFLOAT *aicache;  /* buffer to keep one row of hessian */

  TIMING timing_profile;
  SHRINK_STATE shrink_state;

  runtime_start=get_runtime();
  timing_profile.time_kernel=0;
  timing_profile.time_opti=0;
  timing_profile.time_shrink=0;
  timing_profile.time_update=0;
  timing_profile.time_model=0;
  timing_profile.time_check=0;
  timing_profile.time_select=0;
  kernel_cache_statistic=0;

  learn_parm->totwords=totwords;

  /* make sure -n value is reasonable */
  if((learn_parm->svm_newvarsinqp < 2)
     || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
    learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
  }

  init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);

  label = (long *)my_malloc(sizeof(long)*totdoc);
  unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
  inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
  c = (double *)my_malloc(sizeof(double)*totdoc);
  a = (double *)my_malloc(sizeof(double)*totdoc);
  lin = (double *)my_malloc(sizeof(double)*totdoc);
  learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
  model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
  model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));

  model->at_upper_bound=0;
  model->b=0;
  model->supvec[0]=0;  /* element 0 reserved and empty for now */
  model->alpha[0]=0;
  model->lin_weights=NULL;
  model->totwords=totwords;
  model->totdoc=totdoc;
  model->kernel_parm=(*kernel_parm);
  model->sv_num=1;
  model->loo_error=-1;
  model->loo_recall=-1;
  model->loo_precision=-1;
  model->xa_error=-1;
  model->xa_recall=-1;
  model->xa_precision=-1;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;

  r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
  if(learn_parm->svm_c == 0.0) {  /* default value for C */
    learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
    if(verbosity>=1)
      printf("Setting default regularization parameter C=%.4f\n",
             learn_parm->svm_c);
  }

  learn_parm->biased_hyperplane=0; /* learn an unbiased hyperplane */

  learn_parm->eps=0.0; /* No margin, unless explicitly handcoded
                          in the right-hand side in the training
                          set. */

  for(i=0;i<totdoc;i++) {  /* various inits */
    docs[i]->docnum=i;
    a[i]=0;
    lin[i]=0;
    c[i]=rhs[i];           /* set right-hand side */
    unlabeled[i]=0;
    inconsistent[i]=0;
    learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
      docs[i]->costfactor;
    label[i]=1;
  }
  if(learn_parm->sharedslack)  /* if shared slacks are used, they must */
    for(i=0;i<totdoc;i++)      /* be used on every constraint */
      if(!docs[i]->slackid) {
        perror("Error: Missing shared slacks definitions in some of the examples.");
        exit(1);
      }

  /* compute starting state for initial alpha values */
  if(alpha) {
    if(verbosity>=1) {
      printf("Computing starting state..."); fflush(stdout);
    }
    index = (long *)my_malloc(sizeof(long)*totdoc);
    index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    weights=(double *)my_malloc(sizeof(double)*(totwords+1));
    aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
    for(i=0;i<totdoc;i++) {  /* create full index and clip alphas */
      index[i]=1;
      alpha[i]=fabs(alpha[i]);
      if(alpha[i]<0) alpha[i]=0;
      if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
    }
    if(kernel_parm->kernel_type != LINEAR) {
      for(i=0;i<totdoc;i++)    /* fill kernel cache with unbounded SV */
        if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
           && (kernel_cache_space_available(kernel_cache)))
          cache_kernel_row(kernel_cache,docs,i,kernel_parm);
      for(i=0;i<totdoc;i++)    /* fill rest of kernel cache with bounded SV */
        if((alpha[i]==learn_parm->svm_cost[i])
           && (kernel_cache_space_available(kernel_cache)))
          cache_kernel_row(kernel_cache,docs,i,kernel_parm);
    }
    (void)compute_index(index,totdoc,index2dnum);
    update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
                            totwords,kernel_parm,kernel_cache,lin,aicache,
                            weights);
    (void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
                              learn_parm,index2dnum,index2dnum,model);
    for(i=0;i<totdoc;i++) {    /* copy initial alphas */
      a[i]=alpha[i];
    }
    free(index);
    free(index2dnum);
    free(weights);
    free(aicache);
    if(verbosity>=1) {
      printf("done.\n"); fflush(stdout);
    }
  }

  /* removing inconsistent does not work for general optimization problem */
  if(learn_parm->remove_inconsistent) {
    learn_parm->remove_inconsistent = 0;
    printf("'remove inconsistent' not available in this mode. Switching option off!"); fflush(stdout);
  }

  /* caching makes no sense for linear kernel */
  if(kernel_parm->kernel_type == LINEAR) {
    kernel_cache = NULL;
  }

  if(verbosity==1) {
    printf("Optimizing"); fflush(stdout);
  }

  /* train the svm */
  if(learn_parm->sharedslack)
    iterations=optimize_to_convergence_sharedslack(docs,label,totdoc,
                                                   totwords,learn_parm,kernel_parm,
                                                   kernel_cache,&shrink_state,model,
                                                   a,lin,c,&timing_profile,
                                                   &maxdiff);
  else
    iterations=optimize_to_convergence(docs,label,totdoc,
                                       totwords,learn_parm,kernel_parm,
                                       kernel_cache,&shrink_state,model,
                                       inconsistent,unlabeled,
                                       a,lin,c,&timing_profile,
                                       &maxdiff,(long)-1,(long)1);

  if(verbosity>=1) {
    if(verbosity==1) printf("done. (%ld iterations)\n",iterations);

    misclassified=0;
    for(i=0;(i<totdoc);i++) { /* get final statistic */
      if((lin[i]-model->b)*(double)label[i] <= 0.0)
        misclassified++;
    }

    printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);

    runtime_end=get_runtime();
    if(verbosity>=2) {
      printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
             ((float)runtime_end-(float)runtime_start)/100.0,
             (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
             (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
    }
    else {
      printf("Runtime in cpu-seconds: %.2f\n",
             (runtime_end-runtime_start)/100.0);
    }
  }
  if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
    loss=0;
    model_length=0;
    for(i=0;i<totdoc;i++) {
      if((lin[i]-model->b)*(double)label[i] < c[i]-learn_parm->epsilon_crit)
        loss+=c[i]-(lin[i]-model->b)*(double)label[i];
      model_length+=a[i]*label[i]*lin[i];
    }
    model_length=sqrt(model_length);
    fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
  }

  if(learn_parm->sharedslack) {
    index = (long *)my_malloc(sizeof(long)*totdoc);
    index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    maxslackid=0;
    for(i=0;i<totdoc;i++) {    /* create full index */
      index[i]=1;
      if(maxslackid<docs[i]->slackid)
        maxslackid=docs[i]->slackid;
    }
    (void)compute_index(index,totdoc,index2dnum);
    slack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
    alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
    for(i=0;i<=maxslackid;i++) {  /* init shared slacks */
      slack[i]=0;
      alphaslack[i]=0;
    }
    compute_shared_slacks(docs,label,a,lin,c,index2dnum,learn_parm,
                          slack,alphaslack);
    loss=0;
    model->at_upper_bound=0;
    svsetnum=0;
    for(i=0;i<=maxslackid;i++) {  /* compute stats on shared slacks */
      loss+=slack[i];
      if(alphaslack[i] > (learn_parm->svm_c - learn_parm->epsilon_a))
        model->at_upper_bound++;
      if(alphaslack[i] > learn_parm->epsilon_a)
        svsetnum++;
    }
    free(index);
    free(index2dnum);
    free(slack);
    free(alphaslack);
  }

  if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
    if(learn_parm->sharedslack) {
      printf("Number of SV: %ld\n",
             model->sv_num-1);
      printf("Number of non-zero slack variables: %ld (out of %ld)\n",
             model->at_upper_bound,svsetnum);
      fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
    }
    else {
      upsupvecnum=0;
      for(i=1;i<model->sv_num;i++) {
        if(fabs(model->alpha[i]) >=
           (learn_parm->svm_cost[(model->supvec[i])->docnum]-
            learn_parm->epsilon_a))
          upsupvecnum++;
      }
      printf("Number of SV: %ld (including %ld at upper bound)\n",
             model->sv_num-1,upsupvecnum);
      fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
    }
    example_length=estimate_sphere(model,kernel_parm);
    fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
            length_of_longest_document_vector(docs,totdoc,kernel_parm));
  }
  if(verbosity>=1) {
    printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
  }

  if(alpha) {
    for(i=0;i<totdoc;i++) {  /* copy final alphas */
      alpha[i]=a[i];
    }
  }

  if(learn_parm->alphafile[0])
    write_alphas(learn_parm->alphafile,a,label,totdoc);

  shrink_state_cleanup(&shrink_state);
  free(label);
  free(unlabeled);
  free(inconsistent);
  free(c);
  free(a);
  free(lin);
  free(learn_parm->svm_cost);
}


long optimize_to_convergence(DOC **docs, long int *label, long int totdoc,
                             long int totwords, LEARN_PARM *learn_parm,
                             KERNEL_PARM *kernel_parm,
                             KERNEL_CACHE *kernel_cache,
                             SHRINK_STATE *shrink_state, MODEL *model,
                             long int *inconsistent, long int *unlabeled,
                             double *a, double *lin, double *c,
                             TIMING *timing_profile, double *maxdiff,
                             long int heldout, long int retrain)
     /* docs:         Training vectors (x-part) */
     /* label:        Training labels/value (y-part, zero if test example for
                      transduction) */
     /* totdoc:       Number of examples in docs/label */
     /* totwords:     Number of features (i.e. highest feature index) */
     /* learn_parm:   Learning parameters */
     /* kernel_parm:  Kernel parameters */
     /* kernel_cache: Initialized/partly filled Cache, if using a kernel.
                      NULL if linear. */
     /* shrink_state: State of active variables */
     /* model:        Returns learning result */
     /* inconsistent: examples thrown out as inconsistent */
     /* unlabeled:    test examples for transduction */
     /* a:            alphas */
     /* lin:          linear component of gradient */
     /* c:            right hand side of inequalities (margin) */
     /* maxdiff:      returns maximum violation of KT-conditions */
     /* heldout:      marks held-out example for leave-one-out (or -1) */
     /* retrain:      selects training mode (1=regular / 2=holdout) */
{
  long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink;
  long inconsistentnum,choosenum,already_chosen=0,iteration;
  long misclassified,supvecnum=0,*active2dnum,inactivenum;
  long *working2dnum,*selexam;
  long activenum;
  double criterion,eq;
  double *a_old;
  long t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */
  long transductcycle;
  long transduction;
  double epsilon_crit_org;
  double bestmaxdiff;
  long bestmaxdiffiter,terminate;

  double *selcrit;  /* buffer for sorting */
  CFLOAT *aicache;  /* buffer to keep one row of hessian */
  double *weights;  /* buffer for weight vector in linear case */
  QP qp;            /* buffer for one quadratic program */

  epsilon_crit_org=learn_parm->epsilon_crit; /* save org */
  if(kernel_parm->kernel_type == LINEAR) {
    learn_parm->epsilon_crit=2.0;
    kernel_cache=NULL;   /* caching makes no sense for linear kernel */
  }
  learn_parm->epsilon_shrink=2;
  (*maxdiff)=1;

  learn_parm->totwords=totwords;

  chosen = (long *)my_malloc(sizeof(long)*totdoc);
  last_suboptimal_at = (long *)my_malloc(sizeof(long)*totdoc);
  key = (long *)my_malloc(sizeof(long)*(totdoc+11));
  selcrit = (double *)my_malloc(sizeof(double)*totdoc);
  selexam = (long *)my_malloc(sizeof(long)*totdoc);
  a_old = (double *)my_malloc(sizeof(double)*totdoc);
  aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
  working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
  active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
  qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
  qp.opt_ce0 = (double *)my_malloc(sizeof(double));
  qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize
                                 *learn_parm->svm_maxqpsize);
  qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
  qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
  qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
  qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
  weights=(double *)my_malloc(sizeof(double)*(totwords+1));

  choosenum=0;
  inconsistentnum=0;
  transductcycle=0;
  transduction=0;
  if(!retrain) retrain=1;
  iteration=1;
  bestmaxdiffiter=1;
  bestmaxdiff=999999999;
  terminate=0;

  if(kernel_cache) {
    kernel_cache->time=iteration;  /* for lru cache */
    kernel_cache_reset_lru(kernel_cache);
  }

  for(i=0;i<totdoc;i++) {    /* various inits */
    chosen[i]=0;
    a_old[i]=a[i];
    last_suboptimal_at[i]=1;
    if(inconsistent[i])
      inconsistentnum++;
    if(unlabeled[i]) {
      transduction=1;
    }
  }
  activenum=compute_index(shrink_state->active,totdoc,active2dnum);
  inactivenum=totdoc-activenum;
  clear_index(working2dnum);

  /* repeat this loop until we have convergence */
  for(;retrain && (!terminate);iteration++) {

    if(kernel_cache)
      kernel_cache->time=iteration;  /* for lru cache */
    if(verbosity>=2) {
      printf("Iteration %ld: ",iteration); fflush(stdout);
    }
    else if(verbosity==1) {
      printf("."); fflush(stdout);
    }

    if(verbosity>=2) t0=get_runtime();
    if(verbosity>=3) {
      printf("\nSelecting working set... "); fflush(stdout);
    }

    if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize)
      learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;

    i=0;
    for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */
      if((chosen[j]>=(learn_parm->svm_maxqpsize/
                      minl(learn_parm->svm_maxqpsize,
                           learn_parm->svm_newvarsinqp)))
         || (inconsistent[j])
         || (j == heldout)) {
        chosen[j]=0;
        choosenum--;
      }
      else {
        chosen[j]++;
        working2dnum[i++]=j;
      }
    }
    working2dnum[i]=-1;

    if(retrain == 2) {
      choosenum=0;
      for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* fully clear working set */
        chosen[j]=0;
      }
      clear_index(working2dnum);
      for(i=0;i<totdoc;i++) { /* set inconsistent examples to zero (-i 1) */
        if((inconsistent[i] || (heldout==i)) && (a[i] != 0.0)) {
          chosen[i]=99999;
          choosenum++;
          a[i]=0;
        }
      }
      if(learn_parm->biased_hyperplane) {
        eq=0;
        for(i=0;i<totdoc;i++) { /* make sure we fulfill equality constraint */
          eq+=a[i]*label[i];
        }
        for(i=0;(i<totdoc) && (fabs(eq) > learn_parm->epsilon_a);i++) {
          if((eq*label[i] > 0) && (a[i] > 0)) {
            chosen[i]=88888;
            choosenum++;
            if((eq*label[i]) > a[i]) {
              eq-=(a[i]*label[i]);
              a[i]=0;
            }
            else {
              a[i]-=(eq*label[i]);
              eq=0;
            }
          }
        }
      }
      compute_index(chosen,totdoc,working2dnum);
    }
    else {   /* select working set according to steepest gradient */
      if(iteration % 101) {
        already_chosen=0;
        if((minl(learn_parm->svm_newvarsinqp,
                 learn_parm->svm_maxqpsize-choosenum)>=4)
           && (kernel_parm->kernel_type != LINEAR)) {
          /* select part of the working set from cache */
          already_chosen=select_next_qp_subproblem_grad(
              label,unlabeled,a,lin,c,totdoc,
              (long)(minl(learn_parm->svm_maxqpsize-choosenum,
                          learn_parm->svm_newvarsinqp)
                     /2),
              learn_parm,inconsistent,active2dnum,
              working2dnum,selcrit,selexam,kernel_cache,1,
              key,chosen);
          choosenum+=already_chosen;
        }
        choosenum+=select_next_qp_subproblem_grad(
            label,unlabeled,a,lin,c,totdoc,
            minl(learn_parm->svm_maxqpsize-choosenum,
                 learn_parm->svm_newvarsinqp-already_chosen),
            learn_parm,inconsistent,active2dnum,
            working2dnum,selcrit,selexam,kernel_cache,0,key,
            chosen);
      }
      else { /* once in a while, select a somewhat random working set
                to get unlocked of infinite loops due to numerical
                inaccuracies in the core qp-solver */
        choosenum+=select_next_qp_subproblem_rand(
            label,unlabeled,a,lin,c,totdoc,
            minl(learn_parm->svm_maxqpsize-choosenum,
                 learn_parm->svm_newvarsinqp),
            learn_parm,inconsistent,active2dnum,
            working2dnum,selcrit,selexam,kernel_cache,key,
            chosen,iteration);
      }
    }
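
    /* Annotation (added for exposition, not in the original source):
       the working set is selected by a steepest-feasible-descent
       heuristic, picking the variables whose gradient components most
       violate the KT conditions. Every 101st iteration a partly random
       selection is substituted instead, which breaks the cycling that
       numerical inaccuracies in the core QP solver can otherwise
       cause. */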

    if(verbosity>=2) {
      printf(" %ld vectors chosen\n",choosenum); fflush(stdout);
    }

    if(verbosity>=2) t1=get_runtime();

    if(kernel_cache)
      cache_multiple_kernel_rows(kernel_cache,docs,working2dnum,
                                 choosenum,kernel_parm);

    if(verbosity>=2) t2=get_runtime();
    if(retrain != 2) {
      optimize_svm(docs,label,unlabeled,inconsistent,0.0,chosen,active2dnum,
                   model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm,
                   aicache,kernel_parm,&qp,&epsilon_crit_org);
    }

    if(verbosity>=2) t3=get_runtime();
    update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc,
                            totwords,kernel_parm,kernel_cache,lin,aicache,
                            weights);

    if(verbosity>=2) t4=get_runtime();
    supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c,
                                  learn_parm,working2dnum,active2dnum,model);

    if(verbosity>=2) t5=get_runtime();

    /* The following computation of the objective function works only */
    /* relative to the active variables */
    if(verbosity>=3) {
      criterion=compute_objective_function(a,lin,c,learn_parm->eps,label,
                                           active2dnum);
      printf("Objective function (over active variables): %.16f\n",criterion);
      fflush(stdout);
    }

    for(jj=0;(i=working2dnum[jj])>=0;jj++) {
      a_old[i]=a[i];
    }

    if(retrain == 2) {  /* reset inconsistent unlabeled examples */
      for(i=0;(i<totdoc);i++) {
        if(inconsistent[i] && unlabeled[i]) {
          inconsistent[i]=0;
          label[i]=0;
        }
      }
    }

    retrain=check_optimality(model,label,unlabeled,a,lin,c,totdoc,learn_parm,
                             maxdiff,epsilon_crit_org,&misclassified,
                             inconsistent,active2dnum,last_suboptimal_at,
                             iteration,kernel_parm);

    if(verbosity>=2) {
      t6=get_runtime();
      timing_profile->time_select+=t1-t0;
      timing_profile->time_kernel+=t2-t1;
      timing_profile->time_opti+=t3-t2;
      timing_profile->time_update+=t4-t3;
      timing_profile->time_model+=t5-t4;
      timing_profile->time_check+=t6-t5;
    }

    /* checking whether optimizer got stuck */
    if((*maxdiff) < bestmaxdiff) {
      bestmaxdiff=(*maxdiff);
      bestmaxdiffiter=iteration;
    }
    if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) {
      /* long time no progress? */
      terminate=1;
      retrain=0;
      if(verbosity>=1)
        printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n");
    }

    noshrink=0;
    if((!retrain) && (inactivenum>0)
       && ((!learn_parm->skip_final_opt_check)
           || (kernel_parm->kernel_type == LINEAR))) {
      if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR))
         || (verbosity>=2)) {
        if(verbosity==1) {
          printf("\n");
        }
        printf(" Checking optimality of inactive variables...");
        fflush(stdout);
      }
      t1=get_runtime();
      reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc,
                                   totwords,iteration,learn_parm,inconsistent,
                                   docs,kernel_parm,kernel_cache,model,aicache,
                                   weights,maxdiff);
      /* Update to new active variables. */
      activenum=compute_index(shrink_state->active,totdoc,active2dnum);
      inactivenum=totdoc-activenum;
      /* reset watchdog */
      bestmaxdiff=(*maxdiff);
      bestmaxdiffiter=iteration;
      /* termination criterion */
      noshrink=1;
      retrain=0;
      if((*maxdiff) > learn_parm->epsilon_crit)
        retrain=1;
      timing_profile->time_shrink+=get_runtime()-t1;
      if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR))
         || (verbosity>=2)) {
        printf("done.\n"); fflush(stdout);
        printf(" Number of inactive variables = %ld\n",inactivenum);
      }
    }

    if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff)))
      learn_parm->epsilon_crit=(*maxdiff);
    if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) {
      learn_parm->epsilon_crit/=2.0;
      retrain=1;
      noshrink=1;
    }
    if(learn_parm->epsilon_crit<epsilon_crit_org)
      learn_parm->epsilon_crit=epsilon_crit_org;

    if(verbosity>=2) {
      printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n",
             supvecnum,model->at_upper_bound,(*maxdiff));
      fflush(stdout);
    }
    if(verbosity>=3) {
      printf("\n");
    }

    if((!retrain) && (transduction)) {
      for(i=0;(i<totdoc);i++) {
        shrink_state->active[i]=1;
      }
      activenum=compute_index(shrink_state->active,totdoc,active2dnum);
      inactivenum=0;
      if(verbosity==1) printf("done\n");
      retrain=incorporate_unlabeled_examples(model,label,inconsistent,
                                             unlabeled,a,lin,totdoc,
                                             selcrit,selexam,key,
                                             transductcycle,kernel_parm,
                                             learn_parm);
      epsilon_crit_org=learn_parm->epsilon_crit;
      if(kernel_parm->kernel_type == LINEAR)
        learn_parm->epsilon_crit=1;
      transductcycle++;
      /* reset watchdog */
      bestmaxdiff=(*maxdiff);
      bestmaxdiffiter=iteration;
    }
    else if(((iteration % 10) == 0) && (!noshrink)) {
      activenum=shrink_problem(docs,learn_parm,shrink_state,kernel_parm,
                               active2dnum,last_suboptimal_at,iteration,totdoc,
                               maxl((long)(activenum/10),
                                    maxl((long)(totdoc/500),100)),
                               a,inconsistent);
      inactivenum=totdoc-activenum;
      if((kernel_cache)
         && (supvecnum>kernel_cache->max_elems)
         && ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) {
        kernel_cache_shrink(kernel_cache,totdoc,
                            minl((kernel_cache->activenum-activenum),
                                 (kernel_cache->activenum-supvecnum)),
                            shrink_state->active);
      }
    }

    if((!retrain) && learn_parm->remove_inconsistent) {
1560 if(verbosity>=1) { | |
1561 printf(" Moving training errors to inconsistent examples..."); | |
1562 fflush(stdout); | |
1563 } | |
1564 if(learn_parm->remove_inconsistent == 1) { | |
1565 retrain=identify_inconsistent(a,label,unlabeled,totdoc,learn_parm, | |
1566 &inconsistentnum,inconsistent); | |
1567 } | |
1568 else if(learn_parm->remove_inconsistent == 2) { | |
1569 retrain=identify_misclassified(lin,label,unlabeled,totdoc, | |
1570 model,&inconsistentnum,inconsistent); | |
1571 } | |
1572 else if(learn_parm->remove_inconsistent == 3) { | |
1573 retrain=identify_one_misclassified(lin,label,unlabeled,totdoc, | |
1574 model,&inconsistentnum,inconsistent); | |
1575 } | |
1576 if(retrain) { | |
1577 if(kernel_parm->kernel_type == LINEAR) { /* reinit shrinking */ | |
1578 learn_parm->epsilon_crit=2.0; | |
1579 } | |
1580 } | |
1581 if(verbosity>=1) { | |
1582 printf("done.\n"); | |
1583 if(retrain) { | |
1584 printf(" Now %ld inconsistent examples.\n",inconsistentnum); | |
1585 } | |
1586 } | |
1587 } | |
1588 } /* end of loop */ | |
1589 | |
1590 free(chosen); | |
1591 free(last_suboptimal_at); | |
1592 free(key); | |
1593 free(selcrit); | |
1594 free(selexam); | |
1595 free(a_old); | |
1596 free(aicache); | |
1597 free(working2dnum); | |
1598 free(active2dnum); | |
1599 free(qp.opt_ce); | |
1600 free(qp.opt_ce0); | |
1601 free(qp.opt_g); | |
1602 free(qp.opt_g0); | |
1603 free(qp.opt_xinit); | |
1604 free(qp.opt_low); | |
1605 free(qp.opt_up); | |
1606 free(weights); | |
1607 | |
1608 learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */ | |
1609 model->maxdiff=(*maxdiff); | |
1610 | |
1611 return(iteration); | |
1612 } | |
1613 | |
1614 long optimize_to_convergence_sharedslack(DOC **docs, long int *label, | |
1615 long int totdoc, | |
1616 long int totwords, LEARN_PARM *learn_parm, | |
1617 KERNEL_PARM *kernel_parm, | |
1618 KERNEL_CACHE *kernel_cache, | |
1619 SHRINK_STATE *shrink_state, MODEL *model, | |
1620 double *a, double *lin, double *c, | |
1621 TIMING *timing_profile, double *maxdiff) | |
1622 /* docs: Training vectors (x-part) */ | |
1623 /* label: Training labels/value (y-part, zero if test example for | |
1624 transduction) */ | |
1625 /* totdoc: Number of examples in docs/label */ | |
1626 /* totwords: Number of features (i.e. highest feature index) */ | |
1627 /* learn_parm: Learning parameters */ | |
1628 /* kernel_parm: Kernel parameters */ | |
1629 /* kernel_cache: Initialized/partly filled Cache, if using a kernel. | |
1630 NULL if linear. */ | |
1631 /* shrink_state: State of active variables */ | |
1632 /* model: Returns learning result */ | |
1633 /* a: alphas */ | |
1634 /* lin: linear component of gradient */ | |
1635 /* c: right hand side of inequalities (margin) */ | |
1636 /* maxdiff: returns maximum violation of KT-conditions */ | |
1637 { | |
1638 long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink,*unlabeled; | |
1639 long *inconsistent,choosenum,already_chosen=0,iteration; | |
1640 long misclassified,supvecnum=0,*active2dnum,inactivenum; | |
1641 long *working2dnum,*selexam,*ignore; | |
1642 long activenum,retrain,maxslackid,slackset,jointstep; | |
1643 double criterion,eq_target; | |
1644 double *a_old,*alphaslack; | |
1645 long t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */ | |
1646 double epsilon_crit_org,maxsharedviol; | |
1647 double bestmaxdiff; | |
1648 long bestmaxdiffiter,terminate; | |
1649 | |
1650 double *selcrit; /* buffer for sorting */ | |
1651 CFLOAT *aicache; /* buffer to keep one row of hessian */ | |
1652 double *weights; /* buffer for weight vector in linear case */ | |
1653 QP qp; /* buffer for one quadratic program */ | |
1654 double *slack; /* vector of slack variables for optimization with | |
1655 shared slacks */ | |
1656 | |
1657 epsilon_crit_org=learn_parm->epsilon_crit; /* save org */ | |
1658 if(kernel_parm->kernel_type == LINEAR) { | |
1659 learn_parm->epsilon_crit=2.0; | |
1660 kernel_cache=NULL; /* caching makes no sense for linear kernel */ | |
1661 } | |
1662 learn_parm->epsilon_shrink=2; | |
1663 (*maxdiff)=1; | |
1664 | |
1665 learn_parm->totwords=totwords; | |
1666 | |
1667 chosen = (long *)my_malloc(sizeof(long)*totdoc); | |
1668 unlabeled = (long *)my_malloc(sizeof(long)*totdoc); | |
1669 inconsistent = (long *)my_malloc(sizeof(long)*totdoc); | |
1670 ignore = (long *)my_malloc(sizeof(long)*totdoc); | |
1671 key = (long *)my_malloc(sizeof(long)*(totdoc+11)); | |
1672 selcrit = (double *)my_malloc(sizeof(double)*totdoc); | |
1673 selexam = (long *)my_malloc(sizeof(long)*totdoc); | |
1674 a_old = (double *)my_malloc(sizeof(double)*totdoc); | |
1675 aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc); | |
1676 working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); | |
1677 active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); | |
1678 qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); | |
1679 qp.opt_ce0 = (double *)my_malloc(sizeof(double)); | |
1680 qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize | |
1681 *learn_parm->svm_maxqpsize); | |
1682 qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); | |
1683 qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); | |
1684 qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); | |
1685 qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); | |
1686 weights=(double *)my_malloc(sizeof(double)*(totwords+1)); | |
1687 maxslackid=0; | |
1688 for(i=0;i<totdoc;i++) { /* determine size of slack array */ | |
1689 if(maxslackid<docs[i]->slackid) | |
1690 maxslackid=docs[i]->slackid; | |
1691 } | |
1692 slack=(double *)my_malloc(sizeof(double)*(maxslackid+1)); | |
1693 alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1)); | |
1694 last_suboptimal_at = (long *)my_malloc(sizeof(long)*(maxslackid+1)); | |
1695 for(i=0;i<=maxslackid;i++) { /* init shared slacks */ | |
1696 slack[i]=0; | |
1697 alphaslack[i]=0; | |
1698 last_suboptimal_at[i]=1; | |
1699 } | |
1700 | |
1701 choosenum=0; | |
1702 retrain=1; | |
1703 iteration=1; | |
1704 bestmaxdiffiter=1; | |
1705 bestmaxdiff=999999999; | |
1706 terminate=0; | |
1707 | |
1708 if(kernel_cache) { | |
1709 kernel_cache->time=iteration; /* for lru cache */ | |
1710 kernel_cache_reset_lru(kernel_cache); | |
1711 } | |
1712 | |
1713 for(i=0;i<totdoc;i++) { /* various inits */ | |
1714 chosen[i]=0; | |
1715 unlabeled[i]=0; | |
1716 inconsistent[i]=0; | |
1717 ignore[i]=0; | |
1718 a_old[i]=a[i]; | |
1719 } | |
1720 activenum=compute_index(shrink_state->active,totdoc,active2dnum); | |
1721 inactivenum=totdoc-activenum; | |
1722 clear_index(working2dnum); | |
1723 | |
1724 /* call to init slack and alphaslack */ | |
1725 compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm, | |
1726 slack,alphaslack); | |
1727 | |
1728 /* repeat this loop until we have convergence */ | |
1729 for(;retrain && (!terminate);iteration++) { | |
1730 | |
1731 if(kernel_cache) | |
1732 kernel_cache->time=iteration; /* for lru cache */ | |
1733 if(verbosity>=2) { | |
1734 printf( | |
1735 "Iteration %ld: ",iteration); fflush(stdout); | |
1736 } | |
1737 else if(verbosity==1) { | |
1738 printf("."); fflush(stdout); | |
1739 } | |
1740 | |
1741 if(verbosity>=2) t0=get_runtime(); | |
1742 if(verbosity>=3) { | |
1743 printf("\nSelecting working set... "); fflush(stdout); | |
1744 } | |
1745 | |
1746 if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize) | |
1747 learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize; | |
1748 | |
1749 /* select working set according to steepest gradient */ | |
1750 jointstep=0; | |
1751 eq_target=0; | |
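/* on every 101st iteration, (iteration % 101) == 0 and control falls | |
through to the randomized working-set selection in the else branch below */ | |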
1752 if(iteration % 101) { | |
1753 slackset=select_next_qp_slackset(docs,label,a,lin,slack,alphaslack,c, | |
1754 learn_parm,active2dnum,&maxsharedviol); | |
1755 if((iteration % 2) | |
1756 || (!slackset) || (maxsharedviol<learn_parm->epsilon_crit)){ | |
1757 /* do a step with examples from different slack sets */ | |
1758 if(verbosity >= 2) { | |
1759 printf("(i-step)"); fflush(stdout); | |
1760 } | |
1761 i=0; | |
1762 for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear old part of working set */ | |
1763 if((chosen[j]>=(learn_parm->svm_maxqpsize/ | |
1764 minl(learn_parm->svm_maxqpsize, | |
1765 learn_parm->svm_newvarsinqp)))) { | |
1766 chosen[j]=0; | |
1767 choosenum--; | |
1768 } | |
1769 else { | |
1770 chosen[j]++; | |
1771 working2dnum[i++]=j; | |
1772 } | |
1773 } | |
1774 working2dnum[i]=-1; | |
1775 | |
1776 already_chosen=0; | |
1777 if((minl(learn_parm->svm_newvarsinqp, | |
1778 learn_parm->svm_maxqpsize-choosenum)>=4) | |
1779 && (kernel_parm->kernel_type != LINEAR)) { | |
1780 /* select part of the working set from cache */ | |
1781 already_chosen=select_next_qp_subproblem_grad( | |
1782 label,unlabeled,a,lin,c,totdoc, | |
1783 (long)(minl(learn_parm->svm_maxqpsize-choosenum, | |
1784 learn_parm->svm_newvarsinqp) | |
1785 /2), | |
1786 learn_parm,inconsistent,active2dnum, | |
1787 working2dnum,selcrit,selexam,kernel_cache, | |
1788 (long)1,key,chosen); | |
1789 choosenum+=already_chosen; | |
1790 } | |
1791 choosenum+=select_next_qp_subproblem_grad( | |
1792 label,unlabeled,a,lin,c,totdoc, | |
1793 minl(learn_parm->svm_maxqpsize-choosenum, | |
1794 learn_parm->svm_newvarsinqp-already_chosen), | |
1795 learn_parm,inconsistent,active2dnum, | |
1796 working2dnum,selcrit,selexam,kernel_cache, | |
1797 (long)0,key,chosen); | |
1798 } | |
1799 else { /* do a step with all examples from same slack set */ | |
1800 if(verbosity >= 2) { | |
1801 printf("(j-step on %ld)",slackset); fflush(stdout); | |
1802 } | |
1803 jointstep=1; | |
1804 for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */ | |
1805 chosen[j]=0; | |
1806 } | |
1807 working2dnum[0]=-1; | |
1808 eq_target=alphaslack[slackset]; | |
1809 for(j=0;j<totdoc;j++) { /* mask all but slackset */ | |
1810 /* for(jj=0;(j=active2dnum[jj])>=0;jj++) { */ | |
1811 if(docs[j]->slackid != slackset) | |
1812 ignore[j]=1; | |
1813 else { | |
1814 ignore[j]=0; | |
1815 learn_parm->svm_cost[j]=learn_parm->svm_c; | |
1816 /* printf("Inslackset(%ld,%ld)",j,shrink_state->active[j]); */ | |
1817 } | |
1818 } | |
1819 learn_parm->biased_hyperplane=1; | |
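/* turning biased_hyperplane on activates the solver's equality | |
constraint; with eq_target set to the summed alpha of the slack set, | |
this is presumably intended to keep the joint alpha of the chosen set | |
fixed during the step */ | |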
1820 choosenum=select_next_qp_subproblem_grad( | |
1821 label,unlabeled,a,lin,c,totdoc, | |
1822 learn_parm->svm_maxqpsize, | |
1823 learn_parm,ignore,active2dnum, | |
1824 working2dnum,selcrit,selexam,kernel_cache, | |
1825 (long)0,key,chosen); | |
1826 learn_parm->biased_hyperplane=0; | |
1827 } | |
1828 } | |
1829 else { /* once in a while, select a somewhat random working set | |
1830 to break out of infinite loops caused by numerical | |
1831 inaccuracies in the core qp-solver */ | |
1832 choosenum+=select_next_qp_subproblem_rand( | |
1833 label,unlabeled,a,lin,c,totdoc, | |
1834 minl(learn_parm->svm_maxqpsize-choosenum, | |
1835 learn_parm->svm_newvarsinqp), | |
1836 learn_parm,inconsistent,active2dnum, | |
1837 working2dnum,selcrit,selexam,kernel_cache,key, | |
1838 chosen,iteration); | |
1839 } | |
1840 | |
1841 if(verbosity>=2) { | |
1842 printf(" %ld vectors chosen\n",choosenum); fflush(stdout); | |
1843 } | |
1844 | |
1845 if(verbosity>=2) t1=get_runtime(); | |
1846 | |
1847 if(kernel_cache) | |
1848 cache_multiple_kernel_rows(kernel_cache,docs,working2dnum, | |
1849 choosenum,kernel_parm); | |
1850 | |
1851 if(verbosity>=2) t2=get_runtime(); | |
1852 if(jointstep) learn_parm->biased_hyperplane=1; | |
1853 optimize_svm(docs,label,unlabeled,ignore,eq_target,chosen,active2dnum, | |
1854 model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm, | |
1855 aicache,kernel_parm,&qp,&epsilon_crit_org); | |
1856 learn_parm->biased_hyperplane=0; | |
1857 | |
1858 for(jj=0;(i=working2dnum[jj])>=0;jj++) /* recompute sums of alphas */ | |
1859 alphaslack[docs[i]->slackid]+=(a[i]-a_old[i]); | |
1860 for(jj=0;(i=working2dnum[jj])>=0;jj++) { /* reduce alpha to fulfill | |
1861 constraints */ | |
1862 if(alphaslack[docs[i]->slackid] > learn_parm->svm_c) { | |
1863 if(a[i] < (alphaslack[docs[i]->slackid]-learn_parm->svm_c)) { | |
1864 alphaslack[docs[i]->slackid]-=a[i]; | |
1865 a[i]=0; | |
1866 } | |
1867 else { | |
1868 a[i]-=(alphaslack[docs[i]->slackid]-learn_parm->svm_c); | |
1869 alphaslack[docs[i]->slackid]=learn_parm->svm_c; | |
1870 } | |
1871 } | |
1872 } | |
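/* the loop above projects the solution back onto the shared-slack | |
constraint sum_{i in set} a_i <= svm_c: the excess is taken back out | |
of the working-set variables, clipping them at zero where needed */ | |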
1873 for(jj=0;(i=active2dnum[jj])>=0;jj++) | |
1874 learn_parm->svm_cost[i]=a[i]+(learn_parm->svm_c | |
1875 -alphaslack[docs[i]->slackid]); | |
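/* the per-example upper bound is tightened to a_i + (svm_c - alphaslack), | |
i.e. each variable only gets its set's remaining headroom; the projection | |
after the next optimization step enforces the constraint exactly */ | |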
1876 | |
1877 if(verbosity>=2) t3=get_runtime(); | |
1878 update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc, | |
1879 totwords,kernel_parm,kernel_cache,lin,aicache, | |
1880 weights); | |
1881 compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm, | |
1882 slack,alphaslack); | |
1883 | |
1884 if(verbosity>=2) t4=get_runtime(); | |
1885 supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c, | |
1886 learn_parm,working2dnum,active2dnum,model); | |
1887 | |
1888 if(verbosity>=2) t5=get_runtime(); | |
1889 | |
1890 /* The following computation of the objective function works only */ | |
1891 /* relative to the active variables */ | |
1892 if(verbosity>=3) { | |
1893 criterion=compute_objective_function(a,lin,c,learn_parm->eps,label, | |
1894 active2dnum); | |
1895 printf("Objective function (over active variables): %.16f\n",criterion); | |
1896 fflush(stdout); | |
1897 } | |
1898 | |
1899 for(jj=0;(i=working2dnum[jj])>=0;jj++) { | |
1900 a_old[i]=a[i]; | |
1901 } | |
1902 | |
1903 retrain=check_optimality_sharedslack(docs,model,label,a,lin,c, | |
1904 slack,alphaslack,totdoc,learn_parm, | |
1905 maxdiff,epsilon_crit_org,&misclassified, | |
1906 active2dnum,last_suboptimal_at, | |
1907 iteration,kernel_parm); | |
1908 | |
1909 if(verbosity>=2) { | |
1910 t6=get_runtime(); | |
1911 timing_profile->time_select+=t1-t0; | |
1912 timing_profile->time_kernel+=t2-t1; | |
1913 timing_profile->time_opti+=t3-t2; | |
1914 timing_profile->time_update+=t4-t3; | |
1915 timing_profile->time_model+=t5-t4; | |
1916 timing_profile->time_check+=t6-t5; | |
1917 } | |
1918 | |
1919 /* checking whether optimizer got stuck */ | |
1920 if((*maxdiff) < bestmaxdiff) { | |
1921 bestmaxdiff=(*maxdiff); | |
1922 bestmaxdiffiter=iteration; | |
1923 } | |
1924 if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) { | |
1925 /* long time no progress? */ | |
1926 terminate=1; | |
1927 retrain=0; | |
1928 if(verbosity>=1) | |
1929 printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n"); | |
1930 } | |
1931 | |
1932 noshrink=0; | |
1933 | |
1934 if((!retrain) && (inactivenum>0) | |
1935 && ((!learn_parm->skip_final_opt_check) | |
1936 || (kernel_parm->kernel_type == LINEAR))) { | |
1937 if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) | |
1938 || (verbosity>=2)) { | |
1939 if(verbosity==1) { | |
1940 printf("\n"); | |
1941 } | |
1942 printf(" Checking optimality of inactive variables..."); | |
1943 fflush(stdout); | |
1944 } | |
1945 t1=get_runtime(); | |
1946 reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc, | |
1947 totwords,iteration,learn_parm,inconsistent, | |
1948 docs,kernel_parm,kernel_cache,model,aicache, | |
1949 weights,maxdiff); | |
1950 /* Update to new active variables. */ | |
1951 activenum=compute_index(shrink_state->active,totdoc,active2dnum); | |
1952 inactivenum=totdoc-activenum; | |
1953 /* check optimality again, since the check inside reactivate_inactive_examples | |
1954 does not work for shared slacks */ | |
1955 retrain=check_optimality_sharedslack(docs,model,label,a,lin,c, | |
1956 slack,alphaslack,totdoc,learn_parm, | |
1957 maxdiff,epsilon_crit_org,&misclassified, | |
1958 active2dnum,last_suboptimal_at, | |
1959 iteration,kernel_parm); | |
1960 | |
1961 /* reset watchdog */ | |
1962 bestmaxdiff=(*maxdiff); | |
1963 bestmaxdiffiter=iteration; | |
1964 /* termination criterion */ | |
1965 noshrink=1; | |
1966 retrain=0; | |
1967 if((*maxdiff) > learn_parm->epsilon_crit) | |
1968 retrain=1; | |
1969 timing_profile->time_shrink+=get_runtime()-t1; | |
1970 if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) | |
1971 || (verbosity>=2)) { | |
1972 printf("done.\n"); fflush(stdout); | |
1973 printf(" Number of inactive variables = %ld\n",inactivenum); | |
1974 } | |
1975 } | |
1976 | |
1977 if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff))) | |
1978 learn_parm->epsilon_crit=(*maxdiff); | |
1979 if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) { | |
1980 learn_parm->epsilon_crit/=2.0; | |
1981 retrain=1; | |
1982 noshrink=1; | |
1983 } | |
1984 if(learn_parm->epsilon_crit<epsilon_crit_org) | |
1985 learn_parm->epsilon_crit=epsilon_crit_org; | |
1986 | |
1987 if(verbosity>=2) { | |
1988 printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n", | |
1989 supvecnum,model->at_upper_bound,(*maxdiff)); | |
1990 fflush(stdout); | |
1991 } | |
1992 if(verbosity>=3) { | |
1993 printf("\n"); | |
1994 } | |
1995 | |
1996 if(((iteration % 10) == 0) && (!noshrink)) { | |
1997 activenum=shrink_problem(docs,learn_parm,shrink_state, | |
1998 kernel_parm,active2dnum, | |
1999 last_suboptimal_at,iteration,totdoc, | |
2000 maxl((long)(activenum/10), | |
2001 maxl((long)(totdoc/500),100)), | |
2002 a,inconsistent); | |
2003 inactivenum=totdoc-activenum; | |
2004 if((kernel_cache) | |
2005 && (supvecnum>kernel_cache->max_elems) | |
2006 && ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) { | |
2007 kernel_cache_shrink(kernel_cache,totdoc, | |
2008 minl((kernel_cache->activenum-activenum), | |
2009 (kernel_cache->activenum-supvecnum)), | |
2010 shrink_state->active); | |
2011 } | |
2012 } | |
2013 | |
2014 } /* end of loop */ | |
2015 | |
2016 | |
2017 free(alphaslack); | |
2018 free(slack); | |
2019 free(chosen); | |
2020 free(unlabeled); | |
2021 free(inconsistent); | |
2022 free(ignore); | |
2023 free(last_suboptimal_at); | |
2024 free(key); | |
2025 free(selcrit); | |
2026 free(selexam); | |
2027 free(a_old); | |
2028 free(aicache); | |
2029 free(working2dnum); | |
2030 free(active2dnum); | |
2031 free(qp.opt_ce); | |
2032 free(qp.opt_ce0); | |
2033 free(qp.opt_g); | |
2034 free(qp.opt_g0); | |
2035 free(qp.opt_xinit); | |
2036 free(qp.opt_low); | |
2037 free(qp.opt_up); | |
2038 free(weights); | |
2039 | |
2040 learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */ | |
2041 model->maxdiff=(*maxdiff); | |
2042 | |
2043 return(iteration); | |
2044 } | |
2045 | |
2046 | |
2047 double compute_objective_function(double *a, double *lin, double *c, | |
2048 double eps, long int *label, | |
2049 long int *active2dnum) | |
2050 /* Return value of objective function. */ | |
2051 /* Works only relative to the active variables! */ | |
2052 { | |
2053 long i,ii; | |
2054 double criterion; | |
2055 /* calculate value of objective function */ | |
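/* in symbols: criterion = sum_i (eps - y_i*c[i])*a[i] + 0.5*a[i]*y_i*lin[i], | |
where lin[i] = sum_j y_j*a_j*K(x_i,x_j) is the linear component | |
maintained by update_linear_component() */ | |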
2056 criterion=0; | |
2057 for(ii=0;active2dnum[ii]>=0;ii++) { | |
2058 i=active2dnum[ii]; | |
2059 criterion=criterion+(eps-(double)label[i]*c[i])*a[i]+0.5*a[i]*label[i]*lin[i]; | |
2060 } | |
2061 return(criterion); | |
2062 } | |
2063 | |
2064 void clear_index(long int *index) | |
2065 /* initializes and empties index */ | |
2066 { | |
2067 index[0]=-1; | |
2068 } | |
2069 | |
2070 void add_to_index(long int *index, long int elem) | |
2071 /* appends elem to the end of index */ | |
2072 { | |
2073 register long i; | |
2074 for(i=0;index[i] != -1;i++); | |
2075 index[i]=elem; | |
2076 index[i+1]=-1; | |
2077 } | |
2078 | |
2079 long compute_index(long int *binfeature, long int range, long int *index) | |
2080 /* create an inverted index of binfeature */ | |
2081 { | |
2082 register long i,ii; | |
2083 | |
2084 ii=0; | |
2085 for(i=0;i<range;i++) { | |
2086 if(binfeature[i]) { | |
2087 index[ii]=i; | |
2088 ii++; | |
2089 } | |
2090 } | |
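/* terminate with several -1 sentinels, presumably so that callers | |
scanning a few entries past the end still find a terminator */ | |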
2091 for(i=0;i<4;i++) { | |
2092 index[ii+i]=-1; | |
2093 } | |
2094 return(ii); | |
2095 } | |
2096 | |
2097 | |
2098 void optimize_svm(DOC **docs, long int *label, long int *unlabeled, | |
2099 long int *exclude_from_eq_const, double eq_target, | |
2100 long int *chosen, long int *active2dnum, MODEL *model, | |
2101 long int totdoc, long int *working2dnum, long int varnum, | |
2102 double *a, double *lin, double *c, LEARN_PARM *learn_parm, | |
2103 CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp, | |
2104 double *epsilon_crit_target) | |
2105 /* Do optimization on the working set. */ | |
2106 { | |
2107 long i; | |
2108 double *a_v; | |
2109 | |
2110 compute_matrices_for_optimization(docs,label,unlabeled, | |
2111 exclude_from_eq_const,eq_target,chosen, | |
2112 active2dnum,working2dnum,model,a,lin,c, | |
2113 varnum,totdoc,learn_parm,aicache, | |
2114 kernel_parm,qp); | |
2115 | |
2116 if(verbosity>=3) { | |
2117 printf("Running optimizer..."); fflush(stdout); | |
2118 } | |
2119 /* call the qp-subsolver */ | |
2120 a_v=optimize_qp(qp,epsilon_crit_target, | |
2121 learn_parm->svm_maxqpsize, | |
2122 &(model->b), /* in case the optimizer gives us */ | |
2123 /* the threshold for free; otherwise */ | |
2124 /* b is calculated in calculate_svm_model. */ | |
2125 learn_parm); | |
2126 if(verbosity>=3) { | |
2127 printf("done\n"); | |
2128 } | |
2129 | |
2130 for(i=0;i<varnum;i++) { | |
2131 a[working2dnum[i]]=a_v[i]; | |
2132 /* | |
2133 if(a_v[i]<=(0+learn_parm->epsilon_a)) { | |
2134 a[working2dnum[i]]=0; | |
2135 } | |
2136 else if(a_v[i]>=(learn_parm->svm_cost[working2dnum[i]]-learn_parm->epsilon_a)) { | |
2137 a[working2dnum[i]]=learn_parm->svm_cost[working2dnum[i]]; | |
2138 } | |
2139 */ | |
2140 } | |
2141 } | |
2142 | |
2143 void compute_matrices_for_optimization(DOC **docs, long int *label, | |
2144 long int *unlabeled, long *exclude_from_eq_const, double eq_target, | |
2145 long int *chosen, long int *active2dnum, | |
2146 long int *key, MODEL *model, double *a, double *lin, double *c, | |
2147 long int varnum, long int totdoc, LEARN_PARM *learn_parm, | |
2148 CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp) | |
2149 { | |
2150 register long ki,kj,i,j; | |
2151 register double kernel_temp; | |
2152 | |
2153 if(verbosity>=3) { | |
2154 fprintf(stdout,"Computing qp-matrices (type %ld kernel [degree %ld, rbf_gamma %f, coef_lin %f, coef_const %f])...",kernel_parm->kernel_type,kernel_parm->poly_degree,kernel_parm->rbf_gamma,kernel_parm->coef_lin,kernel_parm->coef_const); | |
2155 fflush(stdout); | |
2156 } | |
2157 | |
2158 qp->opt_n=varnum; | |
2159 qp->opt_ce0[0]=-eq_target; /* compute the constant for equality constraint */ | |
2160 for(j=1;j<model->sv_num;j++) { /* start at 1 */ | |
2161 if((!chosen[(model->supvec[j])->docnum]) | |
2162 && (!exclude_from_eq_const[(model->supvec[j])->docnum])) { | |
2163 qp->opt_ce0[0]+=model->alpha[j]; | |
2164 } | |
2165 } | |
2166 if(learn_parm->biased_hyperplane) | |
2167 qp->opt_m=1; | |
2168 else | |
2169 qp->opt_m=0; /* eq-constraint will be ignored */ | |
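/* the subproblem assembled here for optimize_qp() appears to be: | |
minimize g0'x + 0.5*x'Gx subject to ce'x + ce0 = 0 (used only when | |
opt_m == 1) and low <= x <= up, starting from xinit */ | |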
2170 | |
2171 /* init linear part of objective function */ | |
2172 for(i=0;i<varnum;i++) { | |
2173 qp->opt_g0[i]=lin[key[i]]; | |
2174 } | |
2175 | |
2176 for(i=0;i<varnum;i++) { | |
2177 ki=key[i]; | |
2178 | |
2179 /* Compute the matrix for equality constraints */ | |
2180 qp->opt_ce[i]=label[ki]; | |
2181 qp->opt_low[i]=0; | |
2182 qp->opt_up[i]=learn_parm->svm_cost[ki]; | |
2183 | |
2184 kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[ki]); | |
2185 /* compute linear part of objective function */ | |
2186 qp->opt_g0[i]-=(kernel_temp*a[ki]*(double)label[ki]); | |
2187 /* compute quadratic part of objective function */ | |
2188 qp->opt_g[varnum*i+i]=kernel_temp; | |
2189 for(j=i+1;j<varnum;j++) { | |
2190 kj=key[j]; | |
2191 kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[kj]); | |
2192 /* compute linear part of objective function */ | |
2193 qp->opt_g0[i]-=(kernel_temp*a[kj]*(double)label[kj]); | |
2194 qp->opt_g0[j]-=(kernel_temp*a[ki]*(double)label[ki]); | |
2195 /* compute quadratic part of objective function */ | |
2196 qp->opt_g[varnum*i+j]=(double)label[ki]*(double)label[kj]*kernel_temp; | |
2197 qp->opt_g[varnum*j+i]=(double)label[ki]*(double)label[kj]*kernel_temp; | |
2198 } | |
2199 | |
2200 if(verbosity>=3) { | |
2201 if(i % 20 == 0) { | |
2202 fprintf(stdout,"%ld..",i); fflush(stdout); | |
2203 } | |
2204 } | |
2205 } | |
2206 | |
2207 for(i=0;i<varnum;i++) { | |
2208 /* assure starting at feasible point */ | |
2209 qp->opt_xinit[i]=a[key[i]]; | |
2210 /* set linear part of objective function */ | |
2211 qp->opt_g0[i]=(learn_parm->eps-(double)label[key[i]]*c[key[i]])+qp->opt_g0[i]*(double)label[key[i]]; | |
2212 } | |
2213 | |
2214 if(verbosity>=3) { | |
2215 fprintf(stdout,"done\n"); | |
2216 } | |
2217 } | |
2218 | |
2219 long calculate_svm_model(DOC **docs, long int *label, long int *unlabeled, | |
2220 double *lin, double *a, double *a_old, double *c, | |
2221 LEARN_PARM *learn_parm, long int *working2dnum, | |
2222 long int *active2dnum, MODEL *model) | |
2223 /* Compute decision function based on current values */ | |
2224 /* of alpha. */ | |
2225 { | |
2226 long i,ii,pos,b_calculated=0,first_low,first_high; | |
2227 double ex_c,b_temp,b_low,b_high; | |
2228 | |
2229 if(verbosity>=3) { | |
2230 printf("Calculating model..."); fflush(stdout); | |
2231 } | |
2232 | |
2233 if(!learn_parm->biased_hyperplane) { | |
2234 model->b=0; | |
2235 b_calculated=1; | |
2236 } | |
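/* otherwise b is recovered below from any free support vector | |
(epsilon_a < a_i < svm_cost[i]-epsilon_a): there the KT conditions hold | |
with equality, giving b = y_i*eps - c_i + lin_i */ | |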
2237 | |
2238 for(ii=0;(i=working2dnum[ii])>=0;ii++) { | |
2239 if((a_old[i]>0) && (a[i]==0)) { /* remove from model */ | |
2240 pos=model->index[i]; | |
2241 model->index[i]=-1; | |
2242 (model->sv_num)--; | |
2243 model->supvec[pos]=model->supvec[model->sv_num]; | |
2244 model->alpha[pos]=model->alpha[model->sv_num]; | |
2245 model->index[(model->supvec[pos])->docnum]=pos; | |
2246 } | |
2247 else if((a_old[i]==0) && (a[i]>0)) { /* add to model */ | |
2248 model->supvec[model->sv_num]=docs[i]; | |
2249 model->alpha[model->sv_num]=a[i]*(double)label[i]; | |
2250 model->index[i]=model->sv_num; | |
2251 (model->sv_num)++; | |
2252 } | |
2253 else if(a_old[i]==a[i]) { /* nothing to do */ | |
2254 } | |
2255 else { /* just update alpha */ | |
2256 model->alpha[model->index[i]]=a[i]*(double)label[i]; | |
2257 } | |
2258 | |
2259 ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; | |
2260 if((a_old[i]>=ex_c) && (a[i]<ex_c)) { | |
2261 (model->at_upper_bound)--; | |
2262 } | |
2263 else if((a_old[i]<ex_c) && (a[i]>=ex_c)) { | |
2264 (model->at_upper_bound)++; | |
2265 } | |
2266 | |
2267 if((!b_calculated) | |
2268 && (a[i]>learn_parm->epsilon_a) && (a[i]<ex_c)) { /* calculate b */ | |
2269 model->b=((double)label[i]*learn_parm->eps-c[i]+lin[i]); | |
2270 /* model->b=(-(double)label[i]+lin[i]); */ | |
2271 b_calculated=1; | |
2272 } | |
2273 } | |
2274 | |
2275 /* Every alpha in the working set is at a bound, so b could not be | |
2276 calculated in the usual way above. The following handles this | |
2277 special case. */ | |
2278 if(learn_parm->biased_hyperplane | |
2279 && (!b_calculated) | |
2280 && (model->sv_num-1 == model->at_upper_bound)) { | |
2281 first_low=1; | |
2282 first_high=1; | |
2283 b_low=0; | |
2284 b_high=0; | |
2285 for(ii=0;(i=active2dnum[ii])>=0;ii++) { | |
2286 ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; | |
2287 if(a[i]<ex_c) { | |
2288 if(label[i]>0) { | |
2289 b_temp=-(learn_parm->eps-c[i]+lin[i]); | |
2290 if((b_temp>b_low) || (first_low)) { | |
2291 b_low=b_temp; | |
2292 first_low=0; | |
2293 } | |
2294 } | |
2295 else { | |
2296 b_temp=-(-learn_parm->eps-c[i]+lin[i]); | |
2297 if((b_temp<b_high) || (first_high)) { | |
2298 b_high=b_temp; | |
2299 first_high=0; | |
2300 } | |
2301 } | |
2302 } | |
2303 else { | |
2304 if(label[i]<0) { | |
2305 b_temp=-(-learn_parm->eps-c[i]+lin[i]); | |
2306 if((b_temp>b_low) || (first_low)) { | |
2307 b_low=b_temp; | |
2308 first_low=0; | |
2309 } | |
2310 } | |
2311 else { | |
2312 b_temp=-(learn_parm->eps-c[i]+lin[i]); | |
2313 if((b_temp<b_high) || (first_high)) { | |
2314 b_high=b_temp; | |
2315 first_high=0; | |
2316 } | |
2317 } | |
2318 } | |
2319 } | |
2320 if(first_high) { | |
2321 model->b=-b_low; | |
2322 } | |
2323 else if(first_low) { | |
2324 model->b=-b_high; | |
2325 } | |
2326 else { | |
2327 model->b=-(b_high+b_low)/2.0; /* select b as the middle of range */ | |
2328 /* printf("\nb_low=%f, b_high=%f,b=%f\n",b_low,b_high,model->b); */ | |
2329 } | |
2330 } | |
2331 | |
2332 if(verbosity>=3) { | |
2333 printf("done\n"); fflush(stdout); | |
2334 } | |
2335 | |
2336 return(model->sv_num-1); /* have to subtract one, since element 0 is empty*/ | |
2337 } | |
2338 | |
2339 long check_optimality(MODEL *model, long int *label, long int *unlabeled, | |
2340 double *a, double *lin, double *c, long int totdoc, | |
2341 LEARN_PARM *learn_parm, double *maxdiff, | |
2342 double epsilon_crit_org, long int *misclassified, | |
2343 long int *inconsistent, long int *active2dnum, | |
2344 long int *last_suboptimal_at, | |
2345 long int iteration, KERNEL_PARM *kernel_parm) | |
2346 /* Check KT-conditions */ | |
2347 { | |
2348 long i,ii,retrain; | |
2349 double dist,ex_c,target; | |
2350 | |
2351 if(kernel_parm->kernel_type == LINEAR) { /* be optimistic */ | |
2352 learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org; | |
2353 } | |
2354 else { /* be conservative */ | |
2355 learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3; | |
2356 } | |
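/* KT conditions checked below, with dist = y_i*(lin_i - b) and | |
target = -(eps - y_i*c_i): | |
a_i = 0 requires dist >= target | |
0 < a_i < C requires dist == target | |
a_i = C requires dist <= target | |
the largest violation found is returned in *maxdiff */ | |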
2357 retrain=0; | |
2358 (*maxdiff)=0; | |
2359 (*misclassified)=0; | |
2360 for(ii=0;(i=active2dnum[ii])>=0;ii++) { | |
2361 if((!inconsistent[i]) && label[i]) { | |
2362 dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from | |
2363 hyperplane*/ | |
2364 target=-(learn_parm->eps-(double)label[i]*c[i]); | |
2365 ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; | |
2366 if(dist <= 0) { | |
2367 (*misclassified)++; /* count is unreliable once variables have been deactivated by shrinking */ | |
2368 } | |
2369 if((a[i]>learn_parm->epsilon_a) && (dist > target)) { | |
2370 if((dist-target)>(*maxdiff)) /* largest violation */ | |
2371 (*maxdiff)=dist-target; | |
2372 } | |
2373 else if((a[i]<ex_c) && (dist < target)) { | |
2374 if((target-dist)>(*maxdiff)) /* largest violation */ | |
2375 (*maxdiff)=target-dist; | |
2376 } | |
2377 /* Count how long a variable was at lower/upper bound (and optimal).*/ | |
2378 /* Variables that were at the bound and optimal for a long */ | |
2379 /* time are unlikely to become support vectors. In case our */ | |
2380 /* cache is filled up, those variables are excluded to save */ | |
2381 /* kernel evaluations. (See chapter 'Shrinking').*/ | |
2382 if((a[i]>(learn_parm->epsilon_a)) | |
2383 && (a[i]<ex_c)) { | |
2384 last_suboptimal_at[i]=iteration; /* not at bound */ | |
2385 } | |
2386 else if((a[i]<=(learn_parm->epsilon_a)) | |
2387 && (dist < (target+learn_parm->epsilon_shrink))) { | |
2388 last_suboptimal_at[i]=iteration; /* not likely optimal */ | |
2389 } | |
2390 else if((a[i]>=ex_c) | |
2391 && (dist > (target-learn_parm->epsilon_shrink))) { | |
2392 last_suboptimal_at[i]=iteration; /* not likely optimal */ | |
2393 } | |
2394 } | |
2395 } | |
2396 /* termination criterion */ | |
2397 if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) { | |
2398 retrain=1; | |
2399 } | |
2400 return(retrain); | |
2401 } | |
2402 | |
2403 long check_optimality_sharedslack(DOC **docs, MODEL *model, long int *label, | |
2404 double *a, double *lin, double *c, double *slack, | |
2405 double *alphaslack, | |
2406 long int totdoc, | |
2407 LEARN_PARM *learn_parm, double *maxdiff, | |
2408 double epsilon_crit_org, long int *misclassified, | |
2409 long int *active2dnum, | |
2410 long int *last_suboptimal_at, | |
2411 long int iteration, KERNEL_PARM *kernel_parm) | |
2412 /* Check KT-conditions */ | |
2413 { | |
2414 long i,ii,retrain; | |
2415 double dist,ex_c=0,target; | |
2416 | |
2417 if(kernel_parm->kernel_type == LINEAR) { /* be optimistic */ | |
2418 learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org; | |
2419 } | |
2420 else { /* be conservative */ | |
2421 learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3; | |
2422 } | |
2423 | |
2424 retrain=0; | |
2425 (*maxdiff)=0; | |
2426 (*misclassified)=0; | |
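/* two kinds of violation are tracked below: a 'single' violation, where | |
an example with a_i > 0 exceeds its target margin even after adding the | |
shared slack, and a 'joint' violation, where a slack set has positive | |
slack although its summed alpha is still below C */ | |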
2427 for(ii=0;(i=active2dnum[ii])>=0;ii++) { | |
2428 /* 'distance' from hyperplane*/ | |
2429 dist=(lin[i]-model->b)*(double)label[i]+slack[docs[i]->slackid]; | |
2430 target=-(learn_parm->eps-(double)label[i]*c[i]); | |
2431 ex_c=learn_parm->svm_c-learn_parm->epsilon_a; | |
2432 if((a[i]>learn_parm->epsilon_a) && (dist > target)) { | |
2433 if((dist-target)>(*maxdiff)) { /* largest violation */ | |
2434 (*maxdiff)=dist-target; | |
2435 if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]); | |
2436 if(verbosity>=5) printf(" (single %f)\n",(*maxdiff)); | |
2437 } | |
2438 } | |
2439 if((alphaslack[docs[i]->slackid]<ex_c) && (slack[docs[i]->slackid]>0)) { | |
2440 if((slack[docs[i]->slackid])>(*maxdiff)) { /* largest violation */ | |
2441 (*maxdiff)=slack[docs[i]->slackid]; | |
2442 if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]); | |
2443 if(verbosity>=5) printf(" (joint %f)\n",(*maxdiff)); | |
2444 } | |
2445 } | |
2446 /* Count how long a variable was at lower/upper bound (and optimal).*/ | |
2447 /* Variables that were at the bound and optimal for a long */ | |
2448 /* time are unlikely to become support vectors. In case our */ | |
2449 /* cache is filled up, those variables are excluded to save */ | |
2450 /* kernel evaluations. (See chapter 'Shrinking').*/ | |
2451 if((a[i]>(learn_parm->epsilon_a)) | |
2452 && (a[i]<ex_c)) { | |
2453 last_suboptimal_at[docs[i]->slackid]=iteration; /* not at bound */ | |
2454 } | |
2455 else if((a[i]<=(learn_parm->epsilon_a)) | |
2456 && (dist < (target+learn_parm->epsilon_shrink))) { | |
2457 last_suboptimal_at[docs[i]->slackid]=iteration; /* not likely optimal */ | |
2458 } | |
2459 else if((a[i]>=ex_c) | |
2460 && (slack[docs[i]->slackid] < learn_parm->epsilon_shrink)) { | |
2461 last_suboptimal_at[docs[i]->slackid]=iteration; /* not likely optimal */ | |
2462 } | |
2463 } | |
2464 /* termination criterion */ | |
2465 if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) { | |
2466 retrain=1; | |
2467 } | |
2468 return(retrain); | |
2469 } | |
2470 | |
2471 void compute_shared_slacks(DOC **docs, long int *label, | |
2472 double *a, double *lin, | |
2473 double *c, long int *active2dnum, | |
2474 LEARN_PARM *learn_parm, | |
2475 double *slack, double *alphaslack) | |
2476 /* compute the value of shared slacks and the joint alphas */ | |
2477 { | |
2478 long jj,i; | |
2479 double dist,target; | |
2480 | |
2481 for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* clear slack variables */ | |
2482 slack[docs[i]->slackid]=0.0; | |
2483 alphaslack[docs[i]->slackid]=0.0; | |
2484 } | |
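/* for each slack id s this computes slack[s] = max(0, max_{i in s} | |
(target_i - dist_i)) and alphaslack[s] = sum_{i in s} a[i], i.e. one | |
shared slack value and one joint alpha per constraint set */ | |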
2485 for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* recompute slack variables */ | |
2486 dist=(lin[i])*(double)label[i]; | |
2487 target=-(learn_parm->eps-(double)label[i]*c[i]); | |
2488 if((target-dist) > slack[docs[i]->slackid]) | |
2489 slack[docs[i]->slackid]=target-dist; | |
2490 alphaslack[docs[i]->slackid]+=a[i]; | |
2491 } | |
2492 } | |
2493 | |
2494 | |
2495 long identify_inconsistent(double *a, long int *label, | |
2496 long int *unlabeled, long int totdoc, | |
2497 LEARN_PARM *learn_parm, | |
2498 long int *inconsistentnum, long int *inconsistent) | |
2499 { | |
2500 long i,retrain; | |
2501 | |
2502 /* Throw out examples with multipliers at upper bound. This */ | |
2503 /* corresponds to the -i 1 option. */ | |
2504 /* ATTENTION: this is just a heuristic for finding a */ | |
2505 /* close-to-minimum number of examples to exclude in order */ | |
2506 /* to make the problem separable with the desired margin */ | |
2507 retrain=0; | |
2508 for(i=0;i<totdoc;i++) { | |
2509 if((!inconsistent[i]) && (!unlabeled[i]) | |
2510 && (a[i]>=(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) { | |
2511 (*inconsistentnum)++; | |
2512 inconsistent[i]=1; /* never choose again */ | |
2513 retrain=2; /* start over */ | |
2514 if(verbosity>=3) { | |
2515 printf("inconsistent(%ld)..",i); fflush(stdout); | |
2516 } | |
2517 } | |
2518 } | |
2519 return(retrain); | |
2520 } | |
2521 | |
2522 long identify_misclassified(double *lin, long int *label, | |
2523 long int *unlabeled, long int totdoc, | |
2524 MODEL *model, long int *inconsistentnum, | |
2525 long int *inconsistent) | |
2526 { | |
2527 long i,retrain; | |
2528 double dist; | |
2529 | |
2530 /* Throw out misclassified examples. This */ | |
2531 /* corresponds to the -i 2 option. */ | |
2532 /* ATTENTION: this is just a heuristic for finding a */ | |
2533 /* close-to-minimum number of examples to exclude in order */ | |
2534 /* to make the problem separable with the desired margin */ | |
2535 retrain=0; | |
2536 for(i=0;i<totdoc;i++) { | |
2537 dist=(lin[i]-model->b)*(double)label[i]; /* 'distance' from hyperplane*/ | |
2538 if((!inconsistent[i]) && (!unlabeled[i]) && (dist <= 0)) { | |
2539 (*inconsistentnum)++; | |
2540 inconsistent[i]=1; /* never choose again */ | |
2541 retrain=2; /* start over */ | |
2542 if(verbosity>=3) { | |
2543 printf("inconsistent(%ld)..",i); fflush(stdout); | |
2544 } | |
2545 } | |
2546 } | |
2547 return(retrain); | |
2548 } | |
2549 | |
2550 long identify_one_misclassified(double *lin, long int *label, | |
2551 long int *unlabeled, | |
2552 long int totdoc, MODEL *model, | |
2553 long int *inconsistentnum, | |
2554 long int *inconsistent) | |
2555 { | |
2556 long i,retrain,maxex=-1; | |
2557 double dist,maxdist=0; | |
2558 | |
2559 /* Throw out the 'most misclassified' example. This */ | |
2560 /* corresponds to the -i 3 option. */ | |
2561 /* ATTENTION: this is just a heuristic for finding a */ | |
2562 /* close-to-minimum number of examples to exclude in order */ | |
2563 /* to make the problem separable with the desired margin */ | |
2564 retrain=0; | |
2565 for(i=0;i<totdoc;i++) { | |
2566 if((!inconsistent[i]) && (!unlabeled[i])) { | |
2567 dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from hyperplane*/ | |
2568 if(dist<maxdist) { | |
2569 maxdist=dist; | |
2570 maxex=i; | |
2571 } | |
2572 } | |
2573 } | |
2574 if(maxex>=0) { | |
2575 (*inconsistentnum)++; | |
2576 inconsistent[maxex]=1; /* never choose again */ | |
2577 retrain=2; /* start over */ | |
2578 if(verbosity>=3) { | |
2579 printf("inconsistent(%ld)..",i); fflush(stdout); | |
2580 } | |
2581 } | |
2582 return(retrain); | |
2583 } | |
2584 | |
2585 void update_linear_component(DOC **docs, long int *label, | |
2586 long int *active2dnum, double *a, | |
2587 double *a_old, long int *working2dnum, | |
2588 long int totdoc, long int totwords, | |
2589 KERNEL_PARM *kernel_parm, | |
2590 KERNEL_CACHE *kernel_cache, | |
2591 double *lin, CFLOAT *aicache, double *weights) | |
2592 /* keep track of the linear component */ | |
2593 /* lin of the gradient etc. by updating */ | |
2594 /* based on the change of the variables */ | |
2595 /* in the current working set */ | |
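/* two strategies: for the linear kernel the change (a[i]-a_old[i])*y_i*x_i | |
is folded into an explicit weight vector and lin[j] is refreshed with one | |
dot product per active example; for all other kernels one cached kernel | |
row is fetched per changed variable */ | |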
2596 { | |
2597 register long i,ii,j,jj; | |
2598 register double tec; | |
2599 SVECTOR *f; | |
2600 | |
2601 if(kernel_parm->kernel_type==0) { /* special linear case */ | |
2602 clear_vector_n(weights,totwords); | |
2603 for(ii=0;(i=working2dnum[ii])>=0;ii++) { | |
2604 if(a[i] != a_old[i]) { | |
2605 for(f=docs[i]->fvec;f;f=f->next) | |
2606 add_vector_ns(weights,f, | |
2607 f->factor*((a[i]-a_old[i])*(double)label[i])); | |
2608 } | |
2609 } | |
2610 for(jj=0;(j=active2dnum[jj])>=0;jj++) { | |
2611 for(f=docs[j]->fvec;f;f=f->next) | |
2612 lin[j]+=f->factor*sprod_ns(weights,f); | |
2613 } | |
2614 } | |
2615 else { /* general case */ | |
2616 for(jj=0;(i=working2dnum[jj])>=0;jj++) { | |
2617 if(a[i] != a_old[i]) { | |
2618 get_kernel_row(kernel_cache,docs,i,totdoc,active2dnum,aicache, | |
2619 kernel_parm); | |
2620 for(ii=0;(j=active2dnum[ii])>=0;ii++) { | |
2621 tec=aicache[j]; | |
2622 lin[j]+=(((a[i]*tec)-(a_old[i]*tec))*(double)label[i]); | |
2623 } | |
2624 } | |
2625 } | |
2626 } | |
2627 } | |
2628 | |
2629 | |
2630 long incorporate_unlabeled_examples(MODEL *model, long int *label, | |
2631 long int *inconsistent, | |
2632 long int *unlabeled, | |
2633 double *a, double *lin, | |
2634 long int totdoc, double *selcrit, | |
2635 long int *select, long int *key, | |
2636 long int transductcycle, | |
2637 KERNEL_PARM *kernel_parm, | |
2638 LEARN_PARM *learn_parm) | |
2639 { | |
2640 long i,j,k,j1,j2,j3,j4,unsupaddnum1=0,unsupaddnum2=0; | |
2641 long pos,neg,upos,uneg,orgpos,orgneg,nolabel,newpos,newneg,allunlab; | |
2642 double dist,model_length,posratio,negratio; | |
2643 long check_every=2; | |
2644 double loss; | |
2645 static double switchsens=0.0,switchsensorg=0.0; | |
2646 double umin,umax,sumalpha; | |
2647 long imin=0,imax=0; | |
2648 static long switchnum=0; | |
2649 | |
2650 switchsens/=1.2; | |
2651 | |
2652 /* assumes that lin[] is up to date -> no inactive vars */ | |
2653 | |
2654 orgpos=0; | |
2655 orgneg=0; | |
2656 newpos=0; | |
2657 newneg=0; | |
2658 nolabel=0; | |
2659 allunlab=0; | |
2660 for(i=0;i<totdoc;i++) { | |
2661 if(!unlabeled[i]) { | |
2662 if(label[i] > 0) { | |
2663 orgpos++; | |
2664 } | |
2665 else { | |
2666 orgneg++; | |
2667 } | |
2668 } | |
2669 else { | |
2670 allunlab++; | |
2671 if(unlabeled[i]) { | |
2672 if(label[i] > 0) { | |
2673 newpos++; | |
2674 } | |
2675 else if(label[i] < 0) { | |
2676 newneg++; | |
2677 } | |
2678 } | |
2679 } | |
2680 if(label[i]==0) { | |
2681 nolabel++; | |
2682 } | |
2683 } | |
2684 | |
2685 if(learn_parm->transduction_posratio >= 0) { | |
2686 posratio=learn_parm->transduction_posratio; | |
2687 } | |
2688 else { /* use ratio of pos/neg in training data */ | |
2689 posratio=(double)orgpos/(double)(orgpos+orgneg); | |
2690 } | |
2691 negratio=1.0-posratio; | |
2692 | |
2693 learn_parm->svm_costratio=1.0; /* global */ | |
2694 if(posratio>0) { | |
2695 learn_parm->svm_costratio_unlab=negratio/posratio; | |
2696 } | |
2697 else { | |
2698 learn_parm->svm_costratio_unlab=1.0; | |
2699 } | |
2700 | |
2701 pos=0; | |
2702 neg=0; | |
2703 upos=0; | |
2704 uneg=0; | |
2705 for(i=0;i<totdoc;i++) { | |
2706 dist=(lin[i]-model->b); /* 'distance' from hyperplane*/ | |
2707 if(dist>0) { | |
2708 pos++; | |
2709 } | |
2710 else { | |
2711 neg++; | |
2712 } | |
2713 if(unlabeled[i]) { | |
2714 if(dist>0) { | |
2715 upos++; | |
2716 } | |
2717 else { | |
2718 uneg++; | |
2719 } | |
2720 } | |
2721 if((!unlabeled[i]) && (a[i]>(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) { | |
2722 /* printf("Ubounded %ld (class %ld, unlabeled %ld)\n",i,label[i],unlabeled[i]); */ | |
2723 } | |
2724 } | |
2725 if(verbosity>=2) { | |
2726 printf("POS=%ld, ORGPOS=%ld, ORGNEG=%ld\n",pos,orgpos,orgneg); | |
2727 printf("POS=%ld, NEWPOS=%ld, NEWNEG=%ld\n",pos,newpos,newneg); | |
2728 printf("pos ratio = %f (%f).\n",(double)(upos)/(double)(allunlab),posratio); | |
2729 fflush(stdout); | |
2730 } | |
2731 | |
2732 if(transductcycle == 0) { | |
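/* first transduction cycle: label the allunlab*posratio unlabeled | |
examples with the largest distance from the hyperplane as positive, | |
the remaining unlabeled ones as negative, then retrain */ | |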
2733 j1=0; | |
2734 j2=0; | |
2735 j4=0; | |
2736 for(i=0;i<totdoc;i++) { | |
2737 dist=(lin[i]-model->b); /* 'distance' from hyperplane*/ | |
2738 if((label[i]==0) && (unlabeled[i])) { | |
2739 selcrit[j4]=dist; | |
2740 key[j4]=i; | |
2741 j4++; | |
2742 } | |
2743 } | |
2744 unsupaddnum1=0; | |
2745 unsupaddnum2=0; | |
2746 select_top_n(selcrit,j4,select,(long)(allunlab*posratio+0.5)); | |
2747 for(k=0;(k<(long)(allunlab*posratio+0.5));k++) { | |
2748 i=key[select[k]]; | |
2749 label[i]=1; | |
2750 unsupaddnum1++; | |
2751 j1++; | |
2752 } | |
2753 for(i=0;i<totdoc;i++) { | |
2754 if((label[i]==0) && (unlabeled[i])) { | |
2755 label[i]=-1; | |
2756 j2++; | |
2757 unsupaddnum2++; | |
2758 } | |
2759 } | |
2760 for(i=0;i<totdoc;i++) { /* set upper bounds on vars */ | |
2761 if(unlabeled[i]) { | |
2762 if(label[i] == 1) { | |
2763 learn_parm->svm_cost[i]=learn_parm->svm_c* | |
2764 learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound; | |
2765 } | |
2766 else if(label[i] == -1) { | |
2767 learn_parm->svm_cost[i]=learn_parm->svm_c* | |
2768 learn_parm->svm_unlabbound; | |
2769 } | |
2770 } | |
2771 } | |
2772 if(verbosity>=1) { | |
2773 /* printf("costratio %f, costratio_unlab %f, unlabbound %f\n", | |
2774 learn_parm->svm_costratio,learn_parm->svm_costratio_unlab, | |
2775 learn_parm->svm_unlabbound); */ | |
2776 printf("Classifying unlabeled data as %ld POS / %ld NEG.\n", | |
2777 unsupaddnum1,unsupaddnum2); | |
2778 fflush(stdout); | |
2779 } | |
2780 if(verbosity >= 1) | |
2781 printf("Retraining."); | |
2782 if(verbosity >= 2) printf("\n"); | |
2783 return((long)3); | |
2784 } | |
2785 if((transductcycle % check_every) == 0) { | |
2786 if(verbosity >= 1) | |
2787 printf("Retraining."); | |
2788 if(verbosity >= 2) printf("\n"); | |
2789 j1=0; | |
2790 j2=0; | |
2791 unsupaddnum1=0; | |
2792 unsupaddnum2=0; | |
2793 for(i=0;i<totdoc;i++) { | |
2794 if((unlabeled[i] == 2)) { | |
2795 unlabeled[i]=1; | |
2796 label[i]=1; | |
2797 j1++; | |
2798 unsupaddnum1++; | |
2799 } | |
2800 else if((unlabeled[i] == 3)) { | |
2801 unlabeled[i]=1; | |
2802 label[i]=-1; | |
2803 j2++; | |
2804 unsupaddnum2++; | |
2805 } | |
2806 } | |
2807 for(i=0;i<totdoc;i++) { /* set upper bounds on vars */ | |
2808 if(unlabeled[i]) { | |
2809 if(label[i] == 1) { | |
2810 learn_parm->svm_cost[i]=learn_parm->svm_c* | |
2811 learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound; | |
2812 } | |
2813 else if(label[i] == -1) { | |
2814 learn_parm->svm_cost[i]=learn_parm->svm_c* | |
2815 learn_parm->svm_unlabbound; | |
2816 } | |
2817 } | |
2818 } | |
2819 | |
2820 if(verbosity>=2) { | |
2821 /* printf("costratio %f, costratio_unlab %f, unlabbound %f\n", | |
2822 learn_parm->svm_costratio,learn_parm->svm_costratio_unlab, | |
2823 learn_parm->svm_unlabbound); */ | |
2824 printf("%ld positive -> Added %ld POS / %ld NEG unlabeled examples.\n", | |
2825 upos,unsupaddnum1,unsupaddnum2); | |
2826 fflush(stdout); | |
2827 } | |
2828 | |
2829 if(learn_parm->svm_unlabbound == 1) { | |
2830 learn_parm->epsilon_crit=0.001; /* do the last run right */ | |
2831 } | |
2832 else { | |
2833 learn_parm->epsilon_crit=0.01; /* otherwise, no need to be so picky */ | |
2834 } | |
2835 | |
2836 return((long)3); | |
2837 } | |
2838 else if(((transductcycle % check_every) < check_every)) { | |
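/* note that (transductcycle % check_every) < check_every is always true, | |
so this branch behaves like a plain else */ | |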
2839 model_length=0; | |
2840 sumalpha=0; | |
2841 loss=0; | |
2842 for(i=0;i<totdoc;i++) { | |
2843 model_length+=a[i]*label[i]*lin[i]; | |
2844 sumalpha+=a[i]; | |
2845 dist=(lin[i]-model->b); /* 'distance' from hyperplane*/ | |
2846 if((label[i]*dist)<(1.0-learn_parm->epsilon_crit)) { | |
2847 loss+=(1.0-(label[i]*dist))*learn_parm->svm_cost[i]; | |
2848 } | |
2849 } | |
2850 model_length=sqrt(model_length); | |
2851 if(verbosity>=2) { | |
2852 printf("Model-length = %f (%f), loss = %f, objective = %f\n", | |
2853 model_length,sumalpha,loss,loss+0.5*model_length*model_length); | |
2854 fflush(stdout); | |
2855 } | |
2856 j1=0; | |
2857 j2=0; | |
2858 j3=0; | |
2859 j4=0; | |
2860 unsupaddnum1=0; | |
2861 unsupaddnum2=0; | |
2862 umin=99999; | |
2863 umax=-99999; | |
2864 j4=1; | |
2865 while(j4) { | |
2866 umin=99999; | |
2867 umax=-99999; | |
2868 for(i=0;(i<totdoc);i++) { | |
2869 dist=(lin[i]-model->b); | |
2870 if((label[i]>0) && (unlabeled[i]) && (!inconsistent[i]) | |
2871 && (dist<umin)) { | |
2872 umin=dist; | |
2873 imin=i; | |
2874 } | |
2875 if((label[i]<0) && (unlabeled[i]) && (!inconsistent[i]) | |
2876 && (dist>umax)) { | |
2877 umax=dist; | |
2878 imax=i; | |
2879 } | |
2880 } | |
2881 if((umin < (umax+switchsens-1E-4))) { | |
2882 j1++; | |
2883 j2++; | |
2884 unsupaddnum1++; | |
2885 unlabeled[imin]=3; | |
2886 inconsistent[imin]=1; | |
2887 unsupaddnum2++; | |
2888 unlabeled[imax]=2; | |
2889 inconsistent[imax]=1; | |
2890 } | |
2891 else | |
2892 j4=0; | |
2893 j4=0; | |
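/* the unconditional j4=0 above limits the while(j4) loop to a single | |
pass, so at most one pair of labels is switched per call */ | |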
2894 } | |
2895 for(j=0;(j<totdoc);j++) { | |
2896 if(unlabeled[j] && (!inconsistent[j])) { | |
2897 if(label[j]>0) { | |
2898 unlabeled[j]=2; | |
2899 } | |
2900 else if(label[j]<0) { | |
2901 unlabeled[j]=3; | |
2902 } | |
2903 /* inconsistent[j]=1; */ | |
2904 j3++; | |
2905 } | |
2906 } | |
2907 switchnum+=unsupaddnum1+unsupaddnum2; | |
2908 | |
2909 /* stop and print out current margin | |
2910 printf("switchnum %ld %ld\n",switchnum,kernel_parm->poly_degree); | |
2911 if(switchnum == 2*kernel_parm->poly_degree) { | |
2912 learn_parm->svm_unlabbound=1; | |
2913 } | |
2914 */ | |
2915 | |
2916 if((!unsupaddnum1) && (!unsupaddnum2)) { | |
2917 if((learn_parm->svm_unlabbound>=1) && ((newpos+newneg) == allunlab)) { | |
2918 for(j=0;(j<totdoc);j++) { | |
2919 inconsistent[j]=0; | |
2920 if(unlabeled[j]) unlabeled[j]=1; | |
2921 } | |
2922 write_prediction(learn_parm->predfile,model,lin,a,unlabeled,label, | |
2923 totdoc,learn_parm); | |
2924 if(verbosity>=1) | |
2925 printf("Number of switches: %ld\n",switchnum); | |
2926 return((long)0); | |
2927 } | |
2928 switchsens=switchsensorg; | |
2929 learn_parm->svm_unlabbound*=1.5; | |
2930 if(learn_parm->svm_unlabbound>1) { | |
2931 learn_parm->svm_unlabbound=1; | |
2932 } | |
2933 model->at_upper_bound=0; /* since upper bound increased */ | |
2934 if(verbosity>=1) | |
2935 printf("Increasing influence of unlabeled examples to %f%% .", | |
2936 learn_parm->svm_unlabbound*100.0); | |
2937 } | |
2938 else if(verbosity>=1) { | |
2939 printf("%ld positive -> Switching labels of %ld POS / %ld NEG unlabeled examples.", | |
2940 upos,unsupaddnum1,unsupaddnum2); | |
2941 fflush(stdout); | |
2942 } | |
2943 | |
2944 if(verbosity >= 2) printf("\n"); | |
2945 | |
2946 learn_parm->epsilon_crit=0.5; /* don't need to be so picky */ | |
2947 | |
2948 for(i=0;i<totdoc;i++) { /* set upper bounds on vars */ | |
2949 if(unlabeled[i]) { | |
2950 if(label[i] == 1) { | |
2951 learn_parm->svm_cost[i]=learn_parm->svm_c* | |
2952 learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound; | |
2953 } | |
2954 else if(label[i] == -1) { | |
2955 learn_parm->svm_cost[i]=learn_parm->svm_c* | |
2956 learn_parm->svm_unlabbound; | |
2957 } | |
2958 } | |
2959 } | |
2960 | |
2961 return((long)2); | |
2962 } | |
2963 | |
2964 return((long)0); | |
2965 } | |
2966 | |
2967 /*************************** Working set selection ***************************/ | |
2968 | |
2969 long select_next_qp_subproblem_grad(long int *label, | |
2970 long int *unlabeled, | |
2971 double *a, double *lin, | |
2972 double *c, long int totdoc, | |
2973 long int qp_size, | |
2974 LEARN_PARM *learn_parm, | |
2975 long int *inconsistent, | |
2976 long int *active2dnum, | |
2977 long int *working2dnum, | |
2978 double *selcrit, | |
2979 long int *select, | |
2980 KERNEL_CACHE *kernel_cache, | |
2981 long int cache_only, | |
2982 long int *key, long int *chosen) | |
2983 /* Use the feasible direction approach to select the next | |
2984 qp-subproblem (see chapter 'Selecting a good working set'). If | |
2985 'cache_only' is true, then the variables are selected only among | |
2986 those for which the kernel evaluations are cached. */ | |
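/* the score selcrit is, in effect, +/- y_j times the derivative of the | |
objective with respect to a_j; each of the two passes below picks up to | |
qp_size/2 feasible candidates with the largest score, the first pass for | |
direction s = -y_j, the second for s = +y_j */ | |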
2987 { | |
2988 long choosenum,i,j,k,activedoc,inum,valid; | |
2989 double s; | |
2990 | |
2991 for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */ | |
2992 choosenum=0; | |
2993 activedoc=0; | |
2994 for(i=0;(j=active2dnum[i])>=0;i++) { | |
2995 s=-label[j]; | |
2996 if(kernel_cache && cache_only) | |
2997 valid=(kernel_cache->index[j]>=0); | |
2998 else | |
2999 valid=1; | |
3000 if(valid | |
3001 && (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0))) | |
3002 && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) | |
3003 && (s>0))) | |
3004 && (!chosen[j]) | |
3005 && (label[j]) | |
3006 && (!inconsistent[j])) | |
3007 { | |
3008 selcrit[activedoc]=(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]); | |
3009 /* selcrit[activedoc]=(double)label[j]*(-1.0+(double)label[j]*lin[j]); */ | |
3010 key[activedoc]=j; | |
3011 activedoc++; | |
3012 } | |
3013 } | |
3014 select_top_n(selcrit,activedoc,select,(long)(qp_size/2)); | |
3015 for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) { | |
3016 /* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */ | |
3017 i=key[select[k]]; | |
3018 chosen[i]=1; | |
3019 working2dnum[inum+choosenum]=i; | |
3020 choosenum+=1; | |
3021 if(kernel_cache) | |
3022 kernel_cache_touch(kernel_cache,i); /* make sure it does not get | |
3023 kicked out of cache */ | |
3024 /* } */ | |
3025 } | |
3026 | |
3027 activedoc=0; | |
3028 for(i=0;(j=active2dnum[i])>=0;i++) { | |
3029 s=label[j]; | |
3030 if(kernel_cache && cache_only) | |
3031 valid=(kernel_cache->index[j]>=0); | |
3032 else | |
3033 valid=1; | |
3034 if(valid | |
3035 && (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0))) | |
3036 && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) | |
3037 && (s>0))) | |
3038 && (!chosen[j]) | |
3039 && (label[j]) | |
3040 && (!inconsistent[j])) | |
3041 { | |
3042 selcrit[activedoc]=-(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]); | |
3043 /* selcrit[activedoc]=-(double)(label[j]*(-1.0+(double)label[j]*lin[j])); */ | |
3044 key[activedoc]=j; | |
3045 activedoc++; | |
3046 } | |
3047 } | |
3048 select_top_n(selcrit,activedoc,select,(long)(qp_size/2)); | |
3049 for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) { | |
3050 /* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */ | |
3051 i=key[select[k]]; | |
3052 chosen[i]=1; | |
3053 working2dnum[inum+choosenum]=i; | |
3054 choosenum+=1; | |
3055 if(kernel_cache) | |
3056 kernel_cache_touch(kernel_cache,i); /* make sure it does not get | |
3057 kicked out of cache */ | |
3058 /* } */ | |
3059 } | |
3060 working2dnum[inum+choosenum]=-1; /* complete index */ | |
3061 return(choosenum); | |
3062 } | |
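
/* Added commentary (not in the original code): for classification,
   c[j]==0 and learn_parm->eps is set to -1 by svm_learn_classification,
   so the criterion above reduces to the commented-out form
   y_j*(-1.0+y_j*lin[j]) = lin[j]-y_j, which is -y_j times the gradient
   of the dual objective at variable j. The two passes thus pick the
   qp_size/2 steepest feasible directions of each sign. A hypothetical
   stand-alone restatement: */
#if 0 /* illustrative sketch, not compiled */
static double selcrit_classification(long y_j, double lin_j)
{
  /* lin_j = sum_i a_i*y_i*K(x_i,x_j); dW/da_j = 1 - y_j*lin_j */
  return lin_j-(double)y_j;   /* == -y_j*(1.0-y_j*lin_j) */
}
#endif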

long select_next_qp_subproblem_rand(long int *label,
                                    long int *unlabeled,
                                    double *a, double *lin,
                                    double *c, long int totdoc,
                                    long int qp_size,
                                    LEARN_PARM *learn_parm,
                                    long int *inconsistent,
                                    long int *active2dnum,
                                    long int *working2dnum,
                                    double *selcrit,
                                    long int *select,
                                    KERNEL_CACHE *kernel_cache,
                                    long int *key,
                                    long int *chosen,
                                    long int iteration)
     /* Use the feasible direction approach to select the next
        qp-subproblem (see section 'Selecting a good working set').
        Chooses a feasible direction at (pseudo) random to help jump
        over numerical problems. */
{
  long choosenum,i,j,k,activedoc,inum;
  double s;

  for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */
  choosenum=0;
  activedoc=0;
  for(i=0;(j=active2dnum[i])>=0;i++) {
    s=-label[j];
    if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
       && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
             && (s>0)))
       && (!inconsistent[j])
       && (label[j])
       && (!chosen[j])) {
      selcrit[activedoc]=(j+iteration) % totdoc;
      key[activedoc]=j;
      activedoc++;
    }
  }
  select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
  for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) {
    i=key[select[k]];
    chosen[i]=1;
    working2dnum[inum+choosenum]=i;
    choosenum+=1;
    kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
                                        /* out of cache */
  }

  activedoc=0;
  for(i=0;(j=active2dnum[i])>=0;i++) {
    s=label[j];
    if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
       && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
             && (s>0)))
       && (!inconsistent[j])
       && (label[j])
       && (!chosen[j])) {
      selcrit[activedoc]=(j+iteration) % totdoc;
      key[activedoc]=j;
      activedoc++;
    }
  }
  select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
  for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) {
    i=key[select[k]];
    chosen[i]=1;
    working2dnum[inum+choosenum]=i;
    choosenum+=1;
    kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
                                        /* out of cache */
  }
  working2dnum[inum+choosenum]=-1; /* complete index */
  return(choosenum);
}
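
/* Added commentary (not in the original code): the criterion
   (j+iteration)%totdoc gives every feasible variable a rank that
   rotates with the iteration count, so select_top_n returns a
   deterministic but constantly changing subset. This is the
   "jump over numerical problems" device mentioned above: it breaks
   cycles where the gradient-based selection keeps returning the same
   working set without making progress. */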

long select_next_qp_slackset(DOC **docs, long int *label,
                             double *a, double *lin,
                             double *slack, double *alphaslack,
                             double *c,
                             LEARN_PARM *learn_parm,
                             long int *active2dnum, double *maxviol)
     /* Returns the slackset with the largest internal violation. */
{
  long i,ii,maxdiffid;
  double dist,target,maxdiff,ex_c;

  maxdiff=0;
  maxdiffid=0;
  for(ii=0;(i=active2dnum[ii])>=0;ii++) {
    ex_c=learn_parm->svm_c-learn_parm->epsilon_a;
    if(alphaslack[docs[i]->slackid] >= ex_c) {
      dist=(lin[i])*(double)label[i]+slack[docs[i]->slackid]; /* distance */
      target=-(learn_parm->eps-(double)label[i]*c[i]); /* rhs of constraint */
      if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
        if((dist-target)>maxdiff) { /* largest violation */
          maxdiff=dist-target;
          maxdiffid=docs[i]->slackid;
        }
      }
    }
  }
  (*maxviol)=maxdiff;
  return(maxdiffid);
}


void select_top_n(double *selcrit, long int range, long int *select,
                  long int n)
     /* Puts the indices of the n largest elements of selcrit[0..range-1]
        into select[0..n-1], ordered by decreasing criterion value
        (partial insertion/selection sort). */
{
  register long i,j;

  for(i=0;(i<n) && (i<range);i++) { /* Initialize with the first n elements */
    for(j=i;j>=0;j--) {
      if((j>0) && (selcrit[select[j-1]]<selcrit[i])){
        select[j]=select[j-1];
      }
      else {
        select[j]=i;
        j=-1;
      }
    }
  }
  if(n>0) {
    for(i=n;i<range;i++) {
      if(selcrit[i]>selcrit[select[n-1]]) {
        for(j=n-1;j>=0;j--) {
          if((j>0) && (selcrit[select[j-1]]<selcrit[i])) {
            select[j]=select[j-1];
          }
          else {
            select[j]=i;
            j=-1;
          }
        }
      }
    }
  }
}
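
/* Minimal usage sketch (illustrative, not part of the library): */
#if 0 /* not compiled */
double crit[5]={0.2,0.9,0.1,0.7,0.4};
long top[2];
select_top_n(crit,(long)5,top,(long)2);
/* now top[0]==1 (crit 0.9) and top[1]==3 (crit 0.7) */
#endif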


/******************************** Shrinking *********************************/

void init_shrink_state(SHRINK_STATE *shrink_state, long int totdoc,
                       long int maxhistory)
{
  long i;

  shrink_state->deactnum=0;
  shrink_state->active = (long *)my_malloc(sizeof(long)*totdoc);
  shrink_state->inactive_since = (long *)my_malloc(sizeof(long)*totdoc);
  shrink_state->a_history = (double **)my_malloc(sizeof(double *)*maxhistory);
  shrink_state->maxhistory=maxhistory;
  shrink_state->last_lin = (double *)my_malloc(sizeof(double)*totdoc);
  shrink_state->last_a = (double *)my_malloc(sizeof(double)*totdoc);

  for(i=0;i<totdoc;i++) {
    shrink_state->active[i]=1;
    shrink_state->inactive_since[i]=0;
    shrink_state->last_a[i]=0;
    shrink_state->last_lin[i]=0;
  }
}

void shrink_state_cleanup(SHRINK_STATE *shrink_state)
{
  free(shrink_state->active);
  free(shrink_state->inactive_since);
  if(shrink_state->deactnum > 0)
    free(shrink_state->a_history[shrink_state->deactnum-1]);
  free(shrink_state->a_history);
  free(shrink_state->last_a);
  free(shrink_state->last_lin);
}
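
/* Usage sketch (illustrative, not part of the library): a SHRINK_STATE
   is allocated once per training run and freed afterwards. The history
   depth 10000 below is a hypothetical stand-in for the compile-time
   maximum used by svm_learn. */
#if 0 /* not compiled */
SHRINK_STATE shrink_state;
init_shrink_state(&shrink_state,totdoc,(long)10000);
/* ... optimization loop calling shrink_problem() and, near the end,
   reactivate_inactive_examples() ... */
shrink_state_cleanup(&shrink_state);
#endif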

long shrink_problem(DOC **docs,
                    LEARN_PARM *learn_parm,
                    SHRINK_STATE *shrink_state,
                    KERNEL_PARM *kernel_parm,
                    long int *active2dnum,
                    long int *last_suboptimal_at,
                    long int iteration,
                    long int totdoc,
                    long int minshrink,
                    double *a,
                    long int *inconsistent)
     /* Shrink some variables away. Do the shrinking only if at least
        minshrink variables can be removed. */
{
  long i,ii,change,activenum,lastiter;
  double *a_old;

  activenum=0;
  change=0;
  for(ii=0;active2dnum[ii]>=0;ii++) {
    i=active2dnum[ii];
    activenum++;
    if(learn_parm->sharedslack)
      lastiter=last_suboptimal_at[docs[i]->slackid];
    else
      lastiter=last_suboptimal_at[i];
    if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink)
       || (inconsistent[i])) {
      change++;
    }
  }
  if((change>=minshrink) /* shrink only if sufficiently many candidates */
     && (shrink_state->deactnum<shrink_state->maxhistory)) { /* and enough memory */
    /* Shrink problem by removing those variables which are */
    /* optimal at a bound for a minimum number of iterations */
    if(verbosity>=2) {
      printf(" Shrinking..."); fflush(stdout);
    }
    if(kernel_parm->kernel_type != LINEAR) { /* non-linear case: save alphas */
      a_old=(double *)my_malloc(sizeof(double)*totdoc);
      shrink_state->a_history[shrink_state->deactnum]=a_old;
      for(i=0;i<totdoc;i++) {
        a_old[i]=a[i];
      }
    }
    for(ii=0;active2dnum[ii]>=0;ii++) {
      i=active2dnum[ii];
      if(learn_parm->sharedslack)
        lastiter=last_suboptimal_at[docs[i]->slackid];
      else
        lastiter=last_suboptimal_at[i];
      if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink)
         || (inconsistent[i])) {
        shrink_state->active[i]=0;
        shrink_state->inactive_since[i]=shrink_state->deactnum;
      }
    }
    activenum=compute_index(shrink_state->active,totdoc,active2dnum);
    shrink_state->deactnum++;
    if(kernel_parm->kernel_type == LINEAR) {
      shrink_state->deactnum=0; /* no alpha history needed: in the linear
                                   case lin is recomputed from the weight
                                   vector on reactivation */
    }
    if(verbosity>=2) {
      printf("done.\n"); fflush(stdout);
      printf(" Number of inactive variables = %ld\n",totdoc-activenum);
    }
  }
  return(activenum);
}


void reactivate_inactive_examples(long int *label,
                                  long int *unlabeled,
                                  double *a,
                                  SHRINK_STATE *shrink_state,
                                  double *lin,
                                  double *c,
                                  long int totdoc,
                                  long int totwords,
                                  long int iteration,
                                  LEARN_PARM *learn_parm,
                                  long int *inconsistent,
                                  DOC **docs,
                                  KERNEL_PARM *kernel_parm,
                                  KERNEL_CACHE *kernel_cache,
                                  MODEL *model,
                                  CFLOAT *aicache,
                                  double *weights,
                                  double *maxdiff)
     /* Make all variables active again which had been removed by
        shrinking. */
     /* Computes lin for those variables from scratch. */
{
  register long i,j,ii,jj,t,*changed2dnum,*inactive2dnum;
  long *changed,*inactive;
  register double kernel_val,*a_old,dist;
  double ex_c,target;
  SVECTOR *f;

  if(kernel_parm->kernel_type == LINEAR) { /* special linear case */
    a_old=shrink_state->last_a;
    clear_vector_n(weights,totwords);
    for(i=0;i<totdoc;i++) {  /* fold alpha changes since the last
                                snapshot into the weight vector */
      if(a[i] != a_old[i]) {
        for(f=docs[i]->fvec;f;f=f->next)
          add_vector_ns(weights,f,
                        f->factor*((a[i]-a_old[i])*(double)label[i]));
        a_old[i]=a[i];
      }
    }
    for(i=0;i<totdoc;i++) {
      if(!shrink_state->active[i]) {
        for(f=docs[i]->fvec;f;f=f->next)
          lin[i]=shrink_state->last_lin[i]+f->factor*sprod_ns(weights,f);
      }
      shrink_state->last_lin[i]=lin[i];
    }
  }
  else {  /* non-linear case: replay the saved alpha history */
    changed=(long *)my_malloc(sizeof(long)*totdoc);
    changed2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11));
    inactive=(long *)my_malloc(sizeof(long)*totdoc);
    inactive2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11));
    for(t=shrink_state->deactnum-1;(t>=0) && shrink_state->a_history[t];t--) {
      if(verbosity>=2) {
        printf("%ld..",t); fflush(stdout);
      }
      a_old=shrink_state->a_history[t];
      for(i=0;i<totdoc;i++) {
        inactive[i]=((!shrink_state->active[i])
                     && (shrink_state->inactive_since[i] == t));
        changed[i]= (a[i] != a_old[i]);
      }
      compute_index(inactive,totdoc,inactive2dnum);
      compute_index(changed,totdoc,changed2dnum);

      for(ii=0;(i=changed2dnum[ii])>=0;ii++) {
        get_kernel_row(kernel_cache,docs,i,totdoc,inactive2dnum,aicache,
                       kernel_parm);
        for(jj=0;(j=inactive2dnum[jj])>=0;jj++) {
          kernel_val=aicache[j];
          lin[j]+=(((a[i]*kernel_val)-(a_old[i]*kernel_val))*(double)label[i]);
        }
      }
    }
    free(changed);
    free(changed2dnum);
    free(inactive);
    free(inactive2dnum);
  }
  (*maxdiff)=0;
  for(i=0;i<totdoc;i++) {
    shrink_state->inactive_since[i]=shrink_state->deactnum-1;
    if(!inconsistent[i]) {
      dist=(lin[i]-model->b)*(double)label[i];
      target=-(learn_parm->eps-(double)label[i]*c[i]);
      ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
      if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
        if((dist-target)>(*maxdiff))  /* largest violation */
          (*maxdiff)=dist-target;
      }
      else if((a[i]<ex_c) && (dist < target)) {
        if((target-dist)>(*maxdiff))  /* largest violation */
          (*maxdiff)=target-dist;
      }
      if((a[i]>(0+learn_parm->epsilon_a))
         && (a[i]<ex_c)) {
        shrink_state->active[i]=1;  /* not at bound */
      }
      else if((a[i]<=(0+learn_parm->epsilon_a))
              && (dist < (target+learn_parm->epsilon_shrink))) {
        shrink_state->active[i]=1;
      }
      else if((a[i]>=ex_c)
              && (dist > (target-learn_parm->epsilon_shrink))) {
        shrink_state->active[i]=1;
      }
      else if(learn_parm->sharedslack) { /* make all active when sharedslack */
        shrink_state->active[i]=1;
      }
    }
  }
  if(kernel_parm->kernel_type != LINEAR) { /* update history for non-linear */
    for(i=0;i<totdoc;i++) {
      (shrink_state->a_history[shrink_state->deactnum-1])[i]=a[i];
    }
    for(t=shrink_state->deactnum-2;(t>=0) && shrink_state->a_history[t];t--) {
      free(shrink_state->a_history[t]);
      shrink_state->a_history[t]=0;
    }
  }
}

/****************************** Cache handling *******************************/

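/* Added overview (not in the original code): the cache stores up to
   max_elems rows of the kernel matrix in one flat buffer.
   index[docnum] maps a document to its row slot (-1 if not cached),
   invindex is the reverse map, occu marks occupied slots, and lru
   holds last-touched timestamps consumed by kernel_cache_free_lru().
   Columns are restricted to the activenum unshrunk examples via the
   active2totdoc/totdoc2active maps. */
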
void get_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs,
                    long int docnum, long int totdoc,
                    long int *active2dnum, CFLOAT *buffer,
                    KERNEL_PARM *kernel_parm)
     /* Gets a row of the matrix of kernel values. This matrix has the
        same form as the Hessian, except that the elements are not
        multiplied by y_i * y_j * a_i * a_j. */
     /* Takes the values from the cache if available. */
{
  register long i,j,start;
  DOC *ex;

  ex=docs[docnum];

  if(kernel_cache->index[docnum] != -1) { /* row is cached? */
    kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
    start=kernel_cache->activenum*kernel_cache->index[docnum];
    for(i=0;(j=active2dnum[i])>=0;i++) {
      if(kernel_cache->totdoc2active[j] >= 0) { /* column is cached? */
        buffer[j]=kernel_cache->buffer[start+kernel_cache->totdoc2active[j]];
      }
      else {
        buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]);
      }
    }
  }
  else {
    for(i=0;(j=active2dnum[i])>=0;i++) {
      buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]);
    }
  }
}


void cache_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs,
                      long int m, KERNEL_PARM *kernel_parm)
     /* Fills the cache for row m. */
{
  register DOC *ex;
  register long j,k,l;
  register CFLOAT *cache;

  if(!kernel_cache_check(kernel_cache,m)) { /* not cached yet */
    cache = kernel_cache_clean_and_malloc(kernel_cache,m);
    if(cache) {
      l=kernel_cache->totdoc2active[m];
      ex=docs[m];
      for(j=0;j<kernel_cache->activenum;j++) { /* fill cache */
        k=kernel_cache->active2totdoc[j];
        if((kernel_cache->index[k] != -1) && (l != -1) && (k != m)) {
          cache[j]=kernel_cache->buffer[kernel_cache->activenum
                                        *kernel_cache->index[k]+l];
        }
        else {
          cache[j]=kernel(kernel_parm,ex,docs[k]);
        }
      }
    }
    else {
      perror("Error: Kernel cache full! => increase cache size");
    }
  }
}


void cache_multiple_kernel_rows(KERNEL_CACHE *kernel_cache, DOC **docs,
                                long int *key, long int varnum,
                                KERNEL_PARM *kernel_parm)
     /* Fills the cache for the rows in key. */
{
  register long i;

  for(i=0;i<varnum;i++) { /* fill up kernel cache */
    cache_kernel_row(kernel_cache,docs,key[i],kernel_parm);
  }
}


void kernel_cache_shrink(KERNEL_CACHE *kernel_cache, long int totdoc,
                         long int numshrink, long int *after)
     /* Remove numshrink columns in the cache which correspond to
        examples marked 0 in after. */
{
  register long i,j,jj,from=0,to=0,scount;
  long *keep;

  if(verbosity>=2) {
    printf(" Reorganizing cache..."); fflush(stdout);
  }

  keep=(long *)my_malloc(sizeof(long)*totdoc);
  for(j=0;j<totdoc;j++) {
    keep[j]=1;
  }
  scount=0;
  for(jj=0;(jj<kernel_cache->activenum) && (scount<numshrink);jj++) {
    j=kernel_cache->active2totdoc[jj];
    if(!after[j]) {
      scount++;
      keep[j]=0;
    }
  }

  for(i=0;i<kernel_cache->max_elems;i++) { /* compact the buffer rows, */
    for(jj=0;jj<kernel_cache->activenum;jj++) { /* dropping shrunken columns */
      j=kernel_cache->active2totdoc[jj];
      if(!keep[j]) {
        from++;
      }
      else {
        kernel_cache->buffer[to]=kernel_cache->buffer[from];
        to++;
        from++;
      }
    }
  }

  kernel_cache->activenum=0;
  for(j=0;j<totdoc;j++) {
    if((keep[j]) && (kernel_cache->totdoc2active[j] != -1)) {
      kernel_cache->active2totdoc[kernel_cache->activenum]=j;
      kernel_cache->totdoc2active[j]=kernel_cache->activenum;
      kernel_cache->activenum++;
    }
    else {
      kernel_cache->totdoc2active[j]=-1;
    }
  }

  kernel_cache->max_elems=(long)(kernel_cache->buffsize/kernel_cache->activenum);
  if(kernel_cache->max_elems>totdoc) {
    kernel_cache->max_elems=totdoc;
  }

  free(keep);

  if(verbosity>=2) {
    printf("done.\n"); fflush(stdout);
    printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems);
  }
}

KERNEL_CACHE *kernel_cache_init(long int totdoc, long int buffsize)
     /* buffsize is the cache size in megabytes */
{
  long i;
  KERNEL_CACHE *kernel_cache;

  kernel_cache=(KERNEL_CACHE *)my_malloc(sizeof(KERNEL_CACHE));
  kernel_cache->index = (long *)my_malloc(sizeof(long)*totdoc);
  kernel_cache->occu = (long *)my_malloc(sizeof(long)*totdoc);
  kernel_cache->lru = (long *)my_malloc(sizeof(long)*totdoc);
  kernel_cache->invindex = (long *)my_malloc(sizeof(long)*totdoc);
  kernel_cache->active2totdoc = (long *)my_malloc(sizeof(long)*totdoc);
  kernel_cache->totdoc2active = (long *)my_malloc(sizeof(long)*totdoc);
  kernel_cache->buffer = (CFLOAT *)my_malloc((size_t)(buffsize)*1024*1024);

  kernel_cache->buffsize=(long)(buffsize/sizeof(CFLOAT)*1024*1024);

  kernel_cache->max_elems=(long)(kernel_cache->buffsize/totdoc);
  if(kernel_cache->max_elems>totdoc) {
    kernel_cache->max_elems=totdoc;
  }

  if(verbosity>=2) {
    printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems);
    printf(" Kernel evals so far: %ld\n",kernel_cache_statistic);
  }

  kernel_cache->elems=0; /* initialize cache */
  for(i=0;i<totdoc;i++) {
    kernel_cache->index[i]=-1;
    kernel_cache->lru[i]=0;
  }
  for(i=0;i<totdoc;i++) {
    kernel_cache->occu[i]=0;
    kernel_cache->invindex[i]=-1;
  }

  kernel_cache->activenum=totdoc;
  for(i=0;i<totdoc;i++) {
    kernel_cache->active2totdoc[i]=i;
    kernel_cache->totdoc2active[i]=i;
  }

  kernel_cache->time=0;

  return(kernel_cache);
}
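
/* Usage sketch (illustrative, not part of the library): the second
   argument is the cache size in megabytes; 40 below is a hypothetical
   value. */
#if 0 /* not compiled */
KERNEL_CACHE *kernel_cache=kernel_cache_init(totdoc,40);
/* ... training with a non-linear kernel ... */
kernel_cache_cleanup(kernel_cache);
#endif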

void kernel_cache_reset_lru(KERNEL_CACHE *kernel_cache)
{
  long maxlru=0,k;

  for(k=0;k<kernel_cache->max_elems;k++) {
    if(maxlru < kernel_cache->lru[k])
      maxlru=kernel_cache->lru[k];
  }
  for(k=0;k<kernel_cache->max_elems;k++) {
    kernel_cache->lru[k]-=maxlru;
  }
}

void kernel_cache_cleanup(KERNEL_CACHE *kernel_cache)
{
  free(kernel_cache->index);
  free(kernel_cache->occu);
  free(kernel_cache->lru);
  free(kernel_cache->invindex);
  free(kernel_cache->active2totdoc);
  free(kernel_cache->totdoc2active);
  free(kernel_cache->buffer);
  free(kernel_cache);
}

long kernel_cache_malloc(KERNEL_CACHE *kernel_cache)
     /* Returns a free row slot, or -1 if the cache is full. */
{
  long i;

  if(kernel_cache_space_available(kernel_cache)) {
    for(i=0;i<kernel_cache->max_elems;i++) {
      if(!kernel_cache->occu[i]) {
        kernel_cache->occu[i]=1;
        kernel_cache->elems++;
        return(i);
      }
    }
  }
  return(-1);
}

void kernel_cache_free(KERNEL_CACHE *kernel_cache, long int i)
{
  kernel_cache->occu[i]=0;
  kernel_cache->elems--;
}

long kernel_cache_free_lru(KERNEL_CACHE *kernel_cache)
     /* Remove the least recently used cache element. */
{
  register long k,least_elem=-1,least_time;

  least_time=kernel_cache->time+1;
  for(k=0;k<kernel_cache->max_elems;k++) {
    if(kernel_cache->invindex[k] != -1) {
      if(kernel_cache->lru[k]<least_time) {
        least_time=kernel_cache->lru[k];
        least_elem=k;
      }
    }
  }
  if(least_elem != -1) {
    kernel_cache_free(kernel_cache,least_elem);
    kernel_cache->index[kernel_cache->invindex[least_elem]]=-1;
    kernel_cache->invindex[least_elem]=-1;
    return(1);
  }
  return(0);
}


CFLOAT *kernel_cache_clean_and_malloc(KERNEL_CACHE *kernel_cache,
                                      long int docnum)
     /* Get a free cache entry. In case the cache is full, the lru
        element is removed. */
{
  long result;

  if((result = kernel_cache_malloc(kernel_cache)) == -1) {
    if(kernel_cache_free_lru(kernel_cache)) {
      result = kernel_cache_malloc(kernel_cache);
    }
  }
  kernel_cache->index[docnum]=result;
  if(result == -1) {
    return(0);
  }
  kernel_cache->invindex[result]=docnum;
  kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
  return((CFLOAT *)((long)kernel_cache->buffer
                    +(kernel_cache->activenum*sizeof(CFLOAT)*
                      kernel_cache->index[docnum])));
}

long kernel_cache_touch(KERNEL_CACHE *kernel_cache, long int docnum)
     /* Update the lru time to avoid removal from cache. */
{
  if(kernel_cache && kernel_cache->index[docnum] != -1) {
    kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
    return(1);
  }
  return(0);
}

long kernel_cache_check(KERNEL_CACHE *kernel_cache, long int docnum)
     /* Is that row cached? */
{
  return(kernel_cache->index[docnum] != -1);
}

long kernel_cache_space_available(KERNEL_CACHE *kernel_cache)
     /* Is there room for one more row? */
{
  return(kernel_cache->elems < kernel_cache->max_elems);
}
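
/* Sketch of the row lifecycle (an illustrative restatement of how the
   routines above combine, e.g. in cache_kernel_row and the working set
   selection): */
#if 0 /* not compiled */
if(!kernel_cache_check(kernel_cache,m)) {                    /* row m cached? */
  CFLOAT *row=kernel_cache_clean_and_malloc(kernel_cache,m); /* may evict lru */
  if(row) { /* ... fill row with kernel values ... */ }
}
kernel_cache_touch(kernel_cache,m); /* refresh lru stamp to keep row m */
#endif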

/************************** Compute estimates ******************************/

void compute_xa_estimates(MODEL *model, long int *label,
                          long int *unlabeled, long int totdoc,
                          DOC **docs, double *lin, double *a,
                          KERNEL_PARM *kernel_parm,
                          LEARN_PARM *learn_parm, double *error,
                          double *recall, double *precision)
     /* Computes the xi-alpha estimates of error rate, recall, and
        precision. See T. Joachims, Estimating the Generalization
        Performance of an SVM Efficiently, ICML, 2000. */
{
  long i,looerror,looposerror,loonegerror;
  long totex,totposex;
  double xi,r_delta,r_delta_sq,sim=0;
  long *sv2dnum=NULL,*sv=NULL,svnum;

  r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
  r_delta_sq=r_delta*r_delta;

  looerror=0;
  looposerror=0;
  loonegerror=0;
  totex=0;
  totposex=0;
  svnum=0;

  if(learn_parm->xa_depth > 0) {
    sv = (long *)my_malloc(sizeof(long)*(totdoc+11));
    for(i=0;i<totdoc;i++)
      sv[i]=0;
    for(i=1;i<model->sv_num;i++)
      if(a[model->supvec[i]->docnum]
         < (learn_parm->svm_cost[model->supvec[i]->docnum]
            -learn_parm->epsilon_a)) {
        sv[model->supvec[i]->docnum]=1;
        svnum++;
      }
    sv2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
    clear_index(sv2dnum);
    compute_index(sv,totdoc,sv2dnum);
  }

  for(i=0;i<totdoc;i++) {
    if(unlabeled[i]) {
      /* ignore it */
    }
    else {
      xi=1.0-((lin[i]-model->b)*(double)label[i]);
      if(xi<0) xi=0;
      if(label[i]>0) {
        totposex++;
      }
      if((learn_parm->rho*a[i]*r_delta_sq+xi) >= 1.0) {
        if(learn_parm->xa_depth > 0) { /* makes assumptions */
          sim=distribute_alpha_t_greedily(sv2dnum,svnum,docs,a,i,label,
                                          kernel_parm,learn_parm,
                                          (double)((1.0-xi-a[i]*r_delta_sq)/(2.0*a[i])));
        }
        if((learn_parm->xa_depth == 0) ||
           ((a[i]*kernel(kernel_parm,docs[i],docs[i])+a[i]*2.0*sim+xi) >= 1.0)) {
          looerror++;
          if(label[i]>0) {
            looposerror++;
          }
          else {
            loonegerror++;
          }
        }
      }
      totex++;
    }
  }

  (*error)=((double)looerror/(double)totex)*100.0;
  (*recall)=(1.0-(double)looposerror/(double)totposex)*100.0;
  (*precision)=(((double)totposex-(double)looposerror)
                /((double)totposex-(double)looposerror+(double)loonegerror))*100.0;

  free(sv);
  free(sv2dnum);
}
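
/* Hypothetical helper (not in the original code) restating the counting
   rule used above: example i is predicted to produce a leave-one-out
   error iff rho*a_i*R_delta^2 + xi_i >= 1, where xi_i is its slack on
   the full sample. */
#if 0 /* not compiled */
static int xa_loo_error(double rho, double a_i, double r_delta_sq, double xi_i)
{
  return (rho*a_i*r_delta_sq+xi_i) >= 1.0;
}
#endif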


double distribute_alpha_t_greedily(long int *sv2dnum, long int svnum,
                                   DOC **docs, double *a,
                                   long int docnum,
                                   long int *label,
                                   KERNEL_PARM *kernel_parm,
                                   LEARN_PARM *learn_parm, double thresh)
     /* Experimental code improving plain XiAlpha estimates by
        computing a better bound using a greedy optimization strategy. */
{
  long best_depth=0;
  long i,j,k,d,skip,allskip;
  double best,best_val[101],val,init_val_sq,init_val_lin;
  long best_ex[101];
  CFLOAT *cache,*trow;

  cache=(CFLOAT *)my_malloc(sizeof(CFLOAT)*learn_parm->xa_depth*svnum);
  trow = (CFLOAT *)my_malloc(sizeof(CFLOAT)*svnum);

  for(k=0;k<svnum;k++) {
    trow[k]=kernel(kernel_parm,docs[docnum],docs[sv2dnum[k]]);
  }

  init_val_sq=0;
  init_val_lin=0;
  best=0;

  for(d=0;d<learn_parm->xa_depth;d++) {
    allskip=1;
    if(d>=1) {
      init_val_sq+=cache[best_ex[d-1]+svnum*(d-1)];
      for(k=0;k<d-1;k++) {
        init_val_sq+=2.0*cache[best_ex[k]+svnum*(d-1)];
      }
      init_val_lin+=trow[best_ex[d-1]];
    }
    for(i=0;i<svnum;i++) {
      skip=0;
      if(sv2dnum[i] == docnum) skip=1;
      for(j=0;j<d;j++) {
        if(i == best_ex[j]) skip=1;
      }

      if(!skip) {
        val=init_val_sq;
        if(kernel_parm->kernel_type == LINEAR)
          val+=docs[sv2dnum[i]]->fvec->twonorm_sq;
        else
          val+=kernel(kernel_parm,docs[sv2dnum[i]],docs[sv2dnum[i]]);
        for(j=0;j<d;j++) {
          val+=2.0*cache[i+j*svnum];
        }
        val*=(1.0/(2.0*(d+1.0)*(d+1.0)));
        val-=((init_val_lin+trow[i])/(d+1.0));

        if(allskip || (val < best_val[d])) {
          best_val[d]=val;
          best_ex[d]=i;
        }
        allskip=0;
        if(val < thresh) {
          i=svnum;
          /* printf("EARLY"); */
        }
      }
    }
    if(!allskip) {
      for(k=0;k<svnum;k++) {
        cache[d*svnum+k]=kernel(kernel_parm,
                                docs[sv2dnum[best_ex[d]]],
                                docs[sv2dnum[k]]);
      }
    }
    if((!allskip) && ((best_val[d] < best) || (d == 0))) {
      best=best_val[d];
      best_depth=d;
    }
    if(allskip || (best < thresh)) {
      d=learn_parm->xa_depth;
    }
  }

  free(cache);
  free(trow);

  /* printf("Distribute[%ld](%ld)=%f, ",docnum,best_depth,best); */
  return(best);
}


void estimate_transduction_quality(MODEL *model, long int *label,
                                   long int *unlabeled,
                                   long int totdoc, DOC **docs, double *lin)
     /* Loo-bound based on the observation that loo-errors must have an
        equal distribution in both training and test examples, given
        that the test examples are classified correctly. Compare the
        chapter "Constraints on the Transductive Hyperplane" in my
        Dissertation. */
{
  long i,j,l=0,ulab=0,lab=0,labpos=0,labneg=0,ulabpos=0,ulabneg=0,totulab=0;
  double totlab=0,totlabpos=0,totlabneg=0,labsum=0,ulabsum=0;
  double r_delta,r_delta_sq,xi,xisum=0,asum=0;

  r_delta=estimate_r_delta(docs,totdoc,&(model->kernel_parm));
  r_delta_sq=r_delta*r_delta;

  for(j=0;j<totdoc;j++) {
    if(unlabeled[j]) {
      totulab++;
    }
    else {
      totlab++;
      if(label[j] > 0)
        totlabpos++;
      else
        totlabneg++;
    }
  }
  for(j=1;j<model->sv_num;j++) {
    i=model->supvec[j]->docnum;
    xi=1.0-((lin[i]-model->b)*(double)label[i]);
    if(xi<0) xi=0;

    xisum+=xi;
    asum+=fabs(model->alpha[j]);
    if(unlabeled[i]) {
      ulabsum+=(fabs(model->alpha[j])*r_delta_sq+xi);
    }
    else {
      labsum+=(fabs(model->alpha[j])*r_delta_sq+xi);
    }
    if((fabs(model->alpha[j])*r_delta_sq+xi) >= 1) {
      l++;
      if(unlabeled[model->supvec[j]->docnum]) {
        ulab++;
        if(model->alpha[j] > 0)
          ulabpos++;
        else
          ulabneg++;
      }
      else {
        lab++;
        if(model->alpha[j] > 0)
          labpos++;
        else
          labneg++;
      }
    }
  }
  printf("xacrit>=1: labeledpos=%.5f labeledneg=%.5f default=%.5f\n",
         (double)labpos/(double)totlab*100.0,
         (double)labneg/(double)totlab*100.0,
         (double)totlabpos/(double)(totlab)*100.0);
  printf("xacrit>=1: unlabelpos=%.5f unlabelneg=%.5f\n",
         (double)ulabpos/(double)totulab*100.0,
         (double)ulabneg/(double)totulab*100.0);
  printf("xacrit>=1: labeled=%.5f unlabeled=%.5f all=%.5f\n",
         (double)lab/(double)totlab*100.0,
         (double)ulab/(double)totulab*100.0,
         (double)l/(double)(totdoc)*100.0);
  printf("xacritsum: labeled=%.5f unlabeled=%.5f all=%.5f\n",
         (double)labsum/(double)totlab*100.0,
         (double)ulabsum/(double)totulab*100.0,
         (double)(labsum+ulabsum)/(double)(totdoc)*100.0);
  printf("r_delta_sq=%.5f xisum=%.5f asum=%.5f\n",r_delta_sq,xisum,asum);
}

double estimate_margin_vcdim(MODEL *model, double w, double R,
                             KERNEL_PARM *kernel_parm)
     /* w: length of model vector in feature space (computed if < 0) */
     /* R: radius of ball containing the data (computed if < 0) */
{
  double h;

  /* follows chapter 5.6.4 in [Vapnik/95] */

  if(w<0) {
    w=model_length_s(model,kernel_parm);
  }
  if(R<0) {
    R=estimate_sphere(model,kernel_parm);
  }
  h = w*w * R*R +1;
  return(h);
}

double estimate_sphere(MODEL *model, KERNEL_PARM *kernel_parm)
     /* Approximates the radius of the ball containing the support
        vectors by bounding it with the length of the longest support
        vector. This is pretty good for text categorization, since all
        documents have feature vectors of length 1. It assumes that the
        center of the ball is at the origin of the space. */
{
  register long j;
  double xlen,maxxlen=0;
  DOC *nulldoc;
  WORD nullword;

  nullword.wnum=0;
  nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0));

  for(j=1;j<model->sv_num;j++) {
    xlen=sqrt(kernel(kernel_parm,model->supvec[j],model->supvec[j])
              -2*kernel(kernel_parm,model->supvec[j],nulldoc)
              +kernel(kernel_parm,nulldoc,nulldoc));
    if(xlen>maxxlen) {
      maxxlen=xlen;
    }
  }

  free_example(nulldoc,1);
  return(maxxlen);
}

double estimate_r_delta(DOC **docs, long int totdoc, KERNEL_PARM *kernel_parm)
     /* Assumes that the center of the ball is at the origin of the
        space. */
{
  long i;
  double maxxlen,xlen;
  DOC *nulldoc;
  WORD nullword;

  nullword.wnum=0;
  nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0));

  maxxlen=0;
  for(i=0;i<totdoc;i++) {
    xlen=sqrt(kernel(kernel_parm,docs[i],docs[i])
              -2*kernel(kernel_parm,docs[i],nulldoc)
              +kernel(kernel_parm,nulldoc,nulldoc));
    if(xlen>maxxlen) {
      maxxlen=xlen;
    }
  }

  free_example(nulldoc,1);
  return(maxxlen);
}
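
/* Added note (not in the original code): the returned value is the
   radius R_delta of a ball around the origin containing all feature
   vectors; its square enters the xi-alpha criterion in
   compute_xa_estimates() above. */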

double estimate_r_delta_average(DOC **docs, long int totdoc,
                                KERNEL_PARM *kernel_parm)
     /* Assumes that the center of the ball is at the origin of the
        space. */
{
  long i;
  double avgxlen;
  DOC *nulldoc;
  WORD nullword;

  nullword.wnum=0;
  nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0));

  avgxlen=0;
  for(i=0;i<totdoc;i++) {
    avgxlen+=sqrt(kernel(kernel_parm,docs[i],docs[i])
                  -2*kernel(kernel_parm,docs[i],nulldoc)
                  +kernel(kernel_parm,nulldoc,nulldoc));
  }

  free_example(nulldoc,1);
  return(avgxlen/totdoc);
}

double length_of_longest_document_vector(DOC **docs, long int totdoc,
                                         KERNEL_PARM *kernel_parm)
{
  long i;
  double maxxlen,xlen;

  maxxlen=0;
  for(i=0;i<totdoc;i++) {
    xlen=sqrt(kernel(kernel_parm,docs[i],docs[i]));
    if(xlen>maxxlen) {
      maxxlen=xlen;
    }
  }

  return(maxxlen);
}

/****************************** IO-handling **********************************/

void write_prediction(char *predfile, MODEL *model, double *lin,
                      double *a, long int *unlabeled,
                      long int *label, long int totdoc,
                      LEARN_PARM *learn_parm)
{
  FILE *predfl;
  long i;
  double dist,a_max;

  if(verbosity>=1) {
    printf("Writing prediction file..."); fflush(stdout);
  }
  if ((predfl = fopen (predfile, "w")) == NULL)
    { perror (predfile); exit (1); }
  a_max=learn_parm->epsilon_a;
  for(i=0;i<totdoc;i++) {
    if((unlabeled[i]) && (a[i]>a_max)) {
      a_max=a[i];
    }
  }
  for(i=0;i<totdoc;i++) {
    if(unlabeled[i]) {
      if((a[i]>(learn_parm->epsilon_a))) {
        dist=(double)label[i]*(1.0-learn_parm->epsilon_crit-a[i]/(a_max*2.0));
      }
      else {
        dist=(lin[i]-model->b);
      }
      if(dist>0) {
        fprintf(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist);
      }
      else {
        fprintf(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist);
      }
    }
  }
  fclose(predfl);
  if(verbosity>=1) {
    printf("done\n");
  }
}
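
/* Added note (not in the original code): each output line ranks the two
   classes by signed distance; e.g. an unlabeled example classified as
   positive with dist 1.25 would be written as "1.25:+1 -1.25:-1". */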

void write_alphas(char *alphafile, double *a,
                  long int *label, long int totdoc)
{
  FILE *alphafl;
  long i;

  if(verbosity>=1) {
    printf("Writing alpha file..."); fflush(stdout);
  }
  if ((alphafl = fopen (alphafile, "w")) == NULL)
    { perror (alphafile); exit (1); }
  for(i=0;i<totdoc;i++) {
    fprintf(alphafl,"%.18g\n",a[i]*(double)label[i]);
  }
  fclose(alphafl);
  if(verbosity>=1) {
    printf("done\n");
  }
}