camir-aes2014: toolboxes/SVM-light/src/svm_learn.c
changeset: 0:e9a9cd732c1e (tip), "first hg version after svn"
author:    wolffd
date:      Tue, 10 Feb 2015 15:05:51 +0000
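The listing below contains SVM-light's four training entry points (svm_learn_classification, svm_learn_regression, svm_learn_ranking and svm_learn_optimization) together with the shared optimizer loop they drive. As a rough orientation aid only, the following sketch shows how a caller might train a classification model; it is not part of the original file, and the names docs, labels, ndocs, nwords, learn_parm, kernel_parm and cache_size_mb stand for data assumed to be prepared elsewhere (e.g. by the command-line front end).

    /* Hypothetical caller sketch (editor's illustration, not in svm_learn.c). */
    KERNEL_CACHE *kernel_cache=NULL;
    MODEL *model=(MODEL *)my_malloc(sizeof(MODEL));

    if(kernel_parm.kernel_type != LINEAR)   /* a kernel cache only pays off for non-linear kernels */
      kernel_cache=kernel_cache_init(ndocs,cache_size_mb);

    svm_learn_classification(docs,labels,ndocs,nwords,&learn_parm,
                             &kernel_parm,kernel_cache,model,
                             NULL);         /* NULL = no starting alphas (cold start) */

    if(kernel_cache)
      kernel_cache_cleanup(kernel_cache);
    free_model(model,0);                    /* release the model once it is no longer needed */

Passing an array of totdoc doubles instead of the final NULL would seed the optimizer with initial alpha values, as documented in the parameter comments of svm_learn_classification below.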
/***********************************************************************/
/*                                                                     */
/*   svm_learn.c                                                       */
/*                                                                     */
/*   Learning module of Support Vector Machine.                        */
/*                                                                     */
/*   Author: Thorsten Joachims                                         */
/*   Date: 02.07.02                                                    */
/*                                                                     */
/*   Copyright (c) 2002  Thorsten Joachims - All rights reserved       */
/*                                                                     */
/*   This software is available for non-commercial use only. It must   */
/*   not be modified and distributed without prior permission of the   */
/*   author. The author is not responsible for implications from the   */
/*   use of this software.                                             */
/*                                                                     */
/***********************************************************************/


# include "svm_common.h"
# include "svm_learn.h"


/* interface to QP-solver */
double *optimize_qp(QP *, double *, long, double *, LEARN_PARM *);

/*---------------------------------------------------------------------------*/

/* Learns an SVM classification model based on the training data in
   docs/label. The resulting model is returned in the structure
   model. */

void svm_learn_classification(DOC **docs, double *class, long int totdoc,
                              long int totwords, LEARN_PARM *learn_parm,
                              KERNEL_PARM *kernel_parm,
                              KERNEL_CACHE *kernel_cache, MODEL *model,
                              double *alpha)
     /* docs:         Training vectors (x-part) */
     /* class:        Training labels (y-part, zero if test example for
                      transduction) */
     /* totdoc:       Number of examples in docs/label */
     /* totwords:     Number of features (i.e. highest feature index) */
     /* learn_parm:   Learning parameters */
     /* kernel_parm:  Kernel parameters */
     /* kernel_cache: Initialized Cache of size totdoc, if using a kernel.
                      NULL if linear. */
     /* model:        Returns learning result (assumed empty before called) */
     /* alpha:        Start values for the alpha variables or NULL
                      pointer. The new alpha values are returned after
                      optimization if not NULL. Array must be of size totdoc.
*/ { long *inconsistent,i,*label; long inconsistentnum; long misclassified,upsupvecnum; double loss,model_length,example_length; double maxdiff,*lin,*a,*c; long runtime_start,runtime_end; long iterations; long *unlabeled,transduction; long heldout; long loo_count=0,loo_count_pos=0,loo_count_neg=0,trainpos=0,trainneg=0; long loocomputed=0,runtime_start_loo=0,runtime_start_xa=0; double heldout_c=0,r_delta_sq=0,r_delta,r_delta_avg; long *index,*index2dnum; double *weights; CFLOAT *aicache; /* buffer to keep one row of hessian */ double *xi_fullset; /* buffer for storing xi on full sample in loo */ double *a_fullset; /* buffer for storing alpha on full sample in loo */ TIMING timing_profile; SHRINK_STATE shrink_state; runtime_start=get_runtime(); timing_profile.time_kernel=0; timing_profile.time_opti=0; timing_profile.time_shrink=0; timing_profile.time_update=0; timing_profile.time_model=0; timing_profile.time_check=0; timing_profile.time_select=0; kernel_cache_statistic=0; learn_parm->totwords=totwords; /* make sure -n value is reasonable */ if((learn_parm->svm_newvarsinqp < 2) || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) { learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize; } init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK); label = (long *)my_malloc(sizeof(long)*totdoc); inconsistent = (long *)my_malloc(sizeof(long)*totdoc); unlabeled = (long *)my_malloc(sizeof(long)*totdoc); c = (double *)my_malloc(sizeof(double)*totdoc); a = (double *)my_malloc(sizeof(double)*totdoc); a_fullset = (double *)my_malloc(sizeof(double)*totdoc); xi_fullset = (double *)my_malloc(sizeof(double)*totdoc); lin = (double *)my_malloc(sizeof(double)*totdoc); learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc); model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2)); model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2)); model->index = (long *)my_malloc(sizeof(long)*(totdoc+2)); model->at_upper_bound=0; model->b=0; model->supvec[0]=0; /* element 0 reserved and empty for now */ model->alpha[0]=0; model->lin_weights=NULL; model->totwords=totwords; model->totdoc=totdoc; model->kernel_parm=(*kernel_parm); model->sv_num=1; model->loo_error=-1; model->loo_recall=-1; model->loo_precision=-1; model->xa_error=-1; model->xa_recall=-1; model->xa_precision=-1; inconsistentnum=0; transduction=0; r_delta=estimate_r_delta(docs,totdoc,kernel_parm); r_delta_sq=r_delta*r_delta; r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm); if(learn_parm->svm_c == 0.0) { /* default value for C */ learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg); if(verbosity>=1) printf("Setting default regularization parameter C=%.4f\n", learn_parm->svm_c); } learn_parm->eps=-1.0; /* equivalent regression epsilon for classification */ for(i=0;i<totdoc;i++) { /* various inits */ docs[i]->docnum=i; inconsistent[i]=0; a[i]=0; lin[i]=0; c[i]=0.0; unlabeled[i]=0; if(class[i] == 0) { unlabeled[i]=1; label[i]=0; transduction=1; } if(class[i] > 0) { learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio* docs[i]->costfactor; label[i]=1; trainpos++; } else if(class[i] < 0) { learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor; label[i]=-1; trainneg++; } else { learn_parm->svm_cost[i]=0; } } if(verbosity>=2) { printf("%ld positive, %ld negative, and %ld unlabeled examples.\n",trainpos,trainneg,totdoc-trainpos-trainneg); fflush(stdout); } /* caching makes no sense for linear kernel */ if(kernel_parm->kernel_type == LINEAR) { kernel_cache = NULL; } /* compute starting state for initial alpha 
values */ if(alpha) { if(verbosity>=1) { printf("Computing starting state..."); fflush(stdout); } index = (long *)my_malloc(sizeof(long)*totdoc); index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); weights=(double *)my_malloc(sizeof(double)*(totwords+1)); aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc); for(i=0;i<totdoc;i++) { /* create full index and clip alphas */ index[i]=1; alpha[i]=fabs(alpha[i]); if(alpha[i]<0) alpha[i]=0; if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i]; } if(kernel_parm->kernel_type != LINEAR) { for(i=0;i<totdoc;i++) /* fill kernel cache with unbounded SV */ if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i]) && (kernel_cache_space_available(kernel_cache))) cache_kernel_row(kernel_cache,docs,i,kernel_parm); for(i=0;i<totdoc;i++) /* fill rest of kernel cache with bounded SV */ if((alpha[i]==learn_parm->svm_cost[i]) && (kernel_cache_space_available(kernel_cache))) cache_kernel_row(kernel_cache,docs,i,kernel_parm); } (void)compute_index(index,totdoc,index2dnum); update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc, totwords,kernel_parm,kernel_cache,lin,aicache, weights); (void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c, learn_parm,index2dnum,index2dnum,model); for(i=0;i<totdoc;i++) { /* copy initial alphas */ a[i]=alpha[i]; } free(index); free(index2dnum); free(weights); free(aicache); if(verbosity>=1) { printf("done.\n"); fflush(stdout); } } if(transduction) { learn_parm->svm_iter_to_shrink=99999999; if(verbosity >= 1) printf("\nDeactivating Shrinking due to an incompatibility with the transductive \nlearner in the current version.\n\n"); } if(transduction && learn_parm->compute_loo) { learn_parm->compute_loo=0; if(verbosity >= 1) printf("\nCannot compute leave-one-out estimates for transductive learner.\n\n"); } if(learn_parm->remove_inconsistent && learn_parm->compute_loo) { learn_parm->compute_loo=0; printf("\nCannot compute leave-one-out estimates when removing inconsistent examples.\n\n"); } if(learn_parm->compute_loo && ((trainpos == 1) || (trainneg == 1))) { learn_parm->compute_loo=0; printf("\nCannot compute leave-one-out with only one example in one class.\n\n"); } if(verbosity==1) { printf("Optimizing"); fflush(stdout); } /* train the svm */ iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm, kernel_parm,kernel_cache,&shrink_state,model, inconsistent,unlabeled,a,lin, c,&timing_profile, &maxdiff,(long)-1, (long)1); if(verbosity>=1) { if(verbosity==1) printf("done. 
(%ld iterations)\n",iterations); misclassified=0; for(i=0;(i<totdoc);i++) { /* get final statistic */ if((lin[i]-model->b)*(double)label[i] <= 0.0) misclassified++; } printf("Optimization finished (%ld misclassified, maxdiff=%.5f).\n", misclassified,maxdiff); runtime_end=get_runtime(); if(verbosity>=2) { printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n", ((float)runtime_end-(float)runtime_start)/100.0, (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start)); } else { printf("Runtime in cpu-seconds: %.2f\n", (runtime_end-runtime_start)/100.0); } if(learn_parm->remove_inconsistent) { inconsistentnum=0; for(i=0;i<totdoc;i++) if(inconsistent[i]) inconsistentnum++; printf("Number of SV: %ld (plus %ld inconsistent examples)\n", model->sv_num-1,inconsistentnum); } else { upsupvecnum=0; for(i=1;i<model->sv_num;i++) { if(fabs(model->alpha[i]) >= (learn_parm->svm_cost[(model->supvec[i])->docnum]- learn_parm->epsilon_a)) upsupvecnum++; } printf("Number of SV: %ld (including %ld at upper bound)\n", model->sv_num-1,upsupvecnum); } if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) { loss=0; model_length=0; for(i=0;i<totdoc;i++) { if((lin[i]-model->b)*(double)label[i] < 1.0-learn_parm->epsilon_crit) loss+=1.0-(lin[i]-model->b)*(double)label[i]; model_length+=a[i]*label[i]*lin[i]; } model_length=sqrt(model_length); fprintf(stdout,"L1 loss: loss=%.5f\n",loss); fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length); example_length=estimate_sphere(model,kernel_parm); fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n", length_of_longest_document_vector(docs,totdoc,kernel_parm)); fprintf(stdout,"Estimated VCdim of classifier: VCdim<=%.5f\n", estimate_margin_vcdim(model,model_length,example_length, kernel_parm)); if((!learn_parm->remove_inconsistent) && (!transduction)) { runtime_start_xa=get_runtime(); if(verbosity>=1) { printf("Computing XiAlpha-estimates..."); fflush(stdout); } compute_xa_estimates(model,label,unlabeled,totdoc,docs,lin,a, kernel_parm,learn_parm,&(model->xa_error), &(model->xa_recall),&(model->xa_precision)); if(verbosity>=1) { printf("done\n"); } printf("Runtime for XiAlpha-estimates in cpu-seconds: %.2f\n", (get_runtime()-runtime_start_xa)/100.0); fprintf(stdout,"XiAlpha-estimate of the error: error<=%.2f%% (rho=%.2f,depth=%ld)\n", model->xa_error,learn_parm->rho,learn_parm->xa_depth); fprintf(stdout,"XiAlpha-estimate of the recall: recall=>%.2f%% (rho=%.2f,depth=%ld)\n", model->xa_recall,learn_parm->rho,learn_parm->xa_depth); fprintf(stdout,"XiAlpha-estimate of the precision: precision=>%.2f%% (rho=%.2f,depth=%ld)\n", model->xa_precision,learn_parm->rho,learn_parm->xa_depth); } else if(!learn_parm->remove_inconsistent) { estimate_transduction_quality(model,label,unlabeled,totdoc,docs,lin); } } if(verbosity>=1) { printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic); } } /* leave-one-out testing starts now */ if(learn_parm->compute_loo) { /* save results of training on full dataset for leave-one-out */ 
runtime_start_loo=get_runtime(); for(i=0;i<totdoc;i++) { xi_fullset[i]=1.0-((lin[i]-model->b)*(double)label[i]); if(xi_fullset[i]<0) xi_fullset[i]=0; a_fullset[i]=a[i]; } if(verbosity>=1) { printf("Computing leave-one-out"); } /* repeat this loop for every held-out example */ for(heldout=0;(heldout<totdoc);heldout++) { if(learn_parm->rho*a_fullset[heldout]*r_delta_sq+xi_fullset[heldout] < 1.0) { /* guaranteed to not produce a leave-one-out error */ if(verbosity==1) { printf("+"); fflush(stdout); } } else if(xi_fullset[heldout] > 1.0) { /* guaranteed to produce a leave-one-out error */ loo_count++; if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++; if(verbosity==1) { printf("-"); fflush(stdout); } } else { loocomputed++; heldout_c=learn_parm->svm_cost[heldout]; /* set upper bound to zero */ learn_parm->svm_cost[heldout]=0; /* make sure heldout example is not currently */ /* shrunk away. Assumes that lin is up to date! */ shrink_state.active[heldout]=1; if(verbosity>=2) printf("\nLeave-One-Out test on example %ld\n",heldout); if(verbosity>=1) { printf("(?[%ld]",heldout); fflush(stdout); } optimize_to_convergence(docs,label,totdoc,totwords,learn_parm, kernel_parm, kernel_cache,&shrink_state,model,inconsistent,unlabeled, a,lin,c,&timing_profile, &maxdiff,heldout,(long)2); /* printf("%.20f\n",(lin[heldout]-model->b)*(double)label[heldout]); */ if(((lin[heldout]-model->b)*(double)label[heldout]) <= 0.0) { loo_count++; /* there was a loo-error */ if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++; if(verbosity>=1) { printf("-)"); fflush(stdout); } } else { if(verbosity>=1) { printf("+)"); fflush(stdout); } } /* now we need to restore the original data set*/ learn_parm->svm_cost[heldout]=heldout_c; /* restore upper bound */ } } /* end of leave-one-out loop */ if(verbosity>=1) { printf("\nRetrain on full problem"); fflush(stdout); } optimize_to_convergence(docs,label,totdoc,totwords,learn_parm, kernel_parm, kernel_cache,&shrink_state,model,inconsistent,unlabeled, a,lin,c,&timing_profile, &maxdiff,(long)-1,(long)1); if(verbosity >= 1) printf("done.\n"); /* after all leave-one-out computed */ model->loo_error=100.0*loo_count/(double)totdoc; model->loo_recall=(1.0-(double)loo_count_pos/(double)trainpos)*100.0; model->loo_precision=(trainpos-loo_count_pos)/ (double)(trainpos-loo_count_pos+loo_count_neg)*100.0; if(verbosity >= 1) { fprintf(stdout,"Leave-one-out estimate of the error: error=%.2f%%\n", model->loo_error); fprintf(stdout,"Leave-one-out estimate of the recall: recall=%.2f%%\n", model->loo_recall); fprintf(stdout,"Leave-one-out estimate of the precision: precision=%.2f%%\n", model->loo_precision); fprintf(stdout,"Actual leave-one-outs computed: %ld (rho=%.2f)\n", loocomputed,learn_parm->rho); printf("Runtime for leave-one-out in cpu-seconds: %.2f\n", (double)(get_runtime()-runtime_start_loo)/100.0); } } if(learn_parm->alphafile[0]) write_alphas(learn_parm->alphafile,a,label,totdoc); shrink_state_cleanup(&shrink_state); free(label); free(inconsistent); free(unlabeled); free(c); free(a); free(a_fullset); free(xi_fullset); free(lin); free(learn_parm->svm_cost); } /* Learns an SVM regression model based on the training data in docs/label. The resulting model is returned in the structure model. 
*/ void svm_learn_regression(DOC **docs, double *value, long int totdoc, long int totwords, LEARN_PARM *learn_parm, KERNEL_PARM *kernel_parm, KERNEL_CACHE **kernel_cache, MODEL *model) /* docs: Training vectors (x-part) */ /* class: Training value (y-part) */ /* totdoc: Number of examples in docs/label */ /* totwords: Number of features (i.e. highest feature index) */ /* learn_parm: Learning paramenters */ /* kernel_parm: Kernel paramenters */ /* kernel_cache:Initialized Cache, if using a kernel. NULL if linear. Note that it will be free'd and reassigned */ /* model: Returns learning result (assumed empty before called) */ { long *inconsistent,i,j; long inconsistentnum; long upsupvecnum; double loss,model_length,example_length; double maxdiff,*lin,*a,*c; long runtime_start,runtime_end; long iterations,kernel_cache_size; long *unlabeled; double r_delta_sq=0,r_delta,r_delta_avg; double *xi_fullset; /* buffer for storing xi on full sample in loo */ double *a_fullset; /* buffer for storing alpha on full sample in loo */ TIMING timing_profile; SHRINK_STATE shrink_state; DOC **docs_org; long *label; /* set up regression problem in standard form */ docs_org=docs; docs = (DOC **)my_malloc(sizeof(DOC)*2*totdoc); label = (long *)my_malloc(sizeof(long)*2*totdoc); c = (double *)my_malloc(sizeof(double)*2*totdoc); for(i=0;i<totdoc;i++) { j=2*totdoc-1-i; docs[i]=create_example(i,0,0,docs_org[i]->costfactor,docs_org[i]->fvec); label[i]=+1; c[i]=value[i]; docs[j]=create_example(j,0,0,docs_org[i]->costfactor,docs_org[i]->fvec); label[j]=-1; c[j]=value[i]; } totdoc*=2; /* need to get a bigger kernel cache */ if(*kernel_cache) { kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024); kernel_cache_cleanup(*kernel_cache); (*kernel_cache)=kernel_cache_init(totdoc,kernel_cache_size); } runtime_start=get_runtime(); timing_profile.time_kernel=0; timing_profile.time_opti=0; timing_profile.time_shrink=0; timing_profile.time_update=0; timing_profile.time_model=0; timing_profile.time_check=0; timing_profile.time_select=0; kernel_cache_statistic=0; learn_parm->totwords=totwords; /* make sure -n value is reasonable */ if((learn_parm->svm_newvarsinqp < 2) || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) { learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize; } init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK); inconsistent = (long *)my_malloc(sizeof(long)*totdoc); unlabeled = (long *)my_malloc(sizeof(long)*totdoc); a = (double *)my_malloc(sizeof(double)*totdoc); a_fullset = (double *)my_malloc(sizeof(double)*totdoc); xi_fullset = (double *)my_malloc(sizeof(double)*totdoc); lin = (double *)my_malloc(sizeof(double)*totdoc); learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc); model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2)); model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2)); model->index = (long *)my_malloc(sizeof(long)*(totdoc+2)); model->at_upper_bound=0; model->b=0; model->supvec[0]=0; /* element 0 reserved and empty for now */ model->alpha[0]=0; model->lin_weights=NULL; model->totwords=totwords; model->totdoc=totdoc; model->kernel_parm=(*kernel_parm); model->sv_num=1; model->loo_error=-1; model->loo_recall=-1; model->loo_precision=-1; model->xa_error=-1; model->xa_recall=-1; model->xa_precision=-1; inconsistentnum=0; r_delta=estimate_r_delta(docs,totdoc,kernel_parm); r_delta_sq=r_delta*r_delta; r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm); if(learn_parm->svm_c == 0.0) { /* default value for C */ 
learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg); if(verbosity>=1) printf("Setting default regularization parameter C=%.4f\n", learn_parm->svm_c); } for(i=0;i<totdoc;i++) { /* various inits */ inconsistent[i]=0; a[i]=0; lin[i]=0; unlabeled[i]=0; if(label[i] > 0) { learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio* docs[i]->costfactor; } else if(label[i] < 0) { learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor; } } /* caching makes no sense for linear kernel */ if((kernel_parm->kernel_type == LINEAR) && (*kernel_cache)) { printf("WARNING: Using a kernel cache for linear case will slow optimization down!\n"); } if(verbosity==1) { printf("Optimizing"); fflush(stdout); } /* train the svm */ iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm, kernel_parm,*kernel_cache,&shrink_state, model,inconsistent,unlabeled,a,lin,c, &timing_profile,&maxdiff,(long)-1, (long)1); if(verbosity>=1) { if(verbosity==1) printf("done. (%ld iterations)\n",iterations); printf("Optimization finished (maxdiff=%.5f).\n",maxdiff); runtime_end=get_runtime(); if(verbosity>=2) { printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n", ((float)runtime_end-(float)runtime_start)/100.0, (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start)); } else { printf("Runtime in cpu-seconds: %.2f\n", (runtime_end-runtime_start)/100.0); } if(learn_parm->remove_inconsistent) { inconsistentnum=0; for(i=0;i<totdoc;i++) if(inconsistent[i]) inconsistentnum++; printf("Number of SV: %ld (plus %ld inconsistent examples)\n", model->sv_num-1,inconsistentnum); } else { upsupvecnum=0; for(i=1;i<model->sv_num;i++) { if(fabs(model->alpha[i]) >= (learn_parm->svm_cost[(model->supvec[i])->docnum]- learn_parm->epsilon_a)) upsupvecnum++; } printf("Number of SV: %ld (including %ld at upper bound)\n", model->sv_num-1,upsupvecnum); } if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) { loss=0; model_length=0; for(i=0;i<totdoc;i++) { if((lin[i]-model->b)*(double)label[i] < (-learn_parm->eps+(double)label[i]*c[i])-learn_parm->epsilon_crit) loss+=-learn_parm->eps+(double)label[i]*c[i]-(lin[i]-model->b)*(double)label[i]; model_length+=a[i]*label[i]*lin[i]; } model_length=sqrt(model_length); fprintf(stdout,"L1 loss: loss=%.5f\n",loss); fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length); example_length=estimate_sphere(model,kernel_parm); fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n", length_of_longest_document_vector(docs,totdoc,kernel_parm)); } if(verbosity>=1) { printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic); } } if(learn_parm->alphafile[0]) write_alphas(learn_parm->alphafile,a,label,totdoc); /* this makes sure the model we return does not contain pointers to the temporary documents */ for(i=1;i<model->sv_num;i++) { j=model->supvec[i]->docnum; if(j >= (totdoc/2)) { j=totdoc-j-1; } model->supvec[i]=docs_org[j]; } shrink_state_cleanup(&shrink_state); for(i=0;i<totdoc;i++) free_example(docs[i],0); free(docs); free(label); 
free(inconsistent); free(unlabeled); free(c); free(a); free(a_fullset); free(xi_fullset); free(lin); free(learn_parm->svm_cost); } void svm_learn_ranking(DOC **docs, double *rankvalue, long int totdoc, long int totwords, LEARN_PARM *learn_parm, KERNEL_PARM *kernel_parm, KERNEL_CACHE **kernel_cache, MODEL *model) /* docs: Training vectors (x-part) */ /* rankvalue: Training target values that determine the ranking */ /* totdoc: Number of examples in docs/label */ /* totwords: Number of features (i.e. highest feature index) */ /* learn_parm: Learning paramenters */ /* kernel_parm: Kernel paramenters */ /* kernel_cache:Initialized pointer to Cache of size 1*totdoc, if using a kernel. NULL if linear. NOTE: Cache is getting reinitialized in this function */ /* model: Returns learning result (assumed empty before called) */ { DOC **docdiff; long i,j,k,totpair,kernel_cache_size; double *target,*alpha,cost; long *greater,*lesser; MODEL *pairmodel; SVECTOR *flow,*fhigh; totpair=0; for(i=0;i<totdoc;i++) { for(j=i+1;j<totdoc;j++) { if((docs[i]->queryid==docs[j]->queryid) && (rankvalue[i] != rankvalue[j])) { totpair++; } } } printf("Constructing %ld rank constraints...",totpair); fflush(stdout); docdiff=(DOC **)my_malloc(sizeof(DOC)*totpair); target=(double *)my_malloc(sizeof(double)*totpair); greater=(long *)my_malloc(sizeof(long)*totpair); lesser=(long *)my_malloc(sizeof(long)*totpair); k=0; for(i=0;i<totdoc;i++) { for(j=i+1;j<totdoc;j++) { if(docs[i]->queryid == docs[j]->queryid) { cost=(docs[i]->costfactor+docs[j]->costfactor)/2.0; if(rankvalue[i] > rankvalue[j]) { if(kernel_parm->kernel_type == LINEAR) docdiff[k]=create_example(k,0,0,cost, sub_ss(docs[i]->fvec,docs[j]->fvec)); else { flow=copy_svector(docs[j]->fvec); flow->factor=-1.0; flow->next=NULL; fhigh=copy_svector(docs[i]->fvec); fhigh->factor=1.0; fhigh->next=flow; docdiff[k]=create_example(k,0,0,cost,fhigh); } target[k]=1; greater[k]=i; lesser[k]=j; k++; } else if(rankvalue[i] < rankvalue[j]) { if(kernel_parm->kernel_type == LINEAR) docdiff[k]=create_example(k,0,0,cost, sub_ss(docs[i]->fvec,docs[j]->fvec)); else { flow=copy_svector(docs[j]->fvec); flow->factor=-1.0; flow->next=NULL; fhigh=copy_svector(docs[i]->fvec); fhigh->factor=1.0; fhigh->next=flow; docdiff[k]=create_example(k,0,0,cost,fhigh); } target[k]=-1; greater[k]=i; lesser[k]=j; k++; } } } } printf("done.\n"); fflush(stdout); /* need to get a bigger kernel cache */ if(*kernel_cache) { kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024); kernel_cache_cleanup(*kernel_cache); (*kernel_cache)=kernel_cache_init(totpair,kernel_cache_size); } /* must use unbiased hyperplane on difference vectors */ learn_parm->biased_hyperplane=0; pairmodel=(MODEL *)my_malloc(sizeof(MODEL)); svm_learn_classification(docdiff,target,totpair,totwords,learn_parm, kernel_parm,(*kernel_cache),pairmodel,NULL); /* Transfer the result into a more compact model. If you would like to output the original model on pairs of documents, see below. 
*/ alpha=(double *)my_malloc(sizeof(double)*totdoc); for(i=0;i<totdoc;i++) { alpha[i]=0; } for(i=1;i<pairmodel->sv_num;i++) { alpha[lesser[(pairmodel->supvec[i])->docnum]]-=pairmodel->alpha[i]; alpha[greater[(pairmodel->supvec[i])->docnum]]+=pairmodel->alpha[i]; } model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2)); model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2)); model->index = (long *)my_malloc(sizeof(long)*(totdoc+2)); model->supvec[0]=0; /* element 0 reserved and empty for now */ model->alpha[0]=0; model->sv_num=1; for(i=0;i<totdoc;i++) { if(alpha[i]) { model->supvec[model->sv_num]=docs[i]; model->alpha[model->sv_num]=alpha[i]; model->index[i]=model->sv_num; model->sv_num++; } else { model->index[i]=-1; } } model->at_upper_bound=0; model->b=0; model->lin_weights=NULL; model->totwords=totwords; model->totdoc=totdoc; model->kernel_parm=(*kernel_parm); model->loo_error=-1; model->loo_recall=-1; model->loo_precision=-1; model->xa_error=-1; model->xa_recall=-1; model->xa_precision=-1; free(alpha); free(greater); free(lesser); free(target); /* If you would like to output the original model on pairs of document, replace the following lines with '(*model)=(*pairmodel);' */ for(i=0;i<totpair;i++) free_example(docdiff[i],1); free(docdiff); free_model(pairmodel,0); } /* The following solves a freely defined and given set of inequalities. The optimization problem is of the following form: min 0.5 w*w + C sum_i C_i \xi_i s.t. x_i * w > rhs_i - \xi_i This corresponds to the -z o option. */ void svm_learn_optimization(DOC **docs, double *rhs, long int totdoc, long int totwords, LEARN_PARM *learn_parm, KERNEL_PARM *kernel_parm, KERNEL_CACHE *kernel_cache, MODEL *model, double *alpha) /* docs: Left-hand side of inequalities (x-part) */ /* rhs: Right-hand side of inequalities */ /* totdoc: Number of examples in docs/label */ /* totwords: Number of features (i.e. highest feature index) */ /* learn_parm: Learning paramenters */ /* kernel_parm: Kernel paramenters */ /* kernel_cache:Initialized Cache of size 1*totdoc, if using a kernel. NULL if linear.*/ /* model: Returns solution as SV expansion (assumed empty before called) */ /* alpha: Start values for the alpha variables or NULL pointer. The new alpha values are returned after optimization if not NULL. Array must be of size totdoc. 
*/ { long i,*label; long misclassified,upsupvecnum; double loss,model_length,example_length; double maxdiff,*lin,*a,*c; long runtime_start,runtime_end; long iterations,maxslackid,svsetnum; long *unlabeled,*inconsistent; double r_delta_sq=0,r_delta,r_delta_avg; long *index,*index2dnum; double *weights,*slack,*alphaslack; CFLOAT *aicache; /* buffer to keep one row of hessian */ TIMING timing_profile; SHRINK_STATE shrink_state; runtime_start=get_runtime(); timing_profile.time_kernel=0; timing_profile.time_opti=0; timing_profile.time_shrink=0; timing_profile.time_update=0; timing_profile.time_model=0; timing_profile.time_check=0; timing_profile.time_select=0; kernel_cache_statistic=0; learn_parm->totwords=totwords; /* make sure -n value is reasonable */ if((learn_parm->svm_newvarsinqp < 2) || (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) { learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize; } init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK); label = (long *)my_malloc(sizeof(long)*totdoc); unlabeled = (long *)my_malloc(sizeof(long)*totdoc); inconsistent = (long *)my_malloc(sizeof(long)*totdoc); c = (double *)my_malloc(sizeof(double)*totdoc); a = (double *)my_malloc(sizeof(double)*totdoc); lin = (double *)my_malloc(sizeof(double)*totdoc); learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc); model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2)); model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2)); model->index = (long *)my_malloc(sizeof(long)*(totdoc+2)); model->at_upper_bound=0; model->b=0; model->supvec[0]=0; /* element 0 reserved and empty for now */ model->alpha[0]=0; model->lin_weights=NULL; model->totwords=totwords; model->totdoc=totdoc; model->kernel_parm=(*kernel_parm); model->sv_num=1; model->loo_error=-1; model->loo_recall=-1; model->loo_precision=-1; model->xa_error=-1; model->xa_recall=-1; model->xa_precision=-1; r_delta=estimate_r_delta(docs,totdoc,kernel_parm); r_delta_sq=r_delta*r_delta; r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm); if(learn_parm->svm_c == 0.0) { /* default value for C */ learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg); if(verbosity>=1) printf("Setting default regularization parameter C=%.4f\n", learn_parm->svm_c); } learn_parm->biased_hyperplane=0; /* learn an unbiased hyperplane */ learn_parm->eps=0.0; /* No margin, unless explicitly handcoded in the right-hand side in the training set. 
*/ for(i=0;i<totdoc;i++) { /* various inits */ docs[i]->docnum=i; a[i]=0; lin[i]=0; c[i]=rhs[i]; /* set right-hand side */ unlabeled[i]=0; inconsistent[i]=0; learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio* docs[i]->costfactor; label[i]=1; } if(learn_parm->sharedslack) /* if shared slacks are used, they must */ for(i=0;i<totdoc;i++) /* be used on every constraint */ if(!docs[i]->slackid) { perror("Error: Missing shared slacks definitions in some of the examples."); exit(0); } /* compute starting state for initial alpha values */ if(alpha) { if(verbosity>=1) { printf("Computing starting state..."); fflush(stdout); } index = (long *)my_malloc(sizeof(long)*totdoc); index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); weights=(double *)my_malloc(sizeof(double)*(totwords+1)); aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc); for(i=0;i<totdoc;i++) { /* create full index and clip alphas */ index[i]=1; alpha[i]=fabs(alpha[i]); if(alpha[i]<0) alpha[i]=0; if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i]; } if(kernel_parm->kernel_type != LINEAR) { for(i=0;i<totdoc;i++) /* fill kernel cache with unbounded SV */ if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i]) && (kernel_cache_space_available(kernel_cache))) cache_kernel_row(kernel_cache,docs,i,kernel_parm); for(i=0;i<totdoc;i++) /* fill rest of kernel cache with bounded SV */ if((alpha[i]==learn_parm->svm_cost[i]) && (kernel_cache_space_available(kernel_cache))) cache_kernel_row(kernel_cache,docs,i,kernel_parm); } (void)compute_index(index,totdoc,index2dnum); update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc, totwords,kernel_parm,kernel_cache,lin,aicache, weights); (void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c, learn_parm,index2dnum,index2dnum,model); for(i=0;i<totdoc;i++) { /* copy initial alphas */ a[i]=alpha[i]; } free(index); free(index2dnum); free(weights); free(aicache); if(verbosity>=1) { printf("done.\n"); fflush(stdout); } } /* removing inconsistent does not work for general optimization problem */ if(learn_parm->remove_inconsistent) { learn_parm->remove_inconsistent = 0; printf("'remove inconsistent' not available in this mode. Switching option off!"); fflush(stdout); } /* caching makes no sense for linear kernel */ if(kernel_parm->kernel_type == LINEAR) { kernel_cache = NULL; } if(verbosity==1) { printf("Optimizing"); fflush(stdout); } /* train the svm */ if(learn_parm->sharedslack) iterations=optimize_to_convergence_sharedslack(docs,label,totdoc, totwords,learn_parm,kernel_parm, kernel_cache,&shrink_state,model, a,lin,c,&timing_profile, &maxdiff); else iterations=optimize_to_convergence(docs,label,totdoc, totwords,learn_parm,kernel_parm, kernel_cache,&shrink_state,model, inconsistent,unlabeled, a,lin,c,&timing_profile, &maxdiff,(long)-1,(long)1); if(verbosity>=1) { if(verbosity==1) printf("done. 
(%ld iterations)\n",iterations); misclassified=0; for(i=0;(i<totdoc);i++) { /* get final statistic */ if((lin[i]-model->b)*(double)label[i] <= 0.0) misclassified++; } printf("Optimization finished (maxdiff=%.5f).\n",maxdiff); runtime_end=get_runtime(); if(verbosity>=2) { printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n", ((float)runtime_end-(float)runtime_start)/100.0, (100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start), (100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start)); } else { printf("Runtime in cpu-seconds: %.2f\n", (runtime_end-runtime_start)/100.0); } } if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) { loss=0; model_length=0; for(i=0;i<totdoc;i++) { if((lin[i]-model->b)*(double)label[i] < c[i]-learn_parm->epsilon_crit) loss+=c[i]-(lin[i]-model->b)*(double)label[i]; model_length+=a[i]*label[i]*lin[i]; } model_length=sqrt(model_length); fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length); } if(learn_parm->sharedslack) { index = (long *)my_malloc(sizeof(long)*totdoc); index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); maxslackid=0; for(i=0;i<totdoc;i++) { /* create full index */ index[i]=1; if(maxslackid<docs[i]->slackid) maxslackid=docs[i]->slackid; } (void)compute_index(index,totdoc,index2dnum); slack=(double *)my_malloc(sizeof(double)*(maxslackid+1)); alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1)); for(i=0;i<=maxslackid;i++) { /* init shared slacks */ slack[i]=0; alphaslack[i]=0; } compute_shared_slacks(docs,label,a,lin,c,index2dnum,learn_parm, slack,alphaslack); loss=0; model->at_upper_bound=0; svsetnum=0; for(i=0;i<=maxslackid;i++) { /* create full index */ loss+=slack[i]; if(alphaslack[i] > (learn_parm->svm_c - learn_parm->epsilon_a)) model->at_upper_bound++; if(alphaslack[i] > learn_parm->epsilon_a) svsetnum++; } free(index); free(index2dnum); free(slack); free(alphaslack); } if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) { if(learn_parm->sharedslack) { printf("Number of SV: %ld\n", model->sv_num-1); printf("Number of non-zero slack variables: %ld (out of %ld)\n", model->at_upper_bound,svsetnum); fprintf(stdout,"L1 loss: loss=%.5f\n",loss); } else { upsupvecnum=0; for(i=1;i<model->sv_num;i++) { if(fabs(model->alpha[i]) >= (learn_parm->svm_cost[(model->supvec[i])->docnum]- learn_parm->epsilon_a)) upsupvecnum++; } printf("Number of SV: %ld (including %ld at upper bound)\n", model->sv_num-1,upsupvecnum); fprintf(stdout,"L1 loss: loss=%.5f\n",loss); } example_length=estimate_sphere(model,kernel_parm); fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n", length_of_longest_document_vector(docs,totdoc,kernel_parm)); } if(verbosity>=1) { printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic); } if(alpha) { for(i=0;i<totdoc;i++) { /* copy final alphas */ alpha[i]=a[i]; } } if(learn_parm->alphafile[0]) write_alphas(learn_parm->alphafile,a,label,totdoc); shrink_state_cleanup(&shrink_state); free(label); free(unlabeled); free(inconsistent); free(c); free(a); free(lin); free(learn_parm->svm_cost); } long 
optimize_to_convergence(DOC **docs, long int *label, long int totdoc, long int totwords, LEARN_PARM *learn_parm, KERNEL_PARM *kernel_parm, KERNEL_CACHE *kernel_cache, SHRINK_STATE *shrink_state, MODEL *model, long int *inconsistent, long int *unlabeled, double *a, double *lin, double *c, TIMING *timing_profile, double *maxdiff, long int heldout, long int retrain) /* docs: Training vectors (x-part) */ /* label: Training labels/value (y-part, zero if test example for transduction) */ /* totdoc: Number of examples in docs/label */ /* totwords: Number of features (i.e. highest feature index) */ /* laern_parm: Learning paramenters */ /* kernel_parm: Kernel paramenters */ /* kernel_cache: Initialized/partly filled Cache, if using a kernel. NULL if linear. */ /* shrink_state: State of active variables */ /* model: Returns learning result */ /* inconsistent: examples thrown out as inconstistent */ /* unlabeled: test examples for transduction */ /* a: alphas */ /* lin: linear component of gradient */ /* c: right hand side of inequalities (margin) */ /* maxdiff: returns maximum violation of KT-conditions */ /* heldout: marks held-out example for leave-one-out (or -1) */ /* retrain: selects training mode (1=regular / 2=holdout) */ { long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink; long inconsistentnum,choosenum,already_chosen=0,iteration; long misclassified,supvecnum=0,*active2dnum,inactivenum; long *working2dnum,*selexam; long activenum; double criterion,eq; double *a_old; long t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */ long transductcycle; long transduction; double epsilon_crit_org; double bestmaxdiff; long bestmaxdiffiter,terminate; double *selcrit; /* buffer for sorting */ CFLOAT *aicache; /* buffer to keep one row of hessian */ double *weights; /* buffer for weight vector in linear case */ QP qp; /* buffer for one quadratic program */ epsilon_crit_org=learn_parm->epsilon_crit; /* save org */ if(kernel_parm->kernel_type == LINEAR) { learn_parm->epsilon_crit=2.0; kernel_cache=NULL; /* caching makes no sense for linear kernel */ } learn_parm->epsilon_shrink=2; (*maxdiff)=1; learn_parm->totwords=totwords; chosen = (long *)my_malloc(sizeof(long)*totdoc); last_suboptimal_at = (long *)my_malloc(sizeof(long)*totdoc); key = (long *)my_malloc(sizeof(long)*(totdoc+11)); selcrit = (double *)my_malloc(sizeof(double)*totdoc); selexam = (long *)my_malloc(sizeof(long)*totdoc); a_old = (double *)my_malloc(sizeof(double)*totdoc); aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc); working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_ce0 = (double *)my_malloc(sizeof(double)); qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize *learn_parm->svm_maxqpsize); qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); weights=(double *)my_malloc(sizeof(double)*(totwords+1)); choosenum=0; inconsistentnum=0; transductcycle=0; transduction=0; if(!retrain) retrain=1; iteration=1; bestmaxdiffiter=1; bestmaxdiff=999999999; terminate=0; if(kernel_cache) { kernel_cache->time=iteration; /* for lru cache */ kernel_cache_reset_lru(kernel_cache); } for(i=0;i<totdoc;i++) { /* various inits */ chosen[i]=0; 
a_old[i]=a[i]; last_suboptimal_at[i]=1; if(inconsistent[i]) inconsistentnum++; if(unlabeled[i]) { transduction=1; } } activenum=compute_index(shrink_state->active,totdoc,active2dnum); inactivenum=totdoc-activenum; clear_index(working2dnum); /* repeat this loop until we have convergence */ for(;retrain && (!terminate);iteration++) { if(kernel_cache) kernel_cache->time=iteration; /* for lru cache */ if(verbosity>=2) { printf( "Iteration %ld: ",iteration); fflush(stdout); } else if(verbosity==1) { printf("."); fflush(stdout); } if(verbosity>=2) t0=get_runtime(); if(verbosity>=3) { printf("\nSelecting working set... "); fflush(stdout); } if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize) learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize; i=0; for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */ if((chosen[j]>=(learn_parm->svm_maxqpsize/ minl(learn_parm->svm_maxqpsize, learn_parm->svm_newvarsinqp))) || (inconsistent[j]) || (j == heldout)) { chosen[j]=0; choosenum--; } else { chosen[j]++; working2dnum[i++]=j; } } working2dnum[i]=-1; if(retrain == 2) { choosenum=0; for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* fully clear working set */ chosen[j]=0; } clear_index(working2dnum); for(i=0;i<totdoc;i++) { /* set inconsistent examples to zero (-i 1) */ if((inconsistent[i] || (heldout==i)) && (a[i] != 0.0)) { chosen[i]=99999; choosenum++; a[i]=0; } } if(learn_parm->biased_hyperplane) { eq=0; for(i=0;i<totdoc;i++) { /* make sure we fulfill equality constraint */ eq+=a[i]*label[i]; } for(i=0;(i<totdoc) && (fabs(eq) > learn_parm->epsilon_a);i++) { if((eq*label[i] > 0) && (a[i] > 0)) { chosen[i]=88888; choosenum++; if((eq*label[i]) > a[i]) { eq-=(a[i]*label[i]); a[i]=0; } else { a[i]-=(eq*label[i]); eq=0; } } } } compute_index(chosen,totdoc,working2dnum); } else { /* select working set according to steepest gradient */ if(iteration % 101) { already_chosen=0; if((minl(learn_parm->svm_newvarsinqp, learn_parm->svm_maxqpsize-choosenum)>=4) && (kernel_parm->kernel_type != LINEAR)) { /* select part of the working set from cache */ already_chosen=select_next_qp_subproblem_grad( label,unlabeled,a,lin,c,totdoc, (long)(minl(learn_parm->svm_maxqpsize-choosenum, learn_parm->svm_newvarsinqp) /2), learn_parm,inconsistent,active2dnum, working2dnum,selcrit,selexam,kernel_cache,1, key,chosen); choosenum+=already_chosen; } choosenum+=select_next_qp_subproblem_grad( label,unlabeled,a,lin,c,totdoc, minl(learn_parm->svm_maxqpsize-choosenum, learn_parm->svm_newvarsinqp-already_chosen), learn_parm,inconsistent,active2dnum, working2dnum,selcrit,selexam,kernel_cache,0,key, chosen); } else { /* once in a while, select a somewhat random working set to get unlocked of infinite loops due to numerical inaccuracies in the core qp-solver */ choosenum+=select_next_qp_subproblem_rand( label,unlabeled,a,lin,c,totdoc, minl(learn_parm->svm_maxqpsize-choosenum, learn_parm->svm_newvarsinqp), learn_parm,inconsistent,active2dnum, working2dnum,selcrit,selexam,kernel_cache,key, chosen,iteration); } } if(verbosity>=2) { printf(" %ld vectors chosen\n",choosenum); fflush(stdout); } if(verbosity>=2) t1=get_runtime(); if(kernel_cache) cache_multiple_kernel_rows(kernel_cache,docs,working2dnum, choosenum,kernel_parm); if(verbosity>=2) t2=get_runtime(); if(retrain != 2) { optimize_svm(docs,label,unlabeled,inconsistent,0.0,chosen,active2dnum, model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm, aicache,kernel_parm,&qp,&epsilon_crit_org); } if(verbosity>=2) t3=get_runtime(); 
update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc, totwords,kernel_parm,kernel_cache,lin,aicache, weights); if(verbosity>=2) t4=get_runtime(); supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c, learn_parm,working2dnum,active2dnum,model); if(verbosity>=2) t5=get_runtime(); /* The following computation of the objective function works only */ /* relative to the active variables */ if(verbosity>=3) { criterion=compute_objective_function(a,lin,c,learn_parm->eps,label, active2dnum); printf("Objective function (over active variables): %.16f\n",criterion); fflush(stdout); } for(jj=0;(i=working2dnum[jj])>=0;jj++) { a_old[i]=a[i]; } if(retrain == 2) { /* reset inconsistent unlabeled examples */ for(i=0;(i<totdoc);i++) { if(inconsistent[i] && unlabeled[i]) { inconsistent[i]=0; label[i]=0; } } } retrain=check_optimality(model,label,unlabeled,a,lin,c,totdoc,learn_parm, maxdiff,epsilon_crit_org,&misclassified, inconsistent,active2dnum,last_suboptimal_at, iteration,kernel_parm); if(verbosity>=2) { t6=get_runtime(); timing_profile->time_select+=t1-t0; timing_profile->time_kernel+=t2-t1; timing_profile->time_opti+=t3-t2; timing_profile->time_update+=t4-t3; timing_profile->time_model+=t5-t4; timing_profile->time_check+=t6-t5; } /* checking whether optimizer got stuck */ if((*maxdiff) < bestmaxdiff) { bestmaxdiff=(*maxdiff); bestmaxdiffiter=iteration; } if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) { /* long time no progress? */ terminate=1; retrain=0; if(verbosity>=1) printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n"); } noshrink=0; if((!retrain) && (inactivenum>0) && ((!learn_parm->skip_final_opt_check) || (kernel_parm->kernel_type == LINEAR))) { if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) || (verbosity>=2)) { if(verbosity==1) { printf("\n"); } printf(" Checking optimality of inactive variables..."); fflush(stdout); } t1=get_runtime(); reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc, totwords,iteration,learn_parm,inconsistent, docs,kernel_parm,kernel_cache,model,aicache, weights,maxdiff); /* Update to new active variables. */ activenum=compute_index(shrink_state->active,totdoc,active2dnum); inactivenum=totdoc-activenum; /* reset watchdog */ bestmaxdiff=(*maxdiff); bestmaxdiffiter=iteration; /* termination criterion */ noshrink=1; retrain=0; if((*maxdiff) > learn_parm->epsilon_crit) retrain=1; timing_profile->time_shrink+=get_runtime()-t1; if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) || (verbosity>=2)) { printf("done.\n"); fflush(stdout); printf(" Number of inactive variables = %ld\n",inactivenum); } } if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff))) learn_parm->epsilon_crit=(*maxdiff); if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) { learn_parm->epsilon_crit/=2.0; retrain=1; noshrink=1; } if(learn_parm->epsilon_crit<epsilon_crit_org) learn_parm->epsilon_crit=epsilon_crit_org; if(verbosity>=2) { printf(" => (%ld SV (incl. 
%ld SV at u-bound), max violation=%.5f)\n", supvecnum,model->at_upper_bound,(*maxdiff)); fflush(stdout); } if(verbosity>=3) { printf("\n"); } if((!retrain) && (transduction)) { for(i=0;(i<totdoc);i++) { shrink_state->active[i]=1; } activenum=compute_index(shrink_state->active,totdoc,active2dnum); inactivenum=0; if(verbosity==1) printf("done\n"); retrain=incorporate_unlabeled_examples(model,label,inconsistent, unlabeled,a,lin,totdoc, selcrit,selexam,key, transductcycle,kernel_parm, learn_parm); epsilon_crit_org=learn_parm->epsilon_crit; if(kernel_parm->kernel_type == LINEAR) learn_parm->epsilon_crit=1; transductcycle++; /* reset watchdog */ bestmaxdiff=(*maxdiff); bestmaxdiffiter=iteration; } else if(((iteration % 10) == 0) && (!noshrink)) { activenum=shrink_problem(docs,learn_parm,shrink_state,kernel_parm, active2dnum,last_suboptimal_at,iteration,totdoc, maxl((long)(activenum/10), maxl((long)(totdoc/500),100)), a,inconsistent); inactivenum=totdoc-activenum; if((kernel_cache) && (supvecnum>kernel_cache->max_elems) && ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) { kernel_cache_shrink(kernel_cache,totdoc, minl((kernel_cache->activenum-activenum), (kernel_cache->activenum-supvecnum)), shrink_state->active); } } if((!retrain) && learn_parm->remove_inconsistent) { if(verbosity>=1) { printf(" Moving training errors to inconsistent examples..."); fflush(stdout); } if(learn_parm->remove_inconsistent == 1) { retrain=identify_inconsistent(a,label,unlabeled,totdoc,learn_parm, &inconsistentnum,inconsistent); } else if(learn_parm->remove_inconsistent == 2) { retrain=identify_misclassified(lin,label,unlabeled,totdoc, model,&inconsistentnum,inconsistent); } else if(learn_parm->remove_inconsistent == 3) { retrain=identify_one_misclassified(lin,label,unlabeled,totdoc, model,&inconsistentnum,inconsistent); } if(retrain) { if(kernel_parm->kernel_type == LINEAR) { /* reinit shrinking */ learn_parm->epsilon_crit=2.0; } } if(verbosity>=1) { printf("done.\n"); if(retrain) { printf(" Now %ld inconsistent examples.\n",inconsistentnum); } } } } /* end of loop */ free(chosen); free(last_suboptimal_at); free(key); free(selcrit); free(selexam); free(a_old); free(aicache); free(working2dnum); free(active2dnum); free(qp.opt_ce); free(qp.opt_ce0); free(qp.opt_g); free(qp.opt_g0); free(qp.opt_xinit); free(qp.opt_low); free(qp.opt_up); free(weights); learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */ model->maxdiff=(*maxdiff); return(iteration); } long optimize_to_convergence_sharedslack(DOC **docs, long int *label, long int totdoc, long int totwords, LEARN_PARM *learn_parm, KERNEL_PARM *kernel_parm, KERNEL_CACHE *kernel_cache, SHRINK_STATE *shrink_state, MODEL *model, double *a, double *lin, double *c, TIMING *timing_profile, double *maxdiff) /* docs: Training vectors (x-part) */ /* label: Training labels/value (y-part, zero if test example for transduction) */ /* totdoc: Number of examples in docs/label */ /* totwords: Number of features (i.e. highest feature index) */ /* learn_parm: Learning paramenters */ /* kernel_parm: Kernel paramenters */ /* kernel_cache: Initialized/partly filled Cache, if using a kernel. NULL if linear. 
*/ /* shrink_state: State of active variables */ /* model: Returns learning result */ /* a: alphas */ /* lin: linear component of gradient */ /* c: right hand side of inequalities (margin) */ /* maxdiff: returns maximum violation of KT-conditions */ { long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink,*unlabeled; long *inconsistent,choosenum,already_chosen=0,iteration; long misclassified,supvecnum=0,*active2dnum,inactivenum; long *working2dnum,*selexam,*ignore; long activenum,retrain,maxslackid,slackset,jointstep; double criterion,eq_target; double *a_old,*alphaslack; long t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */ double epsilon_crit_org,maxsharedviol; double bestmaxdiff; long bestmaxdiffiter,terminate; double *selcrit; /* buffer for sorting */ CFLOAT *aicache; /* buffer to keep one row of hessian */ double *weights; /* buffer for weight vector in linear case */ QP qp; /* buffer for one quadratic program */ double *slack; /* vector of slack variables for optimization with shared slacks */ epsilon_crit_org=learn_parm->epsilon_crit; /* save org */ if(kernel_parm->kernel_type == LINEAR) { learn_parm->epsilon_crit=2.0; kernel_cache=NULL; /* caching makes no sense for linear kernel */ } learn_parm->epsilon_shrink=2; (*maxdiff)=1; learn_parm->totwords=totwords; chosen = (long *)my_malloc(sizeof(long)*totdoc); unlabeled = (long *)my_malloc(sizeof(long)*totdoc); inconsistent = (long *)my_malloc(sizeof(long)*totdoc); ignore = (long *)my_malloc(sizeof(long)*totdoc); key = (long *)my_malloc(sizeof(long)*(totdoc+11)); selcrit = (double *)my_malloc(sizeof(double)*totdoc); selexam = (long *)my_malloc(sizeof(long)*totdoc); a_old = (double *)my_malloc(sizeof(double)*totdoc); aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc); working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_ce0 = (double *)my_malloc(sizeof(double)); qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize *learn_parm->svm_maxqpsize); qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize); weights=(double *)my_malloc(sizeof(double)*(totwords+1)); maxslackid=0; for(i=0;i<totdoc;i++) { /* determine size of slack array */ if(maxslackid<docs[i]->slackid) maxslackid=docs[i]->slackid; } slack=(double *)my_malloc(sizeof(double)*(maxslackid+1)); alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1)); last_suboptimal_at = (long *)my_malloc(sizeof(long)*(maxslackid+1)); for(i=0;i<=maxslackid;i++) { /* init shared slacks */ slack[i]=0; alphaslack[i]=0; last_suboptimal_at[i]=1; } choosenum=0; retrain=1; iteration=1; bestmaxdiffiter=1; bestmaxdiff=999999999; terminate=0; if(kernel_cache) { kernel_cache->time=iteration; /* for lru cache */ kernel_cache_reset_lru(kernel_cache); } for(i=0;i<totdoc;i++) { /* various inits */ chosen[i]=0; unlabeled[i]=0; inconsistent[i]=0; ignore[i]=0; a_old[i]=a[i]; } activenum=compute_index(shrink_state->active,totdoc,active2dnum); inactivenum=totdoc-activenum; clear_index(working2dnum); /* call to init slack and alphaslack */ compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm, slack,alphaslack); /* repeat this loop until we have convergence */ for(;retrain && (!terminate);iteration++) { 
if(kernel_cache) kernel_cache->time=iteration; /* for lru cache */ if(verbosity>=2) { printf( "Iteration %ld: ",iteration); fflush(stdout); } else if(verbosity==1) { printf("."); fflush(stdout); } if(verbosity>=2) t0=get_runtime(); if(verbosity>=3) { printf("\nSelecting working set... "); fflush(stdout); } if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize) learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize; /* select working set according to steepest gradient */ jointstep=0; eq_target=0; if(iteration % 101) { slackset=select_next_qp_slackset(docs,label,a,lin,slack,alphaslack,c, learn_parm,active2dnum,&maxsharedviol); if((iteration % 2) || (!slackset) || (maxsharedviol<learn_parm->epsilon_crit)){ /* do a step with examples from different slack sets */ if(verbosity >= 2) { printf("(i-step)"); fflush(stdout); } i=0; for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear old part of working set */ if((chosen[j]>=(learn_parm->svm_maxqpsize/ minl(learn_parm->svm_maxqpsize, learn_parm->svm_newvarsinqp)))) { chosen[j]=0; choosenum--; } else { chosen[j]++; working2dnum[i++]=j; } } working2dnum[i]=-1; already_chosen=0; if((minl(learn_parm->svm_newvarsinqp, learn_parm->svm_maxqpsize-choosenum)>=4) && (kernel_parm->kernel_type != LINEAR)) { /* select part of the working set from cache */ already_chosen=select_next_qp_subproblem_grad( label,unlabeled,a,lin,c,totdoc, (long)(minl(learn_parm->svm_maxqpsize-choosenum, learn_parm->svm_newvarsinqp) /2), learn_parm,inconsistent,active2dnum, working2dnum,selcrit,selexam,kernel_cache, (long)1,key,chosen); choosenum+=already_chosen; } choosenum+=select_next_qp_subproblem_grad( label,unlabeled,a,lin,c,totdoc, minl(learn_parm->svm_maxqpsize-choosenum, learn_parm->svm_newvarsinqp-already_chosen), learn_parm,inconsistent,active2dnum, working2dnum,selcrit,selexam,kernel_cache, (long)0,key,chosen); } else { /* do a step with all examples from same slack set */ if(verbosity >= 2) { printf("(j-step on %ld)",slackset); fflush(stdout); } jointstep=1; for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */ chosen[j]=0; } working2dnum[0]=-1; eq_target=alphaslack[slackset]; for(j=0;j<totdoc;j++) { /* mask all but slackset */ /* for(jj=0;(j=active2dnum[jj])>=0;jj++) { */ if(docs[j]->slackid != slackset) ignore[j]=1; else { ignore[j]=0; learn_parm->svm_cost[j]=learn_parm->svm_c; /* printf("Inslackset(%ld,%ld)",j,shrink_state->active[j]); */ } } learn_parm->biased_hyperplane=1; choosenum=select_next_qp_subproblem_grad( label,unlabeled,a,lin,c,totdoc, learn_parm->svm_maxqpsize, learn_parm,ignore,active2dnum, working2dnum,selcrit,selexam,kernel_cache, (long)0,key,chosen); learn_parm->biased_hyperplane=0; } } else { /* once in a while, select a somewhat random working set to get unlocked of infinite loops due to numerical inaccuracies in the core qp-solver */ choosenum+=select_next_qp_subproblem_rand( label,unlabeled,a,lin,c,totdoc, minl(learn_parm->svm_maxqpsize-choosenum, learn_parm->svm_newvarsinqp), learn_parm,inconsistent,active2dnum, working2dnum,selcrit,selexam,kernel_cache,key, chosen,iteration); } if(verbosity>=2) { printf(" %ld vectors chosen\n",choosenum); fflush(stdout); } if(verbosity>=2) t1=get_runtime(); if(kernel_cache) cache_multiple_kernel_rows(kernel_cache,docs,working2dnum, choosenum,kernel_parm); if(verbosity>=2) t2=get_runtime(); if(jointstep) learn_parm->biased_hyperplane=1; optimize_svm(docs,label,unlabeled,ignore,eq_target,chosen,active2dnum, model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm, aicache,kernel_parm,&qp,&epsilon_crit_org); 
learn_parm->biased_hyperplane=0; for(jj=0;(i=working2dnum[jj])>=0;jj++) /* recompute sums of alphas */ alphaslack[docs[i]->slackid]+=(a[i]-a_old[i]); for(jj=0;(i=working2dnum[jj])>=0;jj++) { /* reduce alpha to fulfill constraints */ if(alphaslack[docs[i]->slackid] > learn_parm->svm_c) { if(a[i] < (alphaslack[docs[i]->slackid]-learn_parm->svm_c)) { alphaslack[docs[i]->slackid]-=a[i]; a[i]=0; } else { a[i]-=(alphaslack[docs[i]->slackid]-learn_parm->svm_c); alphaslack[docs[i]->slackid]=learn_parm->svm_c; } } } for(jj=0;(i=active2dnum[jj])>=0;jj++) learn_parm->svm_cost[i]=a[i]+(learn_parm->svm_c -alphaslack[docs[i]->slackid]); if(verbosity>=2) t3=get_runtime(); update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc, totwords,kernel_parm,kernel_cache,lin,aicache, weights); compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm, slack,alphaslack); if(verbosity>=2) t4=get_runtime(); supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c, learn_parm,working2dnum,active2dnum,model); if(verbosity>=2) t5=get_runtime(); /* The following computation of the objective function works only */ /* relative to the active variables */ if(verbosity>=3) { criterion=compute_objective_function(a,lin,c,learn_parm->eps,label, active2dnum); printf("Objective function (over active variables): %.16f\n",criterion); fflush(stdout); } for(jj=0;(i=working2dnum[jj])>=0;jj++) { a_old[i]=a[i]; } retrain=check_optimality_sharedslack(docs,model,label,a,lin,c, slack,alphaslack,totdoc,learn_parm, maxdiff,epsilon_crit_org,&misclassified, active2dnum,last_suboptimal_at, iteration,kernel_parm); if(verbosity>=2) { t6=get_runtime(); timing_profile->time_select+=t1-t0; timing_profile->time_kernel+=t2-t1; timing_profile->time_opti+=t3-t2; timing_profile->time_update+=t4-t3; timing_profile->time_model+=t5-t4; timing_profile->time_check+=t6-t5; } /* checking whether optimizer got stuck */ if((*maxdiff) < bestmaxdiff) { bestmaxdiff=(*maxdiff); bestmaxdiffiter=iteration; } if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) { /* long time no progress? */ terminate=1; retrain=0; if(verbosity>=1) printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n"); } noshrink=0; if((!retrain) && (inactivenum>0) && ((!learn_parm->skip_final_opt_check) || (kernel_parm->kernel_type == LINEAR))) { if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) || (verbosity>=2)) { if(verbosity==1) { printf("\n"); } printf(" Checking optimality of inactive variables..."); fflush(stdout); } t1=get_runtime(); reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc, totwords,iteration,learn_parm,inconsistent, docs,kernel_parm,kernel_cache,model,aicache, weights,maxdiff); /* Update to new active variables. 
*/ activenum=compute_index(shrink_state->active,totdoc,active2dnum); inactivenum=totdoc-activenum; /* check optimality, since check in reactivate does not work for sharedslacks */ retrain=check_optimality_sharedslack(docs,model,label,a,lin,c, slack,alphaslack,totdoc,learn_parm, maxdiff,epsilon_crit_org,&misclassified, active2dnum,last_suboptimal_at, iteration,kernel_parm); /* reset watchdog */ bestmaxdiff=(*maxdiff); bestmaxdiffiter=iteration; /* termination criterion */ noshrink=1; retrain=0; if((*maxdiff) > learn_parm->epsilon_crit) retrain=1; timing_profile->time_shrink+=get_runtime()-t1; if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR)) || (verbosity>=2)) { printf("done.\n"); fflush(stdout); printf(" Number of inactive variables = %ld\n",inactivenum); } } if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff))) learn_parm->epsilon_crit=(*maxdiff); if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) { learn_parm->epsilon_crit/=2.0; retrain=1; noshrink=1; } if(learn_parm->epsilon_crit<epsilon_crit_org) learn_parm->epsilon_crit=epsilon_crit_org; if(verbosity>=2) { printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n", supvecnum,model->at_upper_bound,(*maxdiff)); fflush(stdout); } if(verbosity>=3) { printf("\n"); } if(((iteration % 10) == 0) && (!noshrink)) { activenum=shrink_problem(docs,learn_parm,shrink_state, kernel_parm,active2dnum, last_suboptimal_at,iteration,totdoc, maxl((long)(activenum/10), maxl((long)(totdoc/500),100)), a,inconsistent); inactivenum=totdoc-activenum; if((kernel_cache) && (supvecnum>kernel_cache->max_elems) && ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) { kernel_cache_shrink(kernel_cache,totdoc, minl((kernel_cache->activenum-activenum), (kernel_cache->activenum-supvecnum)), shrink_state->active); } } } /* end of loop */ free(alphaslack); free(slack); free(chosen); free(unlabeled); free(inconsistent); free(ignore); free(last_suboptimal_at); free(key); free(selcrit); free(selexam); free(a_old); free(aicache); free(working2dnum); free(active2dnum); free(qp.opt_ce); free(qp.opt_ce0); free(qp.opt_g); free(qp.opt_g0); free(qp.opt_xinit); free(qp.opt_low); free(qp.opt_up); free(weights); learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */ model->maxdiff=(*maxdiff); return(iteration); } double compute_objective_function(double *a, double *lin, double *c, double eps, long int *label, long int *active2dnum) /* Return value of objective function. */ /* Works only relative to the active variables! 
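   The quantity computed is sum_i (eps - y_i c_i) a_i + 0.5 sum_i a_i y_i lin_i,
   restricted to i in active2dnum, where lin_i = sum_j a_j y_j k(x_i,x_j).
   For plain classification (eps = -1, c_i = 0) this is just the negative of
   the usual dual objective, i.e. the minimization form handled by the
   optimizer.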
*/ { long i,ii; double criterion; /* calculate value of objective function */ criterion=0; for(ii=0;active2dnum[ii]>=0;ii++) { i=active2dnum[ii]; criterion=criterion+(eps-(double)label[i]*c[i])*a[i]+0.5*a[i]*label[i]*lin[i]; } return(criterion); } void clear_index(long int *index) /* initializes and empties index */ { index[0]=-1; } void add_to_index(long int *index, long int elem) /* appends element elem at the end of index */ { register long i; for(i=0;index[i] != -1;i++); index[i]=elem; index[i+1]=-1; } long compute_index(long int *binfeature, long int range, long int *index) /* create an inverted index of binfeature */ { register long i,ii; ii=0; for(i=0;i<range;i++) { if(binfeature[i]) { index[ii]=i; ii++; } } for(i=0;i<4;i++) { index[ii+i]=-1; } return(ii); } void optimize_svm(DOC **docs, long int *label, long int *unlabeled, long int *exclude_from_eq_const, double eq_target, long int *chosen, long int *active2dnum, MODEL *model, long int totdoc, long int *working2dnum, long int varnum, double *a, double *lin, double *c, LEARN_PARM *learn_parm, CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp, double *epsilon_crit_target) /* Do optimization on the working set. */ { long i; double *a_v; compute_matrices_for_optimization(docs,label,unlabeled, exclude_from_eq_const,eq_target,chosen, active2dnum,working2dnum,model,a,lin,c, varnum,totdoc,learn_parm,aicache, kernel_parm,qp); if(verbosity>=3) { printf("Running optimizer..."); fflush(stdout); } /* call the qp-subsolver */ a_v=optimize_qp(qp,epsilon_crit_target, learn_parm->svm_maxqpsize, &(model->b), /* in case the optimizer gives us */ /* the threshold for free. otherwise */ /* b is calculated in calculate_model. */ learn_parm); if(verbosity>=3) { printf("done\n"); } for(i=0;i<varnum;i++) { a[working2dnum[i]]=a_v[i]; /* if(a_v[i]<=(0+learn_parm->epsilon_a)) { a[working2dnum[i]]=0; } else if(a_v[i]>=(learn_parm->svm_cost[working2dnum[i]]-learn_parm->epsilon_a)) { a[working2dnum[i]]=learn_parm->svm_cost[working2dnum[i]]; } */ } } void compute_matrices_for_optimization(DOC **docs, long int *label, long int *unlabeled, long *exclude_from_eq_const, double eq_target, long int *chosen, long int *active2dnum, long int *key, MODEL *model, double *a, double *lin, double *c, long int varnum, long int totdoc, LEARN_PARM *learn_parm, CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp) { register long ki,kj,i,j; register double kernel_temp; if(verbosity>=3) { fprintf(stdout,"Computing qp-matrices (type %ld kernel [degree %ld, rbf_gamma %f, coef_lin %f, coef_const %f])...",kernel_parm->kernel_type,kernel_parm->poly_degree,kernel_parm->rbf_gamma,kernel_parm->coef_lin,kernel_parm->coef_const); fflush(stdout); } qp->opt_n=varnum; qp->opt_ce0[0]=-eq_target; /* compute the constant for equality constraint */ for(j=1;j<model->sv_num;j++) { /* start at 1 */ if((!chosen[(model->supvec[j])->docnum]) && (!exclude_from_eq_const[(model->supvec[j])->docnum])) { qp->opt_ce0[0]+=model->alpha[j]; } } if(learn_parm->biased_hyperplane) qp->opt_m=1; else qp->opt_m=0; /* eq-constraint will be ignored */ /* init linear part of objective function */ for(i=0;i<varnum;i++) { qp->opt_g0[i]=lin[key[i]]; } for(i=0;i<varnum;i++) { ki=key[i]; /* Compute the matrix for equality constraints */ qp->opt_ce[i]=label[ki]; qp->opt_low[i]=0; qp->opt_up[i]=learn_parm->svm_cost[ki]; kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[ki]); /* compute linear part of objective function */ qp->opt_g0[i]-=(kernel_temp*a[ki]*(double)label[ki]); /* compute quadratic part of objective function */
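 /* The subproblem Hessian entries are H_ij = y_i*y_j*k(x_i,x_j); since      */
 /* y_i*y_i = 1, the diagonal entry is just k(x_i,x_i) and is stored here,   */
 /* while the off-diagonal entries are filled in symmetrically by the        */
 /* j-loop that follows.                                                     */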
qp->opt_g[varnum*i+i]=kernel_temp; for(j=i+1;j<varnum;j++) { kj=key[j]; kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[kj]); /* compute linear part of objective function */ qp->opt_g0[i]-=(kernel_temp*a[kj]*(double)label[kj]); qp->opt_g0[j]-=(kernel_temp*a[ki]*(double)label[ki]); /* compute quadratic part of objective function */ qp->opt_g[varnum*i+j]=(double)label[ki]*(double)label[kj]*kernel_temp; qp->opt_g[varnum*j+i]=(double)label[ki]*(double)label[kj]*kernel_temp; } if(verbosity>=3) { if(i % 20 == 0) { fprintf(stdout,"%ld..",i); fflush(stdout); } } } for(i=0;i<varnum;i++) { /* assure starting at feasible point */ qp->opt_xinit[i]=a[key[i]]; /* set linear part of objective function */ qp->opt_g0[i]=(learn_parm->eps-(double)label[key[i]]*c[key[i]])+qp->opt_g0[i]*(double)label[key[i]]; } if(verbosity>=3) { fprintf(stdout,"done\n"); } } long calculate_svm_model(DOC **docs, long int *label, long int *unlabeled, double *lin, double *a, double *a_old, double *c, LEARN_PARM *learn_parm, long int *working2dnum, long int *active2dnum, MODEL *model) /* Compute decision function based on current values */ /* of alpha. */ { long i,ii,pos,b_calculated=0,first_low,first_high; double ex_c,b_temp,b_low,b_high; if(verbosity>=3) { printf("Calculating model..."); fflush(stdout); } if(!learn_parm->biased_hyperplane) { model->b=0; b_calculated=1; } for(ii=0;(i=working2dnum[ii])>=0;ii++) { if((a_old[i]>0) && (a[i]==0)) { /* remove from model */ pos=model->index[i]; model->index[i]=-1; (model->sv_num)--; model->supvec[pos]=model->supvec[model->sv_num]; model->alpha[pos]=model->alpha[model->sv_num]; model->index[(model->supvec[pos])->docnum]=pos; } else if((a_old[i]==0) && (a[i]>0)) { /* add to model */ model->supvec[model->sv_num]=docs[i]; model->alpha[model->sv_num]=a[i]*(double)label[i]; model->index[i]=model->sv_num; (model->sv_num)++; } else if(a_old[i]==a[i]) { /* nothing to do */ } else { /* just update alpha */ model->alpha[model->index[i]]=a[i]*(double)label[i]; } ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; if((a_old[i]>=ex_c) && (a[i]<ex_c)) { (model->at_upper_bound)--; } else if((a_old[i]<ex_c) && (a[i]>=ex_c)) { (model->at_upper_bound)++; } if((!b_calculated) && (a[i]>learn_parm->epsilon_a) && (a[i]<ex_c)) { /* calculate b */ model->b=((double)label[i]*learn_parm->eps-c[i]+lin[i]); /* model->b=(-(double)label[i]+lin[i]); */ b_calculated=1; } } /* No alpha in the working set not at bounds, so b was not calculated in the usual way. The following handles this special case. 
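   Each active example then contributes a one-sided constraint on the
   threshold (which side depends on its label and on whether its alpha is at
   the upper bound); the tightest bounds from the two sides are collected in
   b_low and b_high, and b is finally placed in the middle of the remaining
   feasible range.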
*/ if(learn_parm->biased_hyperplane && (!b_calculated) && (model->sv_num-1 == model->at_upper_bound)) { first_low=1; first_high=1; b_low=0; b_high=0; for(ii=0;(i=active2dnum[ii])>=0;ii++) { ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; if(a[i]<ex_c) { if(label[i]>0) { b_temp=-(learn_parm->eps-c[i]+lin[i]); if((b_temp>b_low) || (first_low)) { b_low=b_temp; first_low=0; } } else { b_temp=-(-learn_parm->eps-c[i]+lin[i]); if((b_temp<b_high) || (first_high)) { b_high=b_temp; first_high=0; } } } else { if(label[i]<0) { b_temp=-(-learn_parm->eps-c[i]+lin[i]); if((b_temp>b_low) || (first_low)) { b_low=b_temp; first_low=0; } } else { b_temp=-(learn_parm->eps-c[i]+lin[i]); if((b_temp<b_high) || (first_high)) { b_high=b_temp; first_high=0; } } } } if(first_high) { model->b=-b_low; } else if(first_low) { model->b=-b_high; } else { model->b=-(b_high+b_low)/2.0; /* select b as the middle of range */ /* printf("\nb_low=%f, b_high=%f,b=%f\n",b_low,b_high,model->b); */ } } if(verbosity>=3) { printf("done\n"); fflush(stdout); } return(model->sv_num-1); /* have to substract one, since element 0 is empty*/ } long check_optimality(MODEL *model, long int *label, long int *unlabeled, double *a, double *lin, double *c, long int totdoc, LEARN_PARM *learn_parm, double *maxdiff, double epsilon_crit_org, long int *misclassified, long int *inconsistent, long int *active2dnum, long int *last_suboptimal_at, long int iteration, KERNEL_PARM *kernel_parm) /* Check KT-conditions */ { long i,ii,retrain; double dist,ex_c,target; if(kernel_parm->kernel_type == LINEAR) { /* be optimistic */ learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org; } else { /* be conservative */ learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3; } retrain=0; (*maxdiff)=0; (*misclassified)=0; for(ii=0;(i=active2dnum[ii])>=0;ii++) { if((!inconsistent[i]) && label[i]) { dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from hyperplane*/ target=-(learn_parm->eps-(double)label[i]*c[i]); ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; if(dist <= 0) { (*misclassified)++; /* does not work due to deactivation of var */ } if((a[i]>learn_parm->epsilon_a) && (dist > target)) { if((dist-target)>(*maxdiff)) /* largest violation */ (*maxdiff)=dist-target; } else if((a[i]<ex_c) && (dist < target)) { if((target-dist)>(*maxdiff)) /* largest violation */ (*maxdiff)=target-dist; } /* Count how long a variable was at lower/upper bound (and optimal).*/ /* Variables, which were at the bound and optimal for a long */ /* time are unlikely to become support vectors. In case our */ /* cache is filled up, those variables are excluded to save */ /* kernel evaluations. 
(See chapter 'Shrinking').*/ if((a[i]>(learn_parm->epsilon_a)) && (a[i]<ex_c)) { last_suboptimal_at[i]=iteration; /* not at bound */ } else if((a[i]<=(learn_parm->epsilon_a)) && (dist < (target+learn_parm->epsilon_shrink))) { last_suboptimal_at[i]=iteration; /* not likely optimal */ } else if((a[i]>=ex_c) && (dist > (target-learn_parm->epsilon_shrink))) { last_suboptimal_at[i]=iteration; /* not likely optimal */ } } } /* termination criterion */ if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) { retrain=1; } return(retrain); } long check_optimality_sharedslack(DOC **docs, MODEL *model, long int *label, double *a, double *lin, double *c, double *slack, double *alphaslack, long int totdoc, LEARN_PARM *learn_parm, double *maxdiff, double epsilon_crit_org, long int *misclassified, long int *active2dnum, long int *last_suboptimal_at, long int iteration, KERNEL_PARM *kernel_parm) /* Check KT-conditions */ { long i,ii,retrain; double dist,ex_c=0,target; if(kernel_parm->kernel_type == LINEAR) { /* be optimistic */ learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org; } else { /* be conservative */ learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3; } retrain=0; (*maxdiff)=0; (*misclassified)=0; for(ii=0;(i=active2dnum[ii])>=0;ii++) { /* 'distance' from hyperplane*/ dist=(lin[i]-model->b)*(double)label[i]+slack[docs[i]->slackid]; target=-(learn_parm->eps-(double)label[i]*c[i]); ex_c=learn_parm->svm_c-learn_parm->epsilon_a; if((a[i]>learn_parm->epsilon_a) && (dist > target)) { if((dist-target)>(*maxdiff)) { /* largest violation */ (*maxdiff)=dist-target; if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]); if(verbosity>=5) printf(" (single %f)\n",(*maxdiff)); } } if((alphaslack[docs[i]->slackid]<ex_c) && (slack[docs[i]->slackid]>0)) { if((slack[docs[i]->slackid])>(*maxdiff)) { /* largest violation */ (*maxdiff)=slack[docs[i]->slackid]; if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]); if(verbosity>=5) printf(" (joint %f)\n",(*maxdiff)); } } /* Count how long a variable was at lower/upper bound (and optimal).*/ /* Variables, which were at the bound and optimal for a long */ /* time are unlikely to become support vectors. In case our */ /* cache is filled up, those variables are excluded to save */ /* kernel evaluations. 
(See chapter 'Shrinking').*/ if((a[i]>(learn_parm->epsilon_a)) && (a[i]<ex_c)) { last_suboptimal_at[docs[i]->slackid]=iteration; /* not at bound */ } else if((a[i]<=(learn_parm->epsilon_a)) && (dist < (target+learn_parm->epsilon_shrink))) { last_suboptimal_at[docs[i]->slackid]=iteration; /* not likely optimal */ } else if((a[i]>=ex_c) && (slack[docs[i]->slackid] < learn_parm->epsilon_shrink)) { last_suboptimal_at[docs[i]->slackid]=iteration; /* not likely optimal */ } } /* termination criterion */ if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) { retrain=1; } return(retrain); } void compute_shared_slacks(DOC **docs, long int *label, double *a, double *lin, double *c, long int *active2dnum, LEARN_PARM *learn_parm, double *slack, double *alphaslack) /* compute the value of shared slacks and the joint alphas */ { long jj,i; double dist,target; for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* clear slack variables */ slack[docs[i]->slackid]=0.0; alphaslack[docs[i]->slackid]=0.0; } for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* recompute slack variables */ dist=(lin[i])*(double)label[i]; target=-(learn_parm->eps-(double)label[i]*c[i]); if((target-dist) > slack[docs[i]->slackid]) slack[docs[i]->slackid]=target-dist; alphaslack[docs[i]->slackid]+=a[i]; } } long identify_inconsistent(double *a, long int *label, long int *unlabeled, long int totdoc, LEARN_PARM *learn_parm, long int *inconsistentnum, long int *inconsistent) { long i,retrain; /* Throw out examples with multipliers at upper bound. This */ /* corresponds to the -i 1 option. */ /* ATTENTION: this is just a heuristic for finding a close */ /* to minimum number of examples to exclude to */ /* make the problem separable with desired margin */ retrain=0; for(i=0;i<totdoc;i++) { if((!inconsistent[i]) && (!unlabeled[i]) && (a[i]>=(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) { (*inconsistentnum)++; inconsistent[i]=1; /* never choose again */ retrain=2; /* start over */ if(verbosity>=3) { printf("inconsistent(%ld)..",i); fflush(stdout); } } } return(retrain); } long identify_misclassified(double *lin, long int *label, long int *unlabeled, long int totdoc, MODEL *model, long int *inconsistentnum, long int *inconsistent) { long i,retrain; double dist; /* Throw out misclassified examples. This */ /* corresponds to the -i 2 option. */ /* ATTENTION: this is just a heuristic for finding a close */ /* to minimum number of examples to exclude to */ /* make the problem separable with desired margin */ retrain=0; for(i=0;i<totdoc;i++) { dist=(lin[i]-model->b)*(double)label[i]; /* 'distance' from hyperplane*/ if((!inconsistent[i]) && (!unlabeled[i]) && (dist <= 0)) { (*inconsistentnum)++; inconsistent[i]=1; /* never choose again */ retrain=2; /* start over */ if(verbosity>=3) { printf("inconsistent(%ld)..",i); fflush(stdout); } } } return(retrain); } long identify_one_misclassified(double *lin, long int *label, long int *unlabeled, long int totdoc, MODEL *model, long int *inconsistentnum, long int *inconsistent) { long i,retrain,maxex=-1; double dist,maxdist=0; /* Throw out the 'most misclassified' example. This */ /* corresponds to the -i 3 option. 
*/ /* ATTENTION: this is just a heuristic for finding a close */ /* to minimum number of examples to exclude to */ /* make the problem separable with desired margin */ retrain=0; for(i=0;i<totdoc;i++) { if((!inconsistent[i]) && (!unlabeled[i])) { dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from hyperplane*/ if(dist<maxdist) { maxdist=dist; maxex=i; } } } if(maxex>=0) { (*inconsistentnum)++; inconsistent[maxex]=1; /* never choose again */ retrain=2; /* start over */ if(verbosity>=3) { printf("inconsistent(%ld)..",i); fflush(stdout); } } return(retrain); } void update_linear_component(DOC **docs, long int *label, long int *active2dnum, double *a, double *a_old, long int *working2dnum, long int totdoc, long int totwords, KERNEL_PARM *kernel_parm, KERNEL_CACHE *kernel_cache, double *lin, CFLOAT *aicache, double *weights) /* keep track of the linear component */ /* lin of the gradient etc. by updating */ /* based on the change of the variables */ /* in the current working set */ { register long i,ii,j,jj; register double tec; SVECTOR *f; if(kernel_parm->kernel_type==0) { /* special linear case */ clear_vector_n(weights,totwords); for(ii=0;(i=working2dnum[ii])>=0;ii++) { if(a[i] != a_old[i]) { for(f=docs[i]->fvec;f;f=f->next) add_vector_ns(weights,f, f->factor*((a[i]-a_old[i])*(double)label[i])); } } for(jj=0;(j=active2dnum[jj])>=0;jj++) { for(f=docs[j]->fvec;f;f=f->next) lin[j]+=f->factor*sprod_ns(weights,f); } } else { /* general case */ for(jj=0;(i=working2dnum[jj])>=0;jj++) { if(a[i] != a_old[i]) { get_kernel_row(kernel_cache,docs,i,totdoc,active2dnum,aicache, kernel_parm); for(ii=0;(j=active2dnum[ii])>=0;ii++) { tec=aicache[j]; lin[j]+=(((a[i]*tec)-(a_old[i]*tec))*(double)label[i]); } } } } } long incorporate_unlabeled_examples(MODEL *model, long int *label, long int *inconsistent, long int *unlabeled, double *a, double *lin, long int totdoc, double *selcrit, long int *select, long int *key, long int transductcycle, KERNEL_PARM *kernel_parm, LEARN_PARM *learn_parm) { long i,j,k,j1,j2,j3,j4,unsupaddnum1=0,unsupaddnum2=0; long pos,neg,upos,uneg,orgpos,orgneg,nolabel,newpos,newneg,allunlab; double dist,model_length,posratio,negratio; long check_every=2; double loss; static double switchsens=0.0,switchsensorg=0.0; double umin,umax,sumalpha; long imin=0,imax=0; static long switchnum=0; switchsens/=1.2; /* assumes that lin[] is up to date -> no inactive vars */ orgpos=0; orgneg=0; newpos=0; newneg=0; nolabel=0; allunlab=0; for(i=0;i<totdoc;i++) { if(!unlabeled[i]) { if(label[i] > 0) { orgpos++; } else { orgneg++; } } else { allunlab++; if(unlabeled[i]) { if(label[i] > 0) { newpos++; } else if(label[i] < 0) { newneg++; } } } if(label[i]==0) { nolabel++; } } if(learn_parm->transduction_posratio >= 0) { posratio=learn_parm->transduction_posratio; } else { posratio=(double)orgpos/(double)(orgpos+orgneg); /* use ratio of pos/neg */ } /* in training data */ negratio=1.0-posratio; learn_parm->svm_costratio=1.0; /* global */ if(posratio>0) { learn_parm->svm_costratio_unlab=negratio/posratio; } else { learn_parm->svm_costratio_unlab=1.0; } pos=0; neg=0; upos=0; uneg=0; for(i=0;i<totdoc;i++) { dist=(lin[i]-model->b); /* 'distance' from hyperplane*/ if(dist>0) { pos++; } else { neg++; } if(unlabeled[i]) { if(dist>0) { upos++; } else { uneg++; } } if((!unlabeled[i]) && (a[i]>(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) { /* printf("Ubounded %ld (class %ld, unlabeled %ld)\n",i,label[i],unlabeled[i]); */ } } if(verbosity>=2) { printf("POS=%ld, ORGPOS=%ld, ORGNEG=%ld\n",pos,orgpos,orgneg); 
printf("POS=%ld, NEWPOS=%ld, NEWNEG=%ld\n",pos,newpos,newneg); printf("pos ratio = %f (%f).\n",(double)(upos)/(double)(allunlab),posratio); fflush(stdout); } if(transductcycle == 0) { j1=0; j2=0; j4=0; for(i=0;i<totdoc;i++) { dist=(lin[i]-model->b); /* 'distance' from hyperplane*/ if((label[i]==0) && (unlabeled[i])) { selcrit[j4]=dist; key[j4]=i; j4++; } } unsupaddnum1=0; unsupaddnum2=0; select_top_n(selcrit,j4,select,(long)(allunlab*posratio+0.5)); for(k=0;(k<(long)(allunlab*posratio+0.5));k++) { i=key[select[k]]; label[i]=1; unsupaddnum1++; j1++; } for(i=0;i<totdoc;i++) { if((label[i]==0) && (unlabeled[i])) { label[i]=-1; j2++; unsupaddnum2++; } } for(i=0;i<totdoc;i++) { /* set upper bounds on vars */ if(unlabeled[i]) { if(label[i] == 1) { learn_parm->svm_cost[i]=learn_parm->svm_c* learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound; } else if(label[i] == -1) { learn_parm->svm_cost[i]=learn_parm->svm_c* learn_parm->svm_unlabbound; } } } if(verbosity>=1) { /* printf("costratio %f, costratio_unlab %f, unlabbound %f\n", learn_parm->svm_costratio,learn_parm->svm_costratio_unlab, learn_parm->svm_unlabbound); */ printf("Classifying unlabeled data as %ld POS / %ld NEG.\n", unsupaddnum1,unsupaddnum2); fflush(stdout); } if(verbosity >= 1) printf("Retraining."); if(verbosity >= 2) printf("\n"); return((long)3); } if((transductcycle % check_every) == 0) { if(verbosity >= 1) printf("Retraining."); if(verbosity >= 2) printf("\n"); j1=0; j2=0; unsupaddnum1=0; unsupaddnum2=0; for(i=0;i<totdoc;i++) { if((unlabeled[i] == 2)) { unlabeled[i]=1; label[i]=1; j1++; unsupaddnum1++; } else if((unlabeled[i] == 3)) { unlabeled[i]=1; label[i]=-1; j2++; unsupaddnum2++; } } for(i=0;i<totdoc;i++) { /* set upper bounds on vars */ if(unlabeled[i]) { if(label[i] == 1) { learn_parm->svm_cost[i]=learn_parm->svm_c* learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound; } else if(label[i] == -1) { learn_parm->svm_cost[i]=learn_parm->svm_c* learn_parm->svm_unlabbound; } } } if(verbosity>=2) { /* printf("costratio %f, costratio_unlab %f, unlabbound %f\n", learn_parm->svm_costratio,learn_parm->svm_costratio_unlab, learn_parm->svm_unlabbound); */ printf("%ld positive -> Added %ld POS / %ld NEG unlabeled examples.\n", upos,unsupaddnum1,unsupaddnum2); fflush(stdout); } if(learn_parm->svm_unlabbound == 1) { learn_parm->epsilon_crit=0.001; /* do the last run right */ } else { learn_parm->epsilon_crit=0.01; /* otherwise, no need to be so picky */ } return((long)3); } else if(((transductcycle % check_every) < check_every)) { model_length=0; sumalpha=0; loss=0; for(i=0;i<totdoc;i++) { model_length+=a[i]*label[i]*lin[i]; sumalpha+=a[i]; dist=(lin[i]-model->b); /* 'distance' from hyperplane*/ if((label[i]*dist)<(1.0-learn_parm->epsilon_crit)) { loss+=(1.0-(label[i]*dist))*learn_parm->svm_cost[i]; } } model_length=sqrt(model_length); if(verbosity>=2) { printf("Model-length = %f (%f), loss = %f, objective = %f\n", model_length,sumalpha,loss,loss+0.5*model_length*model_length); fflush(stdout); } j1=0; j2=0; j3=0; j4=0; unsupaddnum1=0; unsupaddnum2=0; umin=99999; umax=-99999; j4=1; while(j4) { umin=99999; umax=-99999; for(i=0;(i<totdoc);i++) { dist=(lin[i]-model->b); if((label[i]>0) && (unlabeled[i]) && (!inconsistent[i]) && (dist<umin)) { umin=dist; imin=i; } if((label[i]<0) && (unlabeled[i]) && (!inconsistent[i]) && (dist>umax)) { umax=dist; imax=i; } } if((umin < (umax+switchsens-1E-4))) { j1++; j2++; unsupaddnum1++; unlabeled[imin]=3; inconsistent[imin]=1; unsupaddnum2++; unlabeled[imax]=2; inconsistent[imax]=1; } else j4=0; 
j4=0; } for(j=0;(j<totdoc);j++) { if(unlabeled[j] && (!inconsistent[j])) { if(label[j]>0) { unlabeled[j]=2; } else if(label[j]<0) { unlabeled[j]=3; } /* inconsistent[j]=1; */ j3++; } } switchnum+=unsupaddnum1+unsupaddnum2; /* stop and print out current margin printf("switchnum %ld %ld\n",switchnum,kernel_parm->poly_degree); if(switchnum == 2*kernel_parm->poly_degree) { learn_parm->svm_unlabbound=1; } */ if((!unsupaddnum1) && (!unsupaddnum2)) { if((learn_parm->svm_unlabbound>=1) && ((newpos+newneg) == allunlab)) { for(j=0;(j<totdoc);j++) { inconsistent[j]=0; if(unlabeled[j]) unlabeled[j]=1; } write_prediction(learn_parm->predfile,model,lin,a,unlabeled,label, totdoc,learn_parm); if(verbosity>=1) printf("Number of switches: %ld\n",switchnum); return((long)0); } switchsens=switchsensorg; learn_parm->svm_unlabbound*=1.5; if(learn_parm->svm_unlabbound>1) { learn_parm->svm_unlabbound=1; } model->at_upper_bound=0; /* since upper bound increased */ if(verbosity>=1) printf("Increasing influence of unlabeled examples to %f%% .", learn_parm->svm_unlabbound*100.0); } else if(verbosity>=1) { printf("%ld positive -> Switching labels of %ld POS / %ld NEG unlabeled examples.", upos,unsupaddnum1,unsupaddnum2); fflush(stdout); } if(verbosity >= 2) printf("\n"); learn_parm->epsilon_crit=0.5; /* don't need to be so picky */ for(i=0;i<totdoc;i++) { /* set upper bounds on vars */ if(unlabeled[i]) { if(label[i] == 1) { learn_parm->svm_cost[i]=learn_parm->svm_c* learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound; } else if(label[i] == -1) { learn_parm->svm_cost[i]=learn_parm->svm_c* learn_parm->svm_unlabbound; } } } return((long)2); } return((long)0); } /*************************** Working set selection ***************************/ long select_next_qp_subproblem_grad(long int *label, long int *unlabeled, double *a, double *lin, double *c, long int totdoc, long int qp_size, LEARN_PARM *learn_parm, long int *inconsistent, long int *active2dnum, long int *working2dnum, double *selcrit, long int *select, KERNEL_CACHE *kernel_cache, long int cache_only, long int *key, long int *chosen) /* Use the feasible direction approach to select the next qp-subproblem (see chapter 'Selecting a good working set'). If 'cache_only' is true, then the variables are selected only among those for which the kernel evaluations are cached. 
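   The criterion evaluated for a candidate i is y_i*(eps - y_i*c_i + y_i*lin_i);
   the first pass below picks up to qp_size/2 variables with the largest
   criterion for the step direction s = -y_i, and the second pass picks up to
   qp_size/2 more (with the sign of the criterion flipped) for s = +y_i.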
*/ { long choosenum,i,j,k,activedoc,inum,valid; double s; for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */ choosenum=0; activedoc=0; for(i=0;(j=active2dnum[i])>=0;i++) { s=-label[j]; if(kernel_cache && cache_only) valid=(kernel_cache->index[j]>=0); else valid=1; if(valid && (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0))) && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) && (s>0))) && (!chosen[j]) && (label[j]) && (!inconsistent[j])) { selcrit[activedoc]=(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]); /* selcrit[activedoc]=(double)label[j]*(-1.0+(double)label[j]*lin[j]); */ key[activedoc]=j; activedoc++; } } select_top_n(selcrit,activedoc,select,(long)(qp_size/2)); for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) { /* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */ i=key[select[k]]; chosen[i]=1; working2dnum[inum+choosenum]=i; choosenum+=1; if(kernel_cache) kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked out of cache */ /* } */ } activedoc=0; for(i=0;(j=active2dnum[i])>=0;i++) { s=label[j]; if(kernel_cache && cache_only) valid=(kernel_cache->index[j]>=0); else valid=1; if(valid && (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0))) && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) && (s>0))) && (!chosen[j]) && (label[j]) && (!inconsistent[j])) { selcrit[activedoc]=-(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]); /* selcrit[activedoc]=-(double)(label[j]*(-1.0+(double)label[j]*lin[j])); */ key[activedoc]=j; activedoc++; } } select_top_n(selcrit,activedoc,select,(long)(qp_size/2)); for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) { /* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */ i=key[select[k]]; chosen[i]=1; working2dnum[inum+choosenum]=i; choosenum+=1; if(kernel_cache) kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked out of cache */ /* } */ } working2dnum[inum+choosenum]=-1; /* complete index */ return(choosenum); } long select_next_qp_subproblem_rand(long int *label, long int *unlabeled, double *a, double *lin, double *c, long int totdoc, long int qp_size, LEARN_PARM *learn_parm, long int *inconsistent, long int *active2dnum, long int *working2dnum, double *selcrit, long int *select, KERNEL_CACHE *kernel_cache, long int *key, long int *chosen, long int iteration) /* Use the feasible direction approach to select the next qp-subproblem (see section 'Selecting a good working set'). Chooses a feasible direction at (pseudo) random to help jump over numerical problem. 
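   The pseudo-random criterion is simply (docnum + iteration) mod totdoc, so
   the preferred examples shift from call to call without the need for a
   random number generator.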
*/ { long choosenum,i,j,k,activedoc,inum; double s; for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */ choosenum=0; activedoc=0; for(i=0;(j=active2dnum[i])>=0;i++) { s=-label[j]; if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0))) && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) && (s>0))) && (!inconsistent[j]) && (label[j]) && (!chosen[j])) { selcrit[activedoc]=(j+iteration) % totdoc; key[activedoc]=j; activedoc++; } } select_top_n(selcrit,activedoc,select,(long)(qp_size/2)); for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) { i=key[select[k]]; chosen[i]=1; working2dnum[inum+choosenum]=i; choosenum+=1; kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */ /* out of cache */ } activedoc=0; for(i=0;(j=active2dnum[i])>=0;i++) { s=label[j]; if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0))) && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a)) && (s>0))) && (!inconsistent[j]) && (label[j]) && (!chosen[j])) { selcrit[activedoc]=(j+iteration) % totdoc; key[activedoc]=j; activedoc++; } } select_top_n(selcrit,activedoc,select,(long)(qp_size/2)); for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) { i=key[select[k]]; chosen[i]=1; working2dnum[inum+choosenum]=i; choosenum+=1; kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */ /* out of cache */ } working2dnum[inum+choosenum]=-1; /* complete index */ return(choosenum); } long select_next_qp_slackset(DOC **docs, long int *label, double *a, double *lin, double *slack, double *alphaslack, double *c, LEARN_PARM *learn_parm, long int *active2dnum, double *maxviol) /* returns the slackset with the largest internal violation */ { long i,ii,maxdiffid; double dist,target,maxdiff,ex_c; maxdiff=0; maxdiffid=0; for(ii=0;(i=active2dnum[ii])>=0;ii++) { ex_c=learn_parm->svm_c-learn_parm->epsilon_a; if(alphaslack[docs[i]->slackid] >= ex_c) { dist=(lin[i])*(double)label[i]+slack[docs[i]->slackid]; /* distance */ target=-(learn_parm->eps-(double)label[i]*c[i]); /* rhs of constraint */ if((a[i]>learn_parm->epsilon_a) && (dist > target)) { if((dist-target)>maxdiff) { /* largest violation */ maxdiff=dist-target; maxdiffid=docs[i]->slackid; } } } } (*maxviol)=maxdiff; return(maxdiffid); } void select_top_n(double *selcrit, long int range, long int *select, long int n) { register long i,j; for(i=0;(i<n) && (i<range);i++) { /* Initialize with the first n elements */ for(j=i;j>=0;j--) { if((j>0) && (selcrit[select[j-1]]<selcrit[i])){ select[j]=select[j-1]; } else { select[j]=i; j=-1; } } } if(n>0) { for(i=n;i<range;i++) { if(selcrit[i]>selcrit[select[n-1]]) { for(j=n-1;j>=0;j--) { if((j>0) && (selcrit[select[j-1]]<selcrit[i])) { select[j]=select[j-1]; } else { select[j]=i; j=-1; } } } } } } /******************************** Shrinking *********************************/ void init_shrink_state(SHRINK_STATE *shrink_state, long int totdoc, long int maxhistory) { long i; shrink_state->deactnum=0; shrink_state->active = (long *)my_malloc(sizeof(long)*totdoc); shrink_state->inactive_since = (long *)my_malloc(sizeof(long)*totdoc); shrink_state->a_history = (double **)my_malloc(sizeof(double *)*maxhistory); shrink_state->maxhistory=maxhistory; shrink_state->last_lin = (double *)my_malloc(sizeof(double)*totdoc); shrink_state->last_a = (double *)my_malloc(sizeof(double)*totdoc); for(i=0;i<totdoc;i++) { shrink_state->active[i]=1; shrink_state->inactive_since[i]=0; shrink_state->last_a[i]=0; shrink_state->last_lin[i]=0; } } void shrink_state_cleanup(SHRINK_STATE 
*shrink_state) { free(shrink_state->active); free(shrink_state->inactive_since); if(shrink_state->deactnum > 0) free(shrink_state->a_history[shrink_state->deactnum-1]); free(shrink_state->a_history); free(shrink_state->last_a); free(shrink_state->last_lin); } long shrink_problem(DOC **docs, LEARN_PARM *learn_parm, SHRINK_STATE *shrink_state, KERNEL_PARM *kernel_parm, long int *active2dnum, long int *last_suboptimal_at, long int iteration, long int totdoc, long int minshrink, double *a, long int *inconsistent) /* Shrink some variables away. Do the shrinking only if at least minshrink variables can be removed. */ { long i,ii,change,activenum,lastiter; double *a_old; activenum=0; change=0; for(ii=0;active2dnum[ii]>=0;ii++) { i=active2dnum[ii]; activenum++; if(learn_parm->sharedslack) lastiter=last_suboptimal_at[docs[i]->slackid]; else lastiter=last_suboptimal_at[i]; if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink) || (inconsistent[i])) { change++; } } if((change>=minshrink) /* shrink only if sufficiently many candidates */ && (shrink_state->deactnum<shrink_state->maxhistory)) { /* and enough memory */ /* Shrink problem by removing those variables which are */ /* optimal at a bound for a minimum number of iterations */ if(verbosity>=2) { printf(" Shrinking..."); fflush(stdout); } if(kernel_parm->kernel_type != LINEAR) { /* non-linear case save alphas */ a_old=(double *)my_malloc(sizeof(double)*totdoc); shrink_state->a_history[shrink_state->deactnum]=a_old; for(i=0;i<totdoc;i++) { a_old[i]=a[i]; } } for(ii=0;active2dnum[ii]>=0;ii++) { i=active2dnum[ii]; if(learn_parm->sharedslack) lastiter=last_suboptimal_at[docs[i]->slackid]; else lastiter=last_suboptimal_at[i]; if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink) || (inconsistent[i])) { shrink_state->active[i]=0; shrink_state->inactive_since[i]=shrink_state->deactnum; } } activenum=compute_index(shrink_state->active,totdoc,active2dnum); shrink_state->deactnum++; if(kernel_parm->kernel_type == LINEAR) { shrink_state->deactnum=0; } if(verbosity>=2) { printf("done.\n"); fflush(stdout); printf(" Number of inactive variables = %ld\n",totdoc-activenum); } } return(activenum); } void reactivate_inactive_examples(long int *label, long int *unlabeled, double *a, SHRINK_STATE *shrink_state, double *lin, double *c, long int totdoc, long int totwords, long int iteration, LEARN_PARM *learn_parm, long int *inconsistent, DOC **docs, KERNEL_PARM *kernel_parm, KERNEL_CACHE *kernel_cache, MODEL *model, CFLOAT *aicache, double *weights, double *maxdiff) /* Make all variables active again which had been removed by shrinking. */ /* Computes lin for those variables from scratch. 
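   For the linear kernel this folds the alpha changes since the last call into
   the weight vector and re-evaluates the previously inactive examples against
   it; for non-linear kernels the alpha snapshots stored in
   shrink_state->a_history at each shrinking step are replayed, so kernel rows
   are only needed for variables whose alpha actually changed.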
*/ { register long i,j,ii,jj,t,*changed2dnum,*inactive2dnum; long *changed,*inactive; register double kernel_val,*a_old,dist; double ex_c,target; SVECTOR *f; if(kernel_parm->kernel_type == LINEAR) { /* special linear case */ a_old=shrink_state->last_a; clear_vector_n(weights,totwords); for(i=0;i<totdoc;i++) { if(a[i] != a_old[i]) { for(f=docs[i]->fvec;f;f=f->next) add_vector_ns(weights,f, f->factor*((a[i]-a_old[i])*(double)label[i])); a_old[i]=a[i]; } } for(i=0;i<totdoc;i++) { if(!shrink_state->active[i]) { for(f=docs[i]->fvec;f;f=f->next) lin[i]=shrink_state->last_lin[i]+f->factor*sprod_ns(weights,f); } shrink_state->last_lin[i]=lin[i]; } } else { changed=(long *)my_malloc(sizeof(long)*totdoc); changed2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11)); inactive=(long *)my_malloc(sizeof(long)*totdoc); inactive2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11)); for(t=shrink_state->deactnum-1;(t>=0) && shrink_state->a_history[t];t--) { if(verbosity>=2) { printf("%ld..",t); fflush(stdout); } a_old=shrink_state->a_history[t]; for(i=0;i<totdoc;i++) { inactive[i]=((!shrink_state->active[i]) && (shrink_state->inactive_since[i] == t)); changed[i]= (a[i] != a_old[i]); } compute_index(inactive,totdoc,inactive2dnum); compute_index(changed,totdoc,changed2dnum); for(ii=0;(i=changed2dnum[ii])>=0;ii++) { get_kernel_row(kernel_cache,docs,i,totdoc,inactive2dnum,aicache, kernel_parm); for(jj=0;(j=inactive2dnum[jj])>=0;jj++) { kernel_val=aicache[j]; lin[j]+=(((a[i]*kernel_val)-(a_old[i]*kernel_val))*(double)label[i]); } } } free(changed); free(changed2dnum); free(inactive); free(inactive2dnum); } (*maxdiff)=0; for(i=0;i<totdoc;i++) { shrink_state->inactive_since[i]=shrink_state->deactnum-1; if(!inconsistent[i]) { dist=(lin[i]-model->b)*(double)label[i]; target=-(learn_parm->eps-(double)label[i]*c[i]); ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a; if((a[i]>learn_parm->epsilon_a) && (dist > target)) { if((dist-target)>(*maxdiff)) /* largest violation */ (*maxdiff)=dist-target; } else if((a[i]<ex_c) && (dist < target)) { if((target-dist)>(*maxdiff)) /* largest violation */ (*maxdiff)=target-dist; } if((a[i]>(0+learn_parm->epsilon_a)) && (a[i]<ex_c)) { shrink_state->active[i]=1; /* not at bound */ } else if((a[i]<=(0+learn_parm->epsilon_a)) && (dist < (target+learn_parm->epsilon_shrink))) { shrink_state->active[i]=1; } else if((a[i]>=ex_c) && (dist > (target-learn_parm->epsilon_shrink))) { shrink_state->active[i]=1; } else if(learn_parm->sharedslack) { /* make all active when sharedslack */ shrink_state->active[i]=1; } } } if(kernel_parm->kernel_type != LINEAR) { /* update history for non-linear */ for(i=0;i<totdoc;i++) { (shrink_state->a_history[shrink_state->deactnum-1])[i]=a[i]; } for(t=shrink_state->deactnum-2;(t>=0) && shrink_state->a_history[t];t--) { free(shrink_state->a_history[t]); shrink_state->a_history[t]=0; } } } /****************************** Cache handling *******************************/ void get_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs, long int docnum, long int totdoc, long int *active2dnum, CFLOAT *buffer, KERNEL_PARM *kernel_parm) /* Get's a row of the matrix of kernel values This matrix has the same form as the Hessian, just that the elements are not multiplied by */ /* y_i * y_j * a_i * a_j */ /* Takes the values from the cache if available. */ { register long i,j,start; DOC *ex; ex=docs[docnum]; if(kernel_cache->index[docnum] != -1) { /* row is cached? 
*/ kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */ start=kernel_cache->activenum*kernel_cache->index[docnum]; for(i=0;(j=active2dnum[i])>=0;i++) { if(kernel_cache->totdoc2active[j] >= 0) { /* column is cached? */ buffer[j]=kernel_cache->buffer[start+kernel_cache->totdoc2active[j]]; } else { buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]); } } } else { for(i=0;(j=active2dnum[i])>=0;i++) { buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]); } } } void cache_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs, long int m, KERNEL_PARM *kernel_parm) /* Fills cache for the row m */ { register DOC *ex; register long j,k,l; register CFLOAT *cache; if(!kernel_cache_check(kernel_cache,m)) { /* not cached yet*/ cache = kernel_cache_clean_and_malloc(kernel_cache,m); if(cache) { l=kernel_cache->totdoc2active[m]; ex=docs[m]; for(j=0;j<kernel_cache->activenum;j++) { /* fill cache */ k=kernel_cache->active2totdoc[j]; if((kernel_cache->index[k] != -1) && (l != -1) && (k != m)) { cache[j]=kernel_cache->buffer[kernel_cache->activenum *kernel_cache->index[k]+l]; } else { cache[j]=kernel(kernel_parm,ex,docs[k]); } } } else { perror("Error: Kernel cache full! => increase cache size"); } } } void cache_multiple_kernel_rows(KERNEL_CACHE *kernel_cache, DOC **docs, long int *key, long int varnum, KERNEL_PARM *kernel_parm) /* Fills cache for the rows in key */ { register long i; for(i=0;i<varnum;i++) { /* fill up kernel cache */ cache_kernel_row(kernel_cache,docs,key[i],kernel_parm); } } void kernel_cache_shrink(KERNEL_CACHE *kernel_cache, long int totdoc, long int numshrink, long int *after) /* Remove numshrink columns in the cache which correspond to examples marked 0 in after. */ { register long i,j,jj,from=0,to=0,scount; long *keep; if(verbosity>=2) { printf(" Reorganizing cache..."); fflush(stdout); } keep=(long *)my_malloc(sizeof(long)*totdoc); for(j=0;j<totdoc;j++) { keep[j]=1; } scount=0; for(jj=0;(jj<kernel_cache->activenum) && (scount<numshrink);jj++) { j=kernel_cache->active2totdoc[jj]; if(!after[j]) { scount++; keep[j]=0; } } for(i=0;i<kernel_cache->max_elems;i++) { for(jj=0;jj<kernel_cache->activenum;jj++) { j=kernel_cache->active2totdoc[jj]; if(!keep[j]) { from++; } else { kernel_cache->buffer[to]=kernel_cache->buffer[from]; to++; from++; } } } kernel_cache->activenum=0; for(j=0;j<totdoc;j++) { if((keep[j]) && (kernel_cache->totdoc2active[j] != -1)) { kernel_cache->active2totdoc[kernel_cache->activenum]=j; kernel_cache->totdoc2active[j]=kernel_cache->activenum; kernel_cache->activenum++; } else { kernel_cache->totdoc2active[j]=-1; } } kernel_cache->max_elems=(long)(kernel_cache->buffsize/kernel_cache->activenum); if(kernel_cache->max_elems>totdoc) { kernel_cache->max_elems=totdoc; } free(keep); if(verbosity>=2) { printf("done.\n"); fflush(stdout); printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems); } } KERNEL_CACHE *kernel_cache_init(long int totdoc, long int buffsize) { long i; KERNEL_CACHE *kernel_cache; kernel_cache=(KERNEL_CACHE *)my_malloc(sizeof(KERNEL_CACHE)); kernel_cache->index = (long *)my_malloc(sizeof(long)*totdoc); kernel_cache->occu = (long *)my_malloc(sizeof(long)*totdoc); kernel_cache->lru = (long *)my_malloc(sizeof(long)*totdoc); kernel_cache->invindex = (long *)my_malloc(sizeof(long)*totdoc); kernel_cache->active2totdoc = (long *)my_malloc(sizeof(long)*totdoc); kernel_cache->totdoc2active = (long *)my_malloc(sizeof(long)*totdoc); kernel_cache->buffer = (CFLOAT *)my_malloc((size_t)(buffsize)*1024*1024); 
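 /* Layout of the cache: 'buffer' holds up to max_elems rows of 'activenum'  */
 /* CFLOATs each; 'index[docnum]' gives the row in which a document is       */
 /* cached (-1 if it is not), 'invindex[row]' maps back to the document,     */
 /* 'occu' marks rows in use, and 'lru' stores the access time used when a   */
 /* row has to be evicted. The 'buffsize' field set below counts the         */
 /* capacity in CFLOAT elements, while the buffsize argument is given in MB. */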
kernel_cache->buffsize=(long)(buffsize/sizeof(CFLOAT)*1024*1024); kernel_cache->max_elems=(long)(kernel_cache->buffsize/totdoc); if(kernel_cache->max_elems>totdoc) { kernel_cache->max_elems=totdoc; } if(verbosity>=2) { printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems); printf(" Kernel evals so far: %ld\n",kernel_cache_statistic); } kernel_cache->elems=0; /* initialize cache */ for(i=0;i<totdoc;i++) { kernel_cache->index[i]=-1; kernel_cache->lru[i]=0; } for(i=0;i<totdoc;i++) { kernel_cache->occu[i]=0; kernel_cache->invindex[i]=-1; } kernel_cache->activenum=totdoc;; for(i=0;i<totdoc;i++) { kernel_cache->active2totdoc[i]=i; kernel_cache->totdoc2active[i]=i; } kernel_cache->time=0; return(kernel_cache); } void kernel_cache_reset_lru(KERNEL_CACHE *kernel_cache) { long maxlru=0,k; for(k=0;k<kernel_cache->max_elems;k++) { if(maxlru < kernel_cache->lru[k]) maxlru=kernel_cache->lru[k]; } for(k=0;k<kernel_cache->max_elems;k++) { kernel_cache->lru[k]-=maxlru; } } void kernel_cache_cleanup(KERNEL_CACHE *kernel_cache) { free(kernel_cache->index); free(kernel_cache->occu); free(kernel_cache->lru); free(kernel_cache->invindex); free(kernel_cache->active2totdoc); free(kernel_cache->totdoc2active); free(kernel_cache->buffer); free(kernel_cache); } long kernel_cache_malloc(KERNEL_CACHE *kernel_cache) { long i; if(kernel_cache_space_available(kernel_cache)) { for(i=0;i<kernel_cache->max_elems;i++) { if(!kernel_cache->occu[i]) { kernel_cache->occu[i]=1; kernel_cache->elems++; return(i); } } } return(-1); } void kernel_cache_free(KERNEL_CACHE *kernel_cache, long int i) { kernel_cache->occu[i]=0; kernel_cache->elems--; } long kernel_cache_free_lru(KERNEL_CACHE *kernel_cache) /* remove least recently used cache element */ { register long k,least_elem=-1,least_time; least_time=kernel_cache->time+1; for(k=0;k<kernel_cache->max_elems;k++) { if(kernel_cache->invindex[k] != -1) { if(kernel_cache->lru[k]<least_time) { least_time=kernel_cache->lru[k]; least_elem=k; } } } if(least_elem != -1) { kernel_cache_free(kernel_cache,least_elem); kernel_cache->index[kernel_cache->invindex[least_elem]]=-1; kernel_cache->invindex[least_elem]=-1; return(1); } return(0); } CFLOAT *kernel_cache_clean_and_malloc(KERNEL_CACHE *kernel_cache, long int docnum) /* Get a free cache entry. In case cache is full, the lru element is removed. */ { long result; if((result = kernel_cache_malloc(kernel_cache)) == -1) { if(kernel_cache_free_lru(kernel_cache)) { result = kernel_cache_malloc(kernel_cache); } } kernel_cache->index[docnum]=result; if(result == -1) { return(0); } kernel_cache->invindex[result]=docnum; kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */ return((CFLOAT *)((long)kernel_cache->buffer +(kernel_cache->activenum*sizeof(CFLOAT)* kernel_cache->index[docnum]))); } long kernel_cache_touch(KERNEL_CACHE *kernel_cache, long int docnum) /* Update lru time to avoid removal from cache. */ { if(kernel_cache && kernel_cache->index[docnum] != -1) { kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */ return(1); } return(0); } long kernel_cache_check(KERNEL_CACHE *kernel_cache, long int docnum) /* Is that row cached? */ { return(kernel_cache->index[docnum] != -1); } long kernel_cache_space_available(KERNEL_CACHE *kernel_cache) /* Is there room for one more row? 
*/ { return(kernel_cache->elems < kernel_cache->max_elems); } /************************** Compute estimates ******************************/ void compute_xa_estimates(MODEL *model, long int *label, long int *unlabeled, long int totdoc, DOC **docs, double *lin, double *a, KERNEL_PARM *kernel_parm, LEARN_PARM *learn_parm, double *error, double *recall, double *precision) /* Computes xa-estimate of error rate, recall, and precision. See T. Joachims, Estimating the Generalization Performance of an SVM Efficiently, ICML, 2000. */ { long i,looerror,looposerror,loonegerror; long totex,totposex; double xi,r_delta,r_delta_sq,sim=0; long *sv2dnum=NULL,*sv=NULL,svnum; r_delta=estimate_r_delta(docs,totdoc,kernel_parm); r_delta_sq=r_delta*r_delta; looerror=0; looposerror=0; loonegerror=0; totex=0; totposex=0; svnum=0; if(learn_parm->xa_depth > 0) { sv = (long *)my_malloc(sizeof(long)*(totdoc+11)); for(i=0;i<totdoc;i++) sv[i]=0; for(i=1;i<model->sv_num;i++) if(a[model->supvec[i]->docnum] < (learn_parm->svm_cost[model->supvec[i]->docnum] -learn_parm->epsilon_a)) { sv[model->supvec[i]->docnum]=1; svnum++; } sv2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11)); clear_index(sv2dnum); compute_index(sv,totdoc,sv2dnum); } for(i=0;i<totdoc;i++) { if(unlabeled[i]) { /* ignore it */ } else { xi=1.0-((lin[i]-model->b)*(double)label[i]); if(xi<0) xi=0; if(label[i]>0) { totposex++; } if((learn_parm->rho*a[i]*r_delta_sq+xi) >= 1.0) { if(learn_parm->xa_depth > 0) { /* makes assumptions */ sim=distribute_alpha_t_greedily(sv2dnum,svnum,docs,a,i,label, kernel_parm,learn_parm, (double)((1.0-xi-a[i]*r_delta_sq)/(2.0*a[i]))); } if((learn_parm->xa_depth == 0) || ((a[i]*kernel(kernel_parm,docs[i],docs[i])+a[i]*2.0*sim+xi) >= 1.0)) { looerror++; if(label[i]>0) { looposerror++; } else { loonegerror++; } } } totex++; } } (*error)=((double)looerror/(double)totex)*100.0; (*recall)=(1.0-(double)looposerror/(double)totposex)*100.0; (*precision)=(((double)totposex-(double)looposerror) /((double)totposex-(double)looposerror+(double)loonegerror))*100.0; free(sv); free(sv2dnum); } double distribute_alpha_t_greedily(long int *sv2dnum, long int svnum, DOC **docs, double *a, long int docnum, long int *label, KERNEL_PARM *kernel_parm, LEARN_PARM *learn_parm, double thresh) /* Experimental Code improving plain XiAlpha Estimates by computing a better bound using a greedy optimization strategy.
*/ { long best_depth=0; long i,j,k,d,skip,allskip; double best,best_val[101],val,init_val_sq,init_val_lin; long best_ex[101]; CFLOAT *cache,*trow; cache=(CFLOAT *)my_malloc(sizeof(CFLOAT)*learn_parm->xa_depth*svnum); trow = (CFLOAT *)my_malloc(sizeof(CFLOAT)*svnum); for(k=0;k<svnum;k++) { trow[k]=kernel(kernel_parm,docs[docnum],docs[sv2dnum[k]]); } init_val_sq=0; init_val_lin=0; best=0; for(d=0;d<learn_parm->xa_depth;d++) { allskip=1; if(d>=1) { init_val_sq+=cache[best_ex[d-1]+svnum*(d-1)]; for(k=0;k<d-1;k++) { init_val_sq+=2.0*cache[best_ex[k]+svnum*(d-1)]; } init_val_lin+=trow[best_ex[d-1]]; } for(i=0;i<svnum;i++) { skip=0; if(sv2dnum[i] == docnum) skip=1; for(j=0;j<d;j++) { if(i == best_ex[j]) skip=1; } if(!skip) { val=init_val_sq; if(kernel_parm->kernel_type == LINEAR) val+=docs[sv2dnum[i]]->fvec->twonorm_sq; else val+=kernel(kernel_parm,docs[sv2dnum[i]],docs[sv2dnum[i]]); for(j=0;j<d;j++) { val+=2.0*cache[i+j*svnum]; } val*=(1.0/(2.0*(d+1.0)*(d+1.0))); val-=((init_val_lin+trow[i])/(d+1.0)); if(allskip || (val < best_val[d])) { best_val[d]=val; best_ex[d]=i; } allskip=0; if(val < thresh) { i=svnum; /* printf("EARLY"); */ } } } if(!allskip) { for(k=0;k<svnum;k++) { cache[d*svnum+k]=kernel(kernel_parm, docs[sv2dnum[best_ex[d]]], docs[sv2dnum[k]]); } } if((!allskip) && ((best_val[d] < best) || (d == 0))) { best=best_val[d]; best_depth=d; } if(allskip || (best < thresh)) { d=learn_parm->xa_depth; } } free(cache); free(trow); /* printf("Distribute[%ld](%ld)=%f, ",docnum,best_depth,best); */ return(best); } void estimate_transduction_quality(MODEL *model, long int *label, long int *unlabeled, long int totdoc, DOC **docs, double *lin) /* Loo-bound based on observation that loo-errors must have an equal distribution in both training and test examples, given that the test examples are classified correctly. Compare chapter "Constraints on the Transductive Hyperplane" in my Dissertation. 
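   An example counts as a potential leave-one-out error when
   fabs(alpha_i)*R_delta^2 + xi_i >= 1; the printouts below report how often
   this happens among the labeled and the unlabeled examples separately, which
   gives a rough check of the transductive solution.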
*/ { long i,j,l=0,ulab=0,lab=0,labpos=0,labneg=0,ulabpos=0,ulabneg=0,totulab=0; double totlab=0,totlabpos=0,totlabneg=0,labsum=0,ulabsum=0; double r_delta,r_delta_sq,xi,xisum=0,asum=0; r_delta=estimate_r_delta(docs,totdoc,&(model->kernel_parm)); r_delta_sq=r_delta*r_delta; for(j=0;j<totdoc;j++) { if(unlabeled[j]) { totulab++; } else { totlab++; if(label[j] > 0) totlabpos++; else totlabneg++; } } for(j=1;j<model->sv_num;j++) { i=model->supvec[j]->docnum; xi=1.0-((lin[i]-model->b)*(double)label[i]); if(xi<0) xi=0; xisum+=xi; asum+=fabs(model->alpha[j]); if(unlabeled[i]) { ulabsum+=(fabs(model->alpha[j])*r_delta_sq+xi); } else { labsum+=(fabs(model->alpha[j])*r_delta_sq+xi); } if((fabs(model->alpha[j])*r_delta_sq+xi) >= 1) { l++; if(unlabeled[model->supvec[j]->docnum]) { ulab++; if(model->alpha[j] > 0) ulabpos++; else ulabneg++; } else { lab++; if(model->alpha[j] > 0) labpos++; else labneg++; } } } printf("xacrit>=1: labeledpos=%.5f labeledneg=%.5f default=%.5f\n",(double)labpos/(double)totlab*100.0,(double)labneg/(double)totlab*100.0,(double)totlabpos/(double)(totlab)*100.0); printf("xacrit>=1: unlabelpos=%.5f unlabelneg=%.5f\n",(double)ulabpos/(double)totulab*100.0,(double)ulabneg/(double)totulab*100.0); printf("xacrit>=1: labeled=%.5f unlabled=%.5f all=%.5f\n",(double)lab/(double)totlab*100.0,(double)ulab/(double)totulab*100.0,(double)l/(double)(totdoc)*100.0); printf("xacritsum: labeled=%.5f unlabled=%.5f all=%.5f\n",(double)labsum/(double)totlab*100.0,(double)ulabsum/(double)totulab*100.0,(double)(labsum+ulabsum)/(double)(totdoc)*100.0); printf("r_delta_sq=%.5f xisum=%.5f asum=%.5f\n",r_delta_sq,xisum,asum); } double estimate_margin_vcdim(MODEL *model, double w, double R, KERNEL_PARM *kernel_parm) /* optional: length of model vector in feature space */ /* optional: radius of ball containing the data */ { double h; /* follows chapter 5.6.4 in [Vapnik/95] */ if(w<0) { w=model_length_s(model,kernel_parm); } if(R<0) { R=estimate_sphere(model,kernel_parm); } h = w*w * R*R +1; return(h); } double estimate_sphere(MODEL *model, KERNEL_PARM *kernel_parm) /* Approximates the radius of the ball containing */ /* the support vectors by bounding it with the */ { /* length of the longest support vector. This is */ register long j; /* pretty good for text categorization, since all */ double xlen,maxxlen=0; /* documents have feature vectors of length 1. It */ DOC *nulldoc; /* assumes that the center of the ball is at the */ WORD nullword; /* origin of the space. */ nullword.wnum=0; nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0)); for(j=1;j<model->sv_num;j++) { xlen=sqrt(kernel(kernel_parm,model->supvec[j],model->supvec[j]) -2*kernel(kernel_parm,model->supvec[j],nulldoc) +kernel(kernel_parm,nulldoc,nulldoc)); if(xlen>maxxlen) { maxxlen=xlen; } } free_example(nulldoc,1); return(maxxlen); } double estimate_r_delta(DOC **docs, long int totdoc, KERNEL_PARM *kernel_parm) { long i; double maxxlen,xlen; DOC *nulldoc; /* assumes that the center of the ball is at the */ WORD nullword; /* origin of the space. 
*/ nullword.wnum=0; nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0)); maxxlen=0; for(i=0;i<totdoc;i++) { xlen=sqrt(kernel(kernel_parm,docs[i],docs[i]) -2*kernel(kernel_parm,docs[i],nulldoc) +kernel(kernel_parm,nulldoc,nulldoc)); if(xlen>maxxlen) { maxxlen=xlen; } } free_example(nulldoc,1); return(maxxlen); } double estimate_r_delta_average(DOC **docs, long int totdoc, KERNEL_PARM *kernel_parm) { long i; double avgxlen; DOC *nulldoc; /* assumes that the center of the ball is at the */ WORD nullword; /* origin of the space. */ nullword.wnum=0; nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0)); avgxlen=0; for(i=0;i<totdoc;i++) { avgxlen+=sqrt(kernel(kernel_parm,docs[i],docs[i]) -2*kernel(kernel_parm,docs[i],nulldoc) +kernel(kernel_parm,nulldoc,nulldoc)); } free_example(nulldoc,1); return(avgxlen/totdoc); } double length_of_longest_document_vector(DOC **docs, long int totdoc, KERNEL_PARM *kernel_parm) { long i; double maxxlen,xlen; maxxlen=0; for(i=0;i<totdoc;i++) { xlen=sqrt(kernel(kernel_parm,docs[i],docs[i])); if(xlen>maxxlen) { maxxlen=xlen; } } return(maxxlen); } /****************************** IO-handling **********************************/ void write_prediction(char *predfile, MODEL *model, double *lin, double *a, long int *unlabeled, long int *label, long int totdoc, LEARN_PARM *learn_parm) { FILE *predfl; long i; double dist,a_max; if(verbosity>=1) { printf("Writing prediction file..."); fflush(stdout); } if ((predfl = fopen (predfile, "w")) == NULL) { perror (predfile); exit (1); } a_max=learn_parm->epsilon_a; for(i=0;i<totdoc;i++) { if((unlabeled[i]) && (a[i]>a_max)) { a_max=a[i]; } } for(i=0;i<totdoc;i++) { if(unlabeled[i]) { if((a[i]>(learn_parm->epsilon_a))) { dist=(double)label[i]*(1.0-learn_parm->epsilon_crit-a[i]/(a_max*2.0)); } else { dist=(lin[i]-model->b); } if(dist>0) { fprintf(predfl,"%.8g:+1 %.8g:-1\n",dist,-dist); } else { fprintf(predfl,"%.8g:-1 %.8g:+1\n",-dist,dist); } } } fclose(predfl); if(verbosity>=1) { printf("done\n"); } } void write_alphas(char *alphafile, double *a, long int *label, long int totdoc) { FILE *alphafl; long i; if(verbosity>=1) { printf("Writing alpha file..."); fflush(stdout); } if ((alphafl = fopen (alphafile, "w")) == NULL) { perror (alphafile); exit (1); } for(i=0;i<totdoc;i++) { fprintf(alphafl,"%.18g\n",a[i]*(double)label[i]); } fclose(alphafl); if(verbosity>=1) { printf("done\n"); } }
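/*--------------------------------------------------------------------------*/
/* Illustrative sketch (added for exposition; not part of the original      */
/* SVM-light distribution and not called anywhere in the library): shows    */
/* how the small index helpers and select_top_n() defined above are meant   */
/* to be used. The -1 terminated arrays follow the same convention as       */
/* active2dnum/working2dnum elsewhere in this file.                         */
/*--------------------------------------------------------------------------*/
void example_index_and_selection_usage(void)
{
  long idx[10];                   /* -1 terminated list of document numbers */
  long binfeature[6]={0,1,0,1,1,0};
  long inverted[10];              /* compute_index() appends 4 trailing -1s */
  double selcrit[5]={0.3,2.5,1.1,0.7,4.2};
  long select[5];
  long i,n;

  clear_index(idx);               /* idx = { -1 } */
  add_to_index(idx,3);            /* idx = { 3, -1 } */
  add_to_index(idx,7);            /* idx = { 3, 7, -1 } */
  for(i=0;idx[i]>=0;i++)
    printf("idx[%ld]=%ld\n",i,idx[i]);

  n=compute_index(binfeature,6,inverted);  /* positions of non-zero entries */
  printf("%ld active positions, first=%ld\n",n,inverted[0]);

  select_top_n(selcrit,5,select,3);       /* indices of the 3 largest values */
  for(i=0;i<3;i++)
    printf("top[%ld]: selcrit[%ld]=%f\n",i,select[i],selcrit[select[i]]);
}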