annotate toolboxes/SVM-light/src/svm_common.c @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
wolffd@0 1 /************************************************************************/
wolffd@0 2 /* */
wolffd@0 3 /* svm_common.c */
wolffd@0 4 /* */
wolffd@0 5 /* Definitions and functions used in both svm_learn and svm_classify. */
wolffd@0 6 /* */
wolffd@0 7 /* Author: Thorsten Joachims */
wolffd@0 8 /* Date: 02.07.04 */
wolffd@0 9 /* */
wolffd@0 10 /* Copyright (c) 2004 Thorsten Joachims - All rights reserved */
wolffd@0 11 /* */
wolffd@0 12 /* This software is available for non-commercial use only. It must */
wolffd@0 13 /* not be modified and distributed without prior permission of the */
wolffd@0 14 /* author. The author is not responsible for implications from the */
wolffd@0 15 /* use of this software. */
wolffd@0 16 /* */
wolffd@0 17 /************************************************************************/
wolffd@0 18
wolffd@0 19 # include "ctype.h"
wolffd@0 20 # include "svm_common.h"
wolffd@0 21 # include "kernel.h" /* this contains a user supplied kernel */
wolffd@0 22
/* Verbosity level (0-4); progress messages in this file are printed
   when it is >= 1. */
long verbosity;
/* Counter that is incremented on every call to single_kernel(). */
long kernel_cache_statistic;
wolffd@0 25
wolffd@0 26 double classify_example(MODEL *model, DOC *ex)
wolffd@0 27 /* classifies one example */
wolffd@0 28 {
wolffd@0 29 register long i;
wolffd@0 30 register double dist;
wolffd@0 31
wolffd@0 32 if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights))
wolffd@0 33 return(classify_example_linear(model,ex));
wolffd@0 34
wolffd@0 35 dist=0;
wolffd@0 36 for(i=1;i<model->sv_num;i++) {
wolffd@0 37 dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
wolffd@0 38 }
wolffd@0 39 return(dist-model->b);
wolffd@0 40 }
wolffd@0 41
wolffd@0 42 double classify_example_linear(MODEL *model, DOC *ex)
wolffd@0 43 /* classifies example for linear kernel */
wolffd@0 44
wolffd@0 45 /* important: the model must have the linear weight vector computed */
wolffd@0 46 /* use: add_weight_vector_to_linear_model(&model); */
wolffd@0 47
wolffd@0 48
wolffd@0 49 /* important: the feature numbers in the example to classify must */
wolffd@0 50 /* not be larger than the weight vector! */
wolffd@0 51 {
wolffd@0 52 double sum=0;
wolffd@0 53 SVECTOR *f;
wolffd@0 54
wolffd@0 55 for(f=ex->fvec;f;f=f->next)
wolffd@0 56 sum+=f->factor*sprod_ns(model->lin_weights,f);
wolffd@0 57 return(sum-model->b);
wolffd@0 58 }
wolffd@0 59
wolffd@0 60
wolffd@0 61 CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
wolffd@0 62 /* calculate the kernel function */
wolffd@0 63 {
wolffd@0 64 double sum=0;
wolffd@0 65 SVECTOR *fa,*fb;
wolffd@0 66
wolffd@0 67 /* in case the constraints are sums of feature vector as represented
wolffd@0 68 as a list of SVECTOR's with their coefficient factor in the sum,
wolffd@0 69 take the kernel between all pairs */
wolffd@0 70 for(fa=a->fvec;fa;fa=fa->next) {
wolffd@0 71 for(fb=b->fvec;fb;fb=fb->next) {
wolffd@0 72 if(fa->kernel_id == fb->kernel_id)
wolffd@0 73 sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb);
wolffd@0 74 }
wolffd@0 75 }
wolffd@0 76 return(sum);
wolffd@0 77 }
wolffd@0 78
wolffd@0 79 CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b)
wolffd@0 80 /* calculate the kernel function between two vectors */
wolffd@0 81 {
wolffd@0 82 kernel_cache_statistic++;
wolffd@0 83 switch(kernel_parm->kernel_type) {
wolffd@0 84 case 0: /* linear */
wolffd@0 85 return((CFLOAT)sprod_ss(a,b));
wolffd@0 86 case 1: /* polynomial */
wolffd@0 87 return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
wolffd@0 88 case 2: /* radial basis function */
wolffd@0 89 return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
wolffd@0 90 case 3: /* sigmoid neural net */
wolffd@0 91 return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const));
wolffd@0 92 case 4: /* custom-kernel supplied in file kernel.h*/
wolffd@0 93 return((CFLOAT)custom_kernel(kernel_parm,a,b));
wolffd@0 94 default: printf("Error: Unknown kernel function\n"); exit(1);
wolffd@0 95 }
wolffd@0 96 }
wolffd@0 97
wolffd@0 98
wolffd@0 99 SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
wolffd@0 100 {
wolffd@0 101 SVECTOR *vec;
wolffd@0 102 long fnum,i;
wolffd@0 103
wolffd@0 104 fnum=0;
wolffd@0 105 while(words[fnum].wnum) {
wolffd@0 106 fnum++;
wolffd@0 107 }
wolffd@0 108 fnum++;
wolffd@0 109 vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
wolffd@0 110 vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum));
wolffd@0 111 for(i=0;i<fnum;i++) {
wolffd@0 112 vec->words[i]=words[i];
wolffd@0 113 }
wolffd@0 114 vec->twonorm_sq=sprod_ss(vec,vec);
wolffd@0 115
wolffd@0 116 fnum=0;
wolffd@0 117 while(userdefined[fnum]) {
wolffd@0 118 fnum++;
wolffd@0 119 }
wolffd@0 120 fnum++;
wolffd@0 121 vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum));
wolffd@0 122 for(i=0;i<fnum;i++) {
wolffd@0 123 vec->userdefined[i]=userdefined[i];
wolffd@0 124 }
wolffd@0 125 vec->kernel_id=0;
wolffd@0 126 vec->next=NULL;
wolffd@0 127 vec->factor=factor;
wolffd@0 128 return(vec);
wolffd@0 129 }
wolffd@0 130
wolffd@0 131 SVECTOR *copy_svector(SVECTOR *vec)
wolffd@0 132 {
wolffd@0 133 SVECTOR *newvec=NULL;
wolffd@0 134 if(vec) {
wolffd@0 135 newvec=create_svector(vec->words,vec->userdefined,vec->factor);
wolffd@0 136 newvec->next=copy_svector(vec->next);
wolffd@0 137 }
wolffd@0 138 return(newvec);
wolffd@0 139 }
wolffd@0 140
wolffd@0 141 void free_svector(SVECTOR *vec)
wolffd@0 142 {
wolffd@0 143 if(vec) {
wolffd@0 144 free(vec->words);
wolffd@0 145 if(vec->userdefined)
wolffd@0 146 free(vec->userdefined);
wolffd@0 147 free_svector(vec->next);
wolffd@0 148 free(vec);
wolffd@0 149 }
wolffd@0 150 }
wolffd@0 151
wolffd@0 152 double sprod_ss(SVECTOR *a, SVECTOR *b)
wolffd@0 153 /* compute the inner product of two sparse vectors */
wolffd@0 154 {
wolffd@0 155 register CFLOAT sum=0;
wolffd@0 156 register WORD *ai,*bj;
wolffd@0 157 ai=a->words;
wolffd@0 158 bj=b->words;
wolffd@0 159 while (ai->wnum && bj->wnum) {
wolffd@0 160 if(ai->wnum > bj->wnum) {
wolffd@0 161 bj++;
wolffd@0 162 }
wolffd@0 163 else if (ai->wnum < bj->wnum) {
wolffd@0 164 ai++;
wolffd@0 165 }
wolffd@0 166 else {
wolffd@0 167 sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight);
wolffd@0 168 ai++;
wolffd@0 169 bj++;
wolffd@0 170 }
wolffd@0 171 }
wolffd@0 172 return((double)sum);
wolffd@0 173 }
wolffd@0 174
wolffd@0 175 SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b)
wolffd@0 176 /* compute the difference a-b of two sparse vectors */
wolffd@0 177 /* Note: SVECTOR lists are not followed, but only the first
wolffd@0 178 SVECTOR is used */
wolffd@0 179 {
wolffd@0 180 SVECTOR *vec;
wolffd@0 181 register WORD *sum,*sumi;
wolffd@0 182 register WORD *ai,*bj;
wolffd@0 183 long veclength;
wolffd@0 184
wolffd@0 185 ai=a->words;
wolffd@0 186 bj=b->words;
wolffd@0 187 veclength=0;
wolffd@0 188 while (ai->wnum && bj->wnum) {
wolffd@0 189 if(ai->wnum > bj->wnum) {
wolffd@0 190 veclength++;
wolffd@0 191 bj++;
wolffd@0 192 }
wolffd@0 193 else if (ai->wnum < bj->wnum) {
wolffd@0 194 veclength++;
wolffd@0 195 ai++;
wolffd@0 196 }
wolffd@0 197 else {
wolffd@0 198 veclength++;
wolffd@0 199 ai++;
wolffd@0 200 bj++;
wolffd@0 201 }
wolffd@0 202 }
wolffd@0 203 while (bj->wnum) {
wolffd@0 204 veclength++;
wolffd@0 205 bj++;
wolffd@0 206 }
wolffd@0 207 while (ai->wnum) {
wolffd@0 208 veclength++;
wolffd@0 209 ai++;
wolffd@0 210 }
wolffd@0 211 veclength++;
wolffd@0 212
wolffd@0 213 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
wolffd@0 214 sumi=sum;
wolffd@0 215 ai=a->words;
wolffd@0 216 bj=b->words;
wolffd@0 217 while (ai->wnum && bj->wnum) {
wolffd@0 218 if(ai->wnum > bj->wnum) {
wolffd@0 219 (*sumi)=(*bj);
wolffd@0 220 sumi->weight*=(-1);
wolffd@0 221 sumi++;
wolffd@0 222 bj++;
wolffd@0 223 }
wolffd@0 224 else if (ai->wnum < bj->wnum) {
wolffd@0 225 (*sumi)=(*ai);
wolffd@0 226 sumi++;
wolffd@0 227 ai++;
wolffd@0 228 }
wolffd@0 229 else {
wolffd@0 230 (*sumi)=(*ai);
wolffd@0 231 sumi->weight-=bj->weight;
wolffd@0 232 if(sumi->weight != 0)
wolffd@0 233 sumi++;
wolffd@0 234 ai++;
wolffd@0 235 bj++;
wolffd@0 236 }
wolffd@0 237 }
wolffd@0 238 while (bj->wnum) {
wolffd@0 239 (*sumi)=(*bj);
wolffd@0 240 sumi->weight*=(-1);
wolffd@0 241 sumi++;
wolffd@0 242 bj++;
wolffd@0 243 }
wolffd@0 244 while (ai->wnum) {
wolffd@0 245 (*sumi)=(*ai);
wolffd@0 246 sumi++;
wolffd@0 247 ai++;
wolffd@0 248 }
wolffd@0 249 sumi->wnum=0;
wolffd@0 250
wolffd@0 251 vec=create_svector(sum,"",1.0);
wolffd@0 252 free(sum);
wolffd@0 253
wolffd@0 254 return(vec);
wolffd@0 255 }
wolffd@0 256
wolffd@0 257 SVECTOR* add_ss(SVECTOR *a, SVECTOR *b)
wolffd@0 258 /* compute the sum a+b of two sparse vectors */
wolffd@0 259 /* Note: SVECTOR lists are not followed, but only the first
wolffd@0 260 SVECTOR is used */
wolffd@0 261 {
wolffd@0 262 SVECTOR *vec;
wolffd@0 263 register WORD *sum,*sumi;
wolffd@0 264 register WORD *ai,*bj;
wolffd@0 265 long veclength;
wolffd@0 266
wolffd@0 267 ai=a->words;
wolffd@0 268 bj=b->words;
wolffd@0 269 veclength=0;
wolffd@0 270 while (ai->wnum && bj->wnum) {
wolffd@0 271 if(ai->wnum > bj->wnum) {
wolffd@0 272 veclength++;
wolffd@0 273 bj++;
wolffd@0 274 }
wolffd@0 275 else if (ai->wnum < bj->wnum) {
wolffd@0 276 veclength++;
wolffd@0 277 ai++;
wolffd@0 278 }
wolffd@0 279 else {
wolffd@0 280 veclength++;
wolffd@0 281 ai++;
wolffd@0 282 bj++;
wolffd@0 283 }
wolffd@0 284 }
wolffd@0 285 while (bj->wnum) {
wolffd@0 286 veclength++;
wolffd@0 287 bj++;
wolffd@0 288 }
wolffd@0 289 while (ai->wnum) {
wolffd@0 290 veclength++;
wolffd@0 291 ai++;
wolffd@0 292 }
wolffd@0 293 veclength++;
wolffd@0 294
wolffd@0 295 /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/
wolffd@0 296
wolffd@0 297 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
wolffd@0 298 sumi=sum;
wolffd@0 299 ai=a->words;
wolffd@0 300 bj=b->words;
wolffd@0 301 while (ai->wnum && bj->wnum) {
wolffd@0 302 if(ai->wnum > bj->wnum) {
wolffd@0 303 (*sumi)=(*bj);
wolffd@0 304 sumi++;
wolffd@0 305 bj++;
wolffd@0 306 }
wolffd@0 307 else if (ai->wnum < bj->wnum) {
wolffd@0 308 (*sumi)=(*ai);
wolffd@0 309 sumi++;
wolffd@0 310 ai++;
wolffd@0 311 }
wolffd@0 312 else {
wolffd@0 313 (*sumi)=(*ai);
wolffd@0 314 sumi->weight+=bj->weight;
wolffd@0 315 if(sumi->weight != 0)
wolffd@0 316 sumi++;
wolffd@0 317 ai++;
wolffd@0 318 bj++;
wolffd@0 319 }
wolffd@0 320 }
wolffd@0 321 while (bj->wnum) {
wolffd@0 322 (*sumi)=(*bj);
wolffd@0 323 sumi++;
wolffd@0 324 bj++;
wolffd@0 325 }
wolffd@0 326 while (ai->wnum) {
wolffd@0 327 (*sumi)=(*ai);
wolffd@0 328 sumi++;
wolffd@0 329 ai++;
wolffd@0 330 }
wolffd@0 331 sumi->wnum=0;
wolffd@0 332
wolffd@0 333 vec=create_svector(sum,"",1.0);
wolffd@0 334 free(sum);
wolffd@0 335
wolffd@0 336 return(vec);
wolffd@0 337 }
wolffd@0 338
wolffd@0 339 SVECTOR* add_list_ss(SVECTOR *a)
wolffd@0 340 /* computes the linear combination of the SVECTOR list weighted
wolffd@0 341 by the factor of each SVECTOR */
wolffd@0 342 {
wolffd@0 343 SVECTOR *scaled,*oldsum,*sum,*f;
wolffd@0 344 WORD empty[2];
wolffd@0 345
wolffd@0 346 if(a){
wolffd@0 347 sum=smult_s(a,a->factor);
wolffd@0 348 for(f=a->next;f;f=f->next) {
wolffd@0 349 scaled=smult_s(f,f->factor);
wolffd@0 350 oldsum=sum;
wolffd@0 351 sum=add_ss(sum,scaled);
wolffd@0 352 free_svector(oldsum);
wolffd@0 353 free_svector(scaled);
wolffd@0 354 }
wolffd@0 355 sum->factor=1.0;
wolffd@0 356 }
wolffd@0 357 else {
wolffd@0 358 empty[0].wnum=0;
wolffd@0 359 sum=create_svector(empty,"",1.0);
wolffd@0 360 }
wolffd@0 361 return(sum);
wolffd@0 362 }
wolffd@0 363
wolffd@0 364 void append_svector_list(SVECTOR *a, SVECTOR *b)
wolffd@0 365 /* appends SVECTOR b to the end of SVECTOR a. */
wolffd@0 366 {
wolffd@0 367 SVECTOR *f;
wolffd@0 368
wolffd@0 369 for(f=a;f->next;f=f->next); /* find end of first vector list */
wolffd@0 370 f->next=b; /* append the two vector lists */
wolffd@0 371 }
wolffd@0 372
wolffd@0 373 SVECTOR* smult_s(SVECTOR *a, double factor)
wolffd@0 374 /* scale sparse vector a by factor */
wolffd@0 375 {
wolffd@0 376 SVECTOR *vec;
wolffd@0 377 register WORD *sum,*sumi;
wolffd@0 378 register WORD *ai;
wolffd@0 379 long veclength;
wolffd@0 380
wolffd@0 381 ai=a->words;
wolffd@0 382 veclength=0;
wolffd@0 383 while (ai->wnum) {
wolffd@0 384 veclength++;
wolffd@0 385 ai++;
wolffd@0 386 }
wolffd@0 387 veclength++;
wolffd@0 388
wolffd@0 389 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
wolffd@0 390 sumi=sum;
wolffd@0 391 ai=a->words;
wolffd@0 392 while (ai->wnum) {
wolffd@0 393 (*sumi)=(*ai);
wolffd@0 394 sumi->weight*=factor;
wolffd@0 395 if(sumi->weight != 0)
wolffd@0 396 sumi++;
wolffd@0 397 ai++;
wolffd@0 398 }
wolffd@0 399 sumi->wnum=0;
wolffd@0 400
wolffd@0 401 vec=create_svector(sum,a->userdefined,a->factor);
wolffd@0 402 free(sum);
wolffd@0 403
wolffd@0 404 return(vec);
wolffd@0 405 }
wolffd@0 406
wolffd@0 407 int featvec_eq(SVECTOR *a, SVECTOR *b)
wolffd@0 408 /* tests two sparse vectors for equality */
wolffd@0 409 {
wolffd@0 410 register WORD *ai,*bj;
wolffd@0 411 ai=a->words;
wolffd@0 412 bj=b->words;
wolffd@0 413 while (ai->wnum && bj->wnum) {
wolffd@0 414 if(ai->wnum > bj->wnum) {
wolffd@0 415 if((CFLOAT)(bj->weight) != 0)
wolffd@0 416 return(0);
wolffd@0 417 bj++;
wolffd@0 418 }
wolffd@0 419 else if (ai->wnum < bj->wnum) {
wolffd@0 420 if((CFLOAT)(ai->weight) != 0)
wolffd@0 421 return(0);
wolffd@0 422 ai++;
wolffd@0 423 }
wolffd@0 424 else {
wolffd@0 425 if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight))
wolffd@0 426 return(0);
wolffd@0 427 ai++;
wolffd@0 428 bj++;
wolffd@0 429 }
wolffd@0 430 }
wolffd@0 431 return(1);
wolffd@0 432 }
wolffd@0 433
wolffd@0 434 double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
wolffd@0 435 /* compute length of weight vector */
wolffd@0 436 {
wolffd@0 437 register long i,j;
wolffd@0 438 register double sum=0,alphai;
wolffd@0 439 register DOC *supveci;
wolffd@0 440
wolffd@0 441 for(i=1;i<model->sv_num;i++) {
wolffd@0 442 alphai=model->alpha[i];
wolffd@0 443 supveci=model->supvec[i];
wolffd@0 444 for(j=1;j<model->sv_num;j++) {
wolffd@0 445 sum+=alphai*model->alpha[j]
wolffd@0 446 *kernel(kernel_parm,supveci,model->supvec[j]);
wolffd@0 447 }
wolffd@0 448 }
wolffd@0 449 return(sqrt(sum));
wolffd@0 450 }
wolffd@0 451
void clear_vector_n(double *vec, long int n)
     /* Sets vec[0] .. vec[n] to 0. Note that the bound is inclusive,
        i.e. n+1 entries are written. */
{
  long idx = 0;

  while(idx <= n) {
    vec[idx] = 0;
    idx++;
  }
}
wolffd@0 457
wolffd@0 458 void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
wolffd@0 459 {
wolffd@0 460 register WORD *ai;
wolffd@0 461 ai=vec_s->words;
wolffd@0 462 while (ai->wnum) {
wolffd@0 463 vec_n[ai->wnum]+=(faktor*ai->weight);
wolffd@0 464 ai++;
wolffd@0 465 }
wolffd@0 466 }
wolffd@0 467
wolffd@0 468 double sprod_ns(double *vec_n, SVECTOR *vec_s)
wolffd@0 469 {
wolffd@0 470 register double sum=0;
wolffd@0 471 register WORD *ai;
wolffd@0 472 ai=vec_s->words;
wolffd@0 473 while (ai->wnum) {
wolffd@0 474 sum+=(vec_n[ai->wnum]*ai->weight);
wolffd@0 475 ai++;
wolffd@0 476 }
wolffd@0 477 return(sum);
wolffd@0 478 }
wolffd@0 479
wolffd@0 480 void add_weight_vector_to_linear_model(MODEL *model)
wolffd@0 481 /* compute weight vector in linear case and add to model */
wolffd@0 482 {
wolffd@0 483 long i;
wolffd@0 484 SVECTOR *f;
wolffd@0 485
wolffd@0 486 model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
wolffd@0 487 clear_vector_n(model->lin_weights,model->totwords);
wolffd@0 488 for(i=1;i<model->sv_num;i++) {
wolffd@0 489 for(f=(model->supvec[i])->fvec;f;f=f->next)
wolffd@0 490 add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]);
wolffd@0 491 }
wolffd@0 492 }
wolffd@0 493
wolffd@0 494
wolffd@0 495 DOC *create_example(long docnum, long queryid, long slackid,
wolffd@0 496 double costfactor, SVECTOR *fvec)
wolffd@0 497 {
wolffd@0 498 DOC *example;
wolffd@0 499 example = (DOC *)my_malloc(sizeof(DOC));
wolffd@0 500 example->docnum=docnum;
wolffd@0 501 example->queryid=queryid;
wolffd@0 502 example->slackid=slackid;
wolffd@0 503 example->costfactor=costfactor;
wolffd@0 504 example->fvec=fvec;
wolffd@0 505 return(example);
wolffd@0 506 }
wolffd@0 507
wolffd@0 508 void free_example(DOC *example, long deep)
wolffd@0 509 {
wolffd@0 510 if(example) {
wolffd@0 511 if(deep) {
wolffd@0 512 if(example->fvec)
wolffd@0 513 free_svector(example->fvec);
wolffd@0 514 }
wolffd@0 515 free(example);
wolffd@0 516 }
wolffd@0 517 }
wolffd@0 518
wolffd@0 519 void write_model(char *modelfile, MODEL *model)
wolffd@0 520 {
wolffd@0 521 FILE *modelfl;
wolffd@0 522 long j,i,sv_num;
wolffd@0 523 SVECTOR *v;
wolffd@0 524
wolffd@0 525 if(verbosity>=1) {
wolffd@0 526 printf("Writing model file..."); fflush(stdout);
wolffd@0 527 }
wolffd@0 528 if ((modelfl = fopen (modelfile, "w")) == NULL)
wolffd@0 529 { perror (modelfile); exit (1); }
wolffd@0 530 fprintf(modelfl,"SVM-light Version %s\n",VERSION);
wolffd@0 531 fprintf(modelfl,"%ld # kernel type\n",
wolffd@0 532 model->kernel_parm.kernel_type);
wolffd@0 533 fprintf(modelfl,"%ld # kernel parameter -d \n",
wolffd@0 534 model->kernel_parm.poly_degree);
wolffd@0 535 fprintf(modelfl,"%.8g # kernel parameter -g \n",
wolffd@0 536 model->kernel_parm.rbf_gamma);
wolffd@0 537 fprintf(modelfl,"%.8g # kernel parameter -s \n",
wolffd@0 538 model->kernel_parm.coef_lin);
wolffd@0 539 fprintf(modelfl,"%.8g # kernel parameter -r \n",
wolffd@0 540 model->kernel_parm.coef_const);
wolffd@0 541 fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom);
wolffd@0 542 fprintf(modelfl,"%ld # highest feature index \n",model->totwords);
wolffd@0 543 fprintf(modelfl,"%ld # number of training documents \n",model->totdoc);
wolffd@0 544
wolffd@0 545 sv_num=1;
wolffd@0 546 for(i=1;i<model->sv_num;i++) {
wolffd@0 547 for(v=model->supvec[i]->fvec;v;v=v->next)
wolffd@0 548 sv_num++;
wolffd@0 549 }
wolffd@0 550 fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num);
wolffd@0 551 fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);
wolffd@0 552
wolffd@0 553 for(i=1;i<model->sv_num;i++) {
wolffd@0 554 for(v=model->supvec[i]->fvec;v;v=v->next) {
wolffd@0 555 fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor);
wolffd@0 556 for (j=0; (v->words[j]).wnum; j++) {
wolffd@0 557 fprintf(modelfl,"%ld:%.8g ",
wolffd@0 558 (long)(v->words[j]).wnum,
wolffd@0 559 (double)(v->words[j]).weight);
wolffd@0 560 }
wolffd@0 561 fprintf(modelfl,"#%s\n",v->userdefined);
wolffd@0 562 /* NOTE: this could be made more efficient by summing the
wolffd@0 563 alpha's of identical vectors before writing them to the
wolffd@0 564 file. */
wolffd@0 565 }
wolffd@0 566 }
wolffd@0 567 fclose(modelfl);
wolffd@0 568 if(verbosity>=1) {
wolffd@0 569 printf("done\n");
wolffd@0 570 }
wolffd@0 571 }
wolffd@0 572
wolffd@0 573
wolffd@0 574 MODEL *read_model(char *modelfile)
wolffd@0 575 {
wolffd@0 576 FILE *modelfl;
wolffd@0 577 long i,queryid,slackid;
wolffd@0 578 double costfactor;
wolffd@0 579 long max_sv,max_words,ll,wpos;
wolffd@0 580 char *line,*comment;
wolffd@0 581 WORD *words;
wolffd@0 582 char version_buffer[100];
wolffd@0 583 MODEL *model;
wolffd@0 584
wolffd@0 585 if(verbosity>=1) {
wolffd@0 586 printf("Reading model..."); fflush(stdout);
wolffd@0 587 }
wolffd@0 588
wolffd@0 589 nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
wolffd@0 590 max_words+=2;
wolffd@0 591 ll+=2;
wolffd@0 592
wolffd@0 593 words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
wolffd@0 594 line = (char *)my_malloc(sizeof(char)*ll);
wolffd@0 595 model = (MODEL *)my_malloc(sizeof(MODEL));
wolffd@0 596
wolffd@0 597 if ((modelfl = fopen (modelfile, "r")) == NULL)
wolffd@0 598 { perror (modelfile); exit (1); }
wolffd@0 599
wolffd@0 600 fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
wolffd@0 601 if(strcmp(version_buffer,VERSION)) {
wolffd@0 602 perror ("Version of model-file does not match version of svm_classify!");
wolffd@0 603 exit (1);
wolffd@0 604 }
wolffd@0 605 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
wolffd@0 606 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
wolffd@0 607 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
wolffd@0 608 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
wolffd@0 609 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
wolffd@0 610 fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
wolffd@0 611
wolffd@0 612 fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
wolffd@0 613 fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
wolffd@0 614 fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
wolffd@0 615 fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
wolffd@0 616
wolffd@0 617 model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
wolffd@0 618 model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
wolffd@0 619 model->index=NULL;
wolffd@0 620 model->lin_weights=NULL;
wolffd@0 621
wolffd@0 622 for(i=1;i<model->sv_num;i++) {
wolffd@0 623 fgets(line,(int)ll,modelfl);
wolffd@0 624 if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
wolffd@0 625 &costfactor,&wpos,max_words,&comment)) {
wolffd@0 626 printf("\nParsing error while reading model file in SV %ld!\n%s",
wolffd@0 627 i,line);
wolffd@0 628 exit(1);
wolffd@0 629 }
wolffd@0 630 model->supvec[i] = create_example(-1,
wolffd@0 631 0,0,
wolffd@0 632 0.0,
wolffd@0 633 create_svector(words,comment,1.0));
wolffd@0 634 }
wolffd@0 635 fclose(modelfl);
wolffd@0 636 free(line);
wolffd@0 637 free(words);
wolffd@0 638 if(verbosity>=1) {
wolffd@0 639 fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
wolffd@0 640 }
wolffd@0 641 return(model);
wolffd@0 642 }
wolffd@0 643
wolffd@0 644 MODEL *copy_model(MODEL *model)
wolffd@0 645 {
wolffd@0 646 MODEL *newmodel;
wolffd@0 647 long i;
wolffd@0 648
wolffd@0 649 newmodel=(MODEL *)my_malloc(sizeof(MODEL));
wolffd@0 650 (*newmodel)=(*model);
wolffd@0 651 newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
wolffd@0 652 newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
wolffd@0 653 newmodel->index = NULL; /* index is not copied */
wolffd@0 654 newmodel->supvec[0] = NULL;
wolffd@0 655 newmodel->alpha[0] = 0;
wolffd@0 656 for(i=1;i<model->sv_num;i++) {
wolffd@0 657 newmodel->alpha[i]=model->alpha[i];
wolffd@0 658 newmodel->supvec[i]=create_example(model->supvec[i]->docnum,
wolffd@0 659 model->supvec[i]->queryid,0,
wolffd@0 660 model->supvec[i]->costfactor,
wolffd@0 661 copy_svector(model->supvec[i]->fvec));
wolffd@0 662 }
wolffd@0 663 if(model->lin_weights) {
wolffd@0 664 newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
wolffd@0 665 for(i=0;i<model->totwords+1;i++)
wolffd@0 666 newmodel->lin_weights[i]=model->lin_weights[i];
wolffd@0 667 }
wolffd@0 668 return(newmodel);
wolffd@0 669 }
wolffd@0 670
wolffd@0 671 void free_model(MODEL *model, int deep)
wolffd@0 672 {
wolffd@0 673 long i;
wolffd@0 674
wolffd@0 675 if(model->supvec) {
wolffd@0 676 if(deep) {
wolffd@0 677 for(i=1;i<model->sv_num;i++) {
wolffd@0 678 free_example(model->supvec[i],1);
wolffd@0 679 }
wolffd@0 680 }
wolffd@0 681 free(model->supvec);
wolffd@0 682 }
wolffd@0 683 if(model->alpha) free(model->alpha);
wolffd@0 684 if(model->index) free(model->index);
wolffd@0 685 if(model->lin_weights) free(model->lin_weights);
wolffd@0 686 free(model);
wolffd@0 687 }
wolffd@0 688
wolffd@0 689
wolffd@0 690 void read_documents(char *docfile, DOC ***docs, double **label,
wolffd@0 691 long int *totwords, long int *totdoc)
wolffd@0 692 {
wolffd@0 693 char *line,*comment;
wolffd@0 694 WORD *words;
wolffd@0 695 long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
wolffd@0 696 long max_words_doc, ll;
wolffd@0 697 double doc_label,costfactor;
wolffd@0 698 FILE *docfl;
wolffd@0 699
wolffd@0 700 if(verbosity>=1) {
wolffd@0 701 printf("Scanning examples..."); fflush(stdout);
wolffd@0 702 }
wolffd@0 703 nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
wolffd@0 704 max_words_doc+=2;
wolffd@0 705 ll+=2;
wolffd@0 706 max_docs+=2;
wolffd@0 707 if(verbosity>=1) {
wolffd@0 708 printf("done\n"); fflush(stdout);
wolffd@0 709 }
wolffd@0 710
wolffd@0 711 (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */
wolffd@0 712 (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
wolffd@0 713 line = (char *)my_malloc(sizeof(char)*ll);
wolffd@0 714
wolffd@0 715 if ((docfl = fopen (docfile, "r")) == NULL)
wolffd@0 716 { perror (docfile); exit (1); }
wolffd@0 717
wolffd@0 718 words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
wolffd@0 719 if(verbosity>=1) {
wolffd@0 720 printf("Reading examples into memory..."); fflush(stdout);
wolffd@0 721 }
wolffd@0 722 dnum=0;
wolffd@0 723 (*totwords)=0;
wolffd@0 724 while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
wolffd@0 725 if(line[0] == '#') continue; /* line contains comments */
wolffd@0 726 if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
wolffd@0 727 &wpos,max_words_doc,&comment)) {
wolffd@0 728 printf("\nParsing error in line %ld!\n%s",dnum,line);
wolffd@0 729 exit(1);
wolffd@0 730 }
wolffd@0 731 (*label)[dnum]=doc_label;
wolffd@0 732 /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
wolffd@0 733 if(doc_label > 0) dpos++;
wolffd@0 734 if (doc_label < 0) dneg++;
wolffd@0 735 if (doc_label == 0) dunlab++;
wolffd@0 736 if((wpos>1) && ((words[wpos-2]).wnum>(*totwords)))
wolffd@0 737 (*totwords)=(words[wpos-2]).wnum;
wolffd@0 738 if((*totwords) > MAXFEATNUM) {
wolffd@0 739 printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
wolffd@0 740 printf("LINE: %s\n",line);
wolffd@0 741 exit(1);
wolffd@0 742 }
wolffd@0 743 (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
wolffd@0 744 create_svector(words,comment,1.0));
wolffd@0 745 /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */
wolffd@0 746 dnum++;
wolffd@0 747 if(verbosity>=1) {
wolffd@0 748 if((dnum % 100) == 0) {
wolffd@0 749 printf("%ld..",dnum); fflush(stdout);
wolffd@0 750 }
wolffd@0 751 }
wolffd@0 752 }
wolffd@0 753
wolffd@0 754 fclose(docfl);
wolffd@0 755 free(line);
wolffd@0 756 free(words);
wolffd@0 757 if(verbosity>=1) {
wolffd@0 758 fprintf(stdout, "OK. (%ld examples read)\n", dnum);
wolffd@0 759 }
wolffd@0 760 (*totdoc)=dnum;
wolffd@0 761 }
wolffd@0 762
wolffd@0 763 int parse_document(char *line, WORD *words, double *label,
wolffd@0 764 long *queryid, long *slackid, double *costfactor,
wolffd@0 765 long int *numwords, long int max_words_doc,
wolffd@0 766 char **comment)
wolffd@0 767 {
wolffd@0 768 register long wpos,pos;
wolffd@0 769 long wnum;
wolffd@0 770 double weight;
wolffd@0 771 int numread;
wolffd@0 772 char featurepair[1000],junk[1000];
wolffd@0 773
wolffd@0 774 (*queryid)=0;
wolffd@0 775 (*slackid)=0;
wolffd@0 776 (*costfactor)=1;
wolffd@0 777
wolffd@0 778 pos=0;
wolffd@0 779 (*comment)=NULL;
wolffd@0 780 while(line[pos] ) { /* cut off comments */
wolffd@0 781 if((line[pos] == '#') && (!(*comment))) {
wolffd@0 782 line[pos]=0;
wolffd@0 783 (*comment)=&(line[pos+1]);
wolffd@0 784 }
wolffd@0 785 if(line[pos] == '\n') { /* strip the CR */
wolffd@0 786 line[pos]=0;
wolffd@0 787 }
wolffd@0 788 pos++;
wolffd@0 789 }
wolffd@0 790 if(!(*comment)) (*comment)=&(line[pos]);
wolffd@0 791 /* printf("Comment: '%s'\n",(*comment)); */
wolffd@0 792
wolffd@0 793 wpos=0;
wolffd@0 794 /* check, that line starts with target value or zero, but not with
wolffd@0 795 feature pair */
wolffd@0 796 if(sscanf(line,"%s",featurepair) == EOF) return(0);
wolffd@0 797 pos=0;
wolffd@0 798 while((featurepair[pos] != ':') && featurepair[pos]) pos++;
wolffd@0 799 if(featurepair[pos] == ':') {
wolffd@0 800 perror ("Line must start with label or 0!!!\n");
wolffd@0 801 printf("LINE: %s\n",line);
wolffd@0 802 exit (1);
wolffd@0 803 }
wolffd@0 804 /* read the target value */
wolffd@0 805 if(sscanf(line,"%lf",label) == EOF) return(0);
wolffd@0 806 pos=0;
wolffd@0 807 while(space_or_null((int)line[pos])) pos++;
wolffd@0 808 while((!space_or_null((int)line[pos])) && line[pos]) pos++;
wolffd@0 809 while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
wolffd@0 810 (numread > 0) &&
wolffd@0 811 (wpos<max_words_doc)) {
wolffd@0 812 /* printf("%s\n",featurepair); */
wolffd@0 813 while(space_or_null((int)line[pos])) pos++;
wolffd@0 814 while((!space_or_null((int)line[pos])) && line[pos]) pos++;
wolffd@0 815 if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
wolffd@0 816 /* it is the query id */
wolffd@0 817 (*queryid)=(long)wnum;
wolffd@0 818 }
wolffd@0 819 else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
wolffd@0 820 /* it is the slack id */
wolffd@0 821 if(wnum > 0)
wolffd@0 822 (*slackid)=(long)wnum;
wolffd@0 823 else {
wolffd@0 824 perror ("Slack-id must be greater or equal to 1!!!\n");
wolffd@0 825 printf("LINE: %s\n",line);
wolffd@0 826 exit (1);
wolffd@0 827 }
wolffd@0 828 }
wolffd@0 829 else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
wolffd@0 830 /* it is the example-dependent cost factor */
wolffd@0 831 (*costfactor)=(double)weight;
wolffd@0 832 }
wolffd@0 833 else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
wolffd@0 834 /* it is a regular feature */
wolffd@0 835 if(wnum<=0) {
wolffd@0 836 perror ("Feature numbers must be larger or equal to 1!!!\n");
wolffd@0 837 printf("LINE: %s\n",line);
wolffd@0 838 exit (1);
wolffd@0 839 }
wolffd@0 840 if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
wolffd@0 841 perror ("Features must be in increasing order!!!\n");
wolffd@0 842 printf("LINE: %s\n",line);
wolffd@0 843 exit (1);
wolffd@0 844 }
wolffd@0 845 (words[wpos]).wnum=wnum;
wolffd@0 846 (words[wpos]).weight=(FVAL)weight;
wolffd@0 847 wpos++;
wolffd@0 848 }
wolffd@0 849 else {
wolffd@0 850 perror ("Cannot parse feature/value pair!!!\n");
wolffd@0 851 printf("'%s' in LINE: %s\n",featurepair,line);
wolffd@0 852 exit (1);
wolffd@0 853 }
wolffd@0 854 }
wolffd@0 855 (words[wpos]).wnum=0;
wolffd@0 856 (*numwords)=wpos+1;
wolffd@0 857 return(1);
wolffd@0 858 }
wolffd@0 859
wolffd@0 860 double *read_alphas(char *alphafile,long totdoc)
wolffd@0 861 /* reads the alpha vector from a file as written by the
wolffd@0 862 write_alphas function */
wolffd@0 863 {
wolffd@0 864 FILE *fl;
wolffd@0 865 double *alpha;
wolffd@0 866 long dnum;
wolffd@0 867
wolffd@0 868 if ((fl = fopen (alphafile, "r")) == NULL)
wolffd@0 869 { perror (alphafile); exit (1); }
wolffd@0 870
wolffd@0 871 alpha = (double *)my_malloc(sizeof(double)*totdoc);
wolffd@0 872 if(verbosity>=1) {
wolffd@0 873 printf("Reading alphas..."); fflush(stdout);
wolffd@0 874 }
wolffd@0 875 dnum=0;
wolffd@0 876 while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
wolffd@0 877 dnum++;
wolffd@0 878 }
wolffd@0 879 if(dnum != totdoc)
wolffd@0 880 { perror ("\nNot enough values in alpha file!"); exit (1); }
wolffd@0 881 fclose(fl);
wolffd@0 882
wolffd@0 883 if(verbosity>=1) {
wolffd@0 884 printf("done\n"); fflush(stdout);
wolffd@0 885 }
wolffd@0 886
wolffd@0 887 return(alpha);
wolffd@0 888 }
wolffd@0 889
void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
     /* Scans file once and reports
          *nol: 1 plus the number of newline characters,
          *wol: the maximum number of white space (or 0) characters
                in a line, the newline included,
          *ll:  the length of the longest line, the newline included.
        A last line that is not terminated by a newline is taken into
        account for *wol and *ll as well. Terminates the program if
        the file cannot be opened. */
{
  FILE *fl;
  int ic;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    current_length++;
    /* getc returns the byte as an unsigned char value; it is passed
       on unchanged, since going through a (possibly signed) char
       would make bytes >= 0x80 negative */
    if(space_or_null(ic)) {
      current_wol++;
    }
    if(ic == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
	(*ll)=current_length;
      }
      if(current_wol>(*wol)) {
	(*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* account for a final line without trailing newline */
  if(current_length>(*ll)) {
    (*ll)=current_length;
  }
  if(current_wol>(*wol)) {
    (*wol)=current_wol;
  }
  fclose(fl);
}
wolffd@0 926
long minl(long int a, long int b)
     /* returns the smaller of a and b */
{
  return (a < b) ? a : b;
}
wolffd@0 934
long maxl(long int a, long int b)
     /* returns the larger of a and b */
{
  return (a > b) ? a : b;
}
wolffd@0 942
long get_runtime(void)
     /* returns the value of clock() converted to hundredths of a
        second */
{
  clock_t ticks = clock();
  double seconds_x100 = (double)ticks * 100.0 / (double)CLOCKS_PER_SEC;

  return (long)seconds_x100;
}
wolffd@0 949
wolffd@0 950
# ifdef _MSC_VER

/* Only compiled when _MSC_VER is defined: provides isnan() as a
   wrapper around _isnan(). */
int isnan(double a)
{
  return _isnan(a);
}

# endif
wolffd@0 959
/* Returns non-zero if c is 0 or a white space character as defined
   by isspace(), and 0 otherwise.
   Callers pass plain char values converted to int, which are
   negative for bytes >= 0x80 where char is signed. isspace() is only
   defined for EOF and values representable as unsigned char, so c is
   converted to unsigned char first. */
int space_or_null(int c) {
  if (c==0)
    return 1;
  return isspace((unsigned char)c);
}
wolffd@0 965
void *my_malloc(size_t size)
     /* malloc wrapper that never returns NULL: if the allocation
        fails, a message is printed and the program is terminated.
        A request for 0 bytes is served with a 1 byte block, because
        malloc(0) is allowed to return NULL, which would otherwise be
        reported as running out of memory. The result is released
        with free(). */
{
  void *ptr;
  if(size == 0) {
    size=1;
  }
  ptr=(void *)malloc(size);
  if(!ptr) { 
    perror ("Out of memory!\n"); 
    exit (1); 
  }
  return(ptr);
}
wolffd@0 976
void copyright_notice(void)
     /* prints the copyright and licence notice to stdout */
{
  fputs("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n", stdout);
  fputs("This software is available for non-commercial use only. It must not\n", stdout);
  fputs("be modified and distributed without prior permission of the author.\n", stdout);
  fputs("The author is not responsible for implications from the use of this\n", stdout);
  fputs("software.\n\n", stdout);
}