comparison toolboxes/SVM-light/src/svm_common.c @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e9a9cd732c1e
1 /************************************************************************/
2 /* */
3 /* svm_common.c */
4 /* */
5 /* Definitions and functions used in both svm_learn and svm_classify. */
6 /* */
7 /* Author: Thorsten Joachims */
8 /* Date: 02.07.04 */
9 /* */
10 /* Copyright (c) 2004 Thorsten Joachims - All rights reserved */
11 /* */
12 /* This software is available for non-commercial use only. It must */
13 /* not be modified and distributed without prior permission of the */
14 /* author. The author is not responsible for implications from the */
15 /* use of this software. */
16 /* */
17 /************************************************************************/
18
19 # include "ctype.h"
20 # include "svm_common.h"
21 # include "kernel.h" /* this contains a user supplied kernel */
22
long verbosity;              /* verbosity level (0-4) */
long kernel_cache_statistic; /* counts single_kernel() evaluations (incremented there) */
25
26 double classify_example(MODEL *model, DOC *ex)
27 /* classifies one example */
28 {
29 register long i;
30 register double dist;
31
32 if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights))
33 return(classify_example_linear(model,ex));
34
35 dist=0;
36 for(i=1;i<model->sv_num;i++) {
37 dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
38 }
39 return(dist-model->b);
40 }
41
42 double classify_example_linear(MODEL *model, DOC *ex)
43 /* classifies example for linear kernel */
44
45 /* important: the model must have the linear weight vector computed */
46 /* use: add_weight_vector_to_linear_model(&model); */
47
48
49 /* important: the feature numbers in the example to classify must */
50 /* not be larger than the weight vector! */
51 {
52 double sum=0;
53 SVECTOR *f;
54
55 for(f=ex->fvec;f;f=f->next)
56 sum+=f->factor*sprod_ns(model->lin_weights,f);
57 return(sum-model->b);
58 }
59
60
61 CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
62 /* calculate the kernel function */
63 {
64 double sum=0;
65 SVECTOR *fa,*fb;
66
67 /* in case the constraints are sums of feature vector as represented
68 as a list of SVECTOR's with their coefficient factor in the sum,
69 take the kernel between all pairs */
70 for(fa=a->fvec;fa;fa=fa->next) {
71 for(fb=b->fvec;fb;fb=fb->next) {
72 if(fa->kernel_id == fb->kernel_id)
73 sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb);
74 }
75 }
76 return(sum);
77 }
78
79 CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b)
80 /* calculate the kernel function between two vectors */
81 {
82 kernel_cache_statistic++;
83 switch(kernel_parm->kernel_type) {
84 case 0: /* linear */
85 return((CFLOAT)sprod_ss(a,b));
86 case 1: /* polynomial */
87 return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
88 case 2: /* radial basis function */
89 return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
90 case 3: /* sigmoid neural net */
91 return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const));
92 case 4: /* custom-kernel supplied in file kernel.h*/
93 return((CFLOAT)custom_kernel(kernel_parm,a,b));
94 default: printf("Error: Unknown kernel function\n"); exit(1);
95 }
96 }
97
98
99 SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
100 {
101 SVECTOR *vec;
102 long fnum,i;
103
104 fnum=0;
105 while(words[fnum].wnum) {
106 fnum++;
107 }
108 fnum++;
109 vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
110 vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum));
111 for(i=0;i<fnum;i++) {
112 vec->words[i]=words[i];
113 }
114 vec->twonorm_sq=sprod_ss(vec,vec);
115
116 fnum=0;
117 while(userdefined[fnum]) {
118 fnum++;
119 }
120 fnum++;
121 vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum));
122 for(i=0;i<fnum;i++) {
123 vec->userdefined[i]=userdefined[i];
124 }
125 vec->kernel_id=0;
126 vec->next=NULL;
127 vec->factor=factor;
128 return(vec);
129 }
130
131 SVECTOR *copy_svector(SVECTOR *vec)
132 {
133 SVECTOR *newvec=NULL;
134 if(vec) {
135 newvec=create_svector(vec->words,vec->userdefined,vec->factor);
136 newvec->next=copy_svector(vec->next);
137 }
138 return(newvec);
139 }
140
141 void free_svector(SVECTOR *vec)
142 {
143 if(vec) {
144 free(vec->words);
145 if(vec->userdefined)
146 free(vec->userdefined);
147 free_svector(vec->next);
148 free(vec);
149 }
150 }
151
152 double sprod_ss(SVECTOR *a, SVECTOR *b)
153 /* compute the inner product of two sparse vectors */
154 {
155 register CFLOAT sum=0;
156 register WORD *ai,*bj;
157 ai=a->words;
158 bj=b->words;
159 while (ai->wnum && bj->wnum) {
160 if(ai->wnum > bj->wnum) {
161 bj++;
162 }
163 else if (ai->wnum < bj->wnum) {
164 ai++;
165 }
166 else {
167 sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight);
168 ai++;
169 bj++;
170 }
171 }
172 return((double)sum);
173 }
174
SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b)
/* compute the difference a-b of two sparse vectors */
/* Note: SVECTOR lists are not followed, but only the first
   SVECTOR is used */
{
  SVECTOR *vec;
  register WORD *sum,*sumi;
  register WORD *ai,*bj;
  long veclength;

  /* first pass: count an upper bound on the number of result entries
     (one per distinct feature number in a or b) */
  ai=a->words;
  bj=b->words;
  veclength=0;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      veclength++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      veclength++;
      ai++;
    }
    else {
      veclength++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {
    veclength++;
    bj++;
  }
  while (ai->wnum) {
    veclength++;
    ai++;
  }
  veclength++; /* room for the terminating wnum==0 entry */

  /* second pass: merge-walk both sorted word lists, emitting a's
     weight, the negation of b's weight, or their difference */
  sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
  sumi=sum;
  ai=a->words;
  bj=b->words;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {       /* feature only in b: emit -weight */
      (*sumi)=(*bj);
      sumi->weight*=(-1);
      sumi++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) { /* feature only in a: copy as-is */
      (*sumi)=(*ai);
      sumi++;
      ai++;
    }
    else {                          /* feature in both: subtract */
      (*sumi)=(*ai);
      sumi->weight-=bj->weight;
      if(sumi->weight != 0)         /* drop features that cancel exactly */
	sumi++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {                /* tail of b: emit negated */
    (*sumi)=(*bj);
    sumi->weight*=(-1);
    sumi++;
    bj++;
  }
  while (ai->wnum) {                /* tail of a: copy as-is */
    (*sumi)=(*ai);
    sumi++;
    ai++;
  }
  sumi->wnum=0;                     /* terminate the word list */

  /* wrap the temporary array in a fresh SVECTOR and release it */
  vec=create_svector(sum,"",1.0);
  free(sum);

  return(vec);
}
256
SVECTOR* add_ss(SVECTOR *a, SVECTOR *b)
/* compute the sum a+b of two sparse vectors */
/* Note: SVECTOR lists are not followed, but only the first
   SVECTOR is used */
{
  SVECTOR *vec;
  register WORD *sum,*sumi;
  register WORD *ai,*bj;
  long veclength;

  /* first pass: count an upper bound on the number of result entries
     (one per distinct feature number in a or b) */
  ai=a->words;
  bj=b->words;
  veclength=0;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      veclength++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      veclength++;
      ai++;
    }
    else {
      veclength++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {
    veclength++;
    bj++;
  }
  while (ai->wnum) {
    veclength++;
    ai++;
  }
  veclength++; /* room for the terminating wnum==0 entry */

  /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/

  /* second pass: merge-walk both sorted word lists, copying entries
     from the vector with the smaller feature number, or adding the
     weights where both vectors share a feature */
  sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
  sumi=sum;
  ai=a->words;
  bj=b->words;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {       /* feature only in b */
      (*sumi)=(*bj);
      sumi++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) { /* feature only in a */
      (*sumi)=(*ai);
      sumi++;
      ai++;
    }
    else {                          /* feature in both: add weights */
      (*sumi)=(*ai);
      sumi->weight+=bj->weight;
      if(sumi->weight != 0)         /* drop features that cancel exactly */
	sumi++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {                /* tail of b */
    (*sumi)=(*bj);
    sumi++;
    bj++;
  }
  while (ai->wnum) {                /* tail of a */
    (*sumi)=(*ai);
    sumi++;
    ai++;
  }
  sumi->wnum=0;                     /* terminate the word list */

  /* wrap the temporary array in a fresh SVECTOR and release it */
  vec=create_svector(sum,"",1.0);
  free(sum);

  return(vec);
}
338
339 SVECTOR* add_list_ss(SVECTOR *a)
340 /* computes the linear combination of the SVECTOR list weighted
341 by the factor of each SVECTOR */
342 {
343 SVECTOR *scaled,*oldsum,*sum,*f;
344 WORD empty[2];
345
346 if(a){
347 sum=smult_s(a,a->factor);
348 for(f=a->next;f;f=f->next) {
349 scaled=smult_s(f,f->factor);
350 oldsum=sum;
351 sum=add_ss(sum,scaled);
352 free_svector(oldsum);
353 free_svector(scaled);
354 }
355 sum->factor=1.0;
356 }
357 else {
358 empty[0].wnum=0;
359 sum=create_svector(empty,"",1.0);
360 }
361 return(sum);
362 }
363
364 void append_svector_list(SVECTOR *a, SVECTOR *b)
365 /* appends SVECTOR b to the end of SVECTOR a. */
366 {
367 SVECTOR *f;
368
369 for(f=a;f->next;f=f->next); /* find end of first vector list */
370 f->next=b; /* append the two vector lists */
371 }
372
373 SVECTOR* smult_s(SVECTOR *a, double factor)
374 /* scale sparse vector a by factor */
375 {
376 SVECTOR *vec;
377 register WORD *sum,*sumi;
378 register WORD *ai;
379 long veclength;
380
381 ai=a->words;
382 veclength=0;
383 while (ai->wnum) {
384 veclength++;
385 ai++;
386 }
387 veclength++;
388
389 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
390 sumi=sum;
391 ai=a->words;
392 while (ai->wnum) {
393 (*sumi)=(*ai);
394 sumi->weight*=factor;
395 if(sumi->weight != 0)
396 sumi++;
397 ai++;
398 }
399 sumi->wnum=0;
400
401 vec=create_svector(sum,a->userdefined,a->factor);
402 free(sum);
403
404 return(vec);
405 }
406
407 int featvec_eq(SVECTOR *a, SVECTOR *b)
408 /* tests two sparse vectors for equality */
409 {
410 register WORD *ai,*bj;
411 ai=a->words;
412 bj=b->words;
413 while (ai->wnum && bj->wnum) {
414 if(ai->wnum > bj->wnum) {
415 if((CFLOAT)(bj->weight) != 0)
416 return(0);
417 bj++;
418 }
419 else if (ai->wnum < bj->wnum) {
420 if((CFLOAT)(ai->weight) != 0)
421 return(0);
422 ai++;
423 }
424 else {
425 if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight))
426 return(0);
427 ai++;
428 bj++;
429 }
430 }
431 return(1);
432 }
433
434 double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
435 /* compute length of weight vector */
436 {
437 register long i,j;
438 register double sum=0,alphai;
439 register DOC *supveci;
440
441 for(i=1;i<model->sv_num;i++) {
442 alphai=model->alpha[i];
443 supveci=model->supvec[i];
444 for(j=1;j<model->sv_num;j++) {
445 sum+=alphai*model->alpha[j]
446 *kernel(kernel_parm,supveci,model->supvec[j]);
447 }
448 }
449 return(sqrt(sum));
450 }
451
void clear_vector_n(double *vec, long int n)
/* Sets entries 0..n (inclusive, i.e. n+1 values) of vec to zero. */
{
  register long idx;

  for(idx=n;idx>=0;idx--)
    vec[idx]=0;
}
457
458 void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
459 {
460 register WORD *ai;
461 ai=vec_s->words;
462 while (ai->wnum) {
463 vec_n[ai->wnum]+=(faktor*ai->weight);
464 ai++;
465 }
466 }
467
468 double sprod_ns(double *vec_n, SVECTOR *vec_s)
469 {
470 register double sum=0;
471 register WORD *ai;
472 ai=vec_s->words;
473 while (ai->wnum) {
474 sum+=(vec_n[ai->wnum]*ai->weight);
475 ai++;
476 }
477 return(sum);
478 }
479
480 void add_weight_vector_to_linear_model(MODEL *model)
481 /* compute weight vector in linear case and add to model */
482 {
483 long i;
484 SVECTOR *f;
485
486 model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
487 clear_vector_n(model->lin_weights,model->totwords);
488 for(i=1;i<model->sv_num;i++) {
489 for(f=(model->supvec[i])->fvec;f;f=f->next)
490 add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]);
491 }
492 }
493
494
495 DOC *create_example(long docnum, long queryid, long slackid,
496 double costfactor, SVECTOR *fvec)
497 {
498 DOC *example;
499 example = (DOC *)my_malloc(sizeof(DOC));
500 example->docnum=docnum;
501 example->queryid=queryid;
502 example->slackid=slackid;
503 example->costfactor=costfactor;
504 example->fvec=fvec;
505 return(example);
506 }
507
508 void free_example(DOC *example, long deep)
509 {
510 if(example) {
511 if(deep) {
512 if(example->fvec)
513 free_svector(example->fvec);
514 }
515 free(example);
516 }
517 }
518
void write_model(char *modelfile, MODEL *model)
/* Writes model to modelfile in the SVM-light text format that
   read_model() parses back. Exits if the file cannot be opened. */
{
  FILE *modelfl;
  long j,i,sv_num;
  SVECTOR *v;

  if(verbosity>=1) {
    printf("Writing model file..."); fflush(stdout);
  }
  if ((modelfl = fopen (modelfile, "w")) == NULL)
  { perror (modelfile); exit (1); }
  /* header: version, kernel parameters, and corpus statistics */
  fprintf(modelfl,"SVM-light Version %s\n",VERSION);
  fprintf(modelfl,"%ld # kernel type\n",
	  model->kernel_parm.kernel_type);
  fprintf(modelfl,"%ld # kernel parameter -d \n",
	  model->kernel_parm.poly_degree);
  fprintf(modelfl,"%.8g # kernel parameter -g \n",
	  model->kernel_parm.rbf_gamma);
  fprintf(modelfl,"%.8g # kernel parameter -s \n",
	  model->kernel_parm.coef_lin);
  fprintf(modelfl,"%.8g # kernel parameter -r \n",
	  model->kernel_parm.coef_const);
  fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom);
  fprintf(modelfl,"%ld # highest feature index \n",model->totwords);
  fprintf(modelfl,"%ld # number of training documents \n",model->totdoc);

  /* one output line per SVECTOR in every support vector's list */
  sv_num=1;
  for(i=1;i<model->sv_num;i++) {
    for(v=model->supvec[i]->fvec;v;v=v->next)
      sv_num++;
  }
  fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num);
  fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);

  /* each SV line: alpha*factor, feature:value pairs, then the
     userdefined comment after '#' */
  for(i=1;i<model->sv_num;i++) {
    for(v=model->supvec[i]->fvec;v;v=v->next) {
      fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor);
      for (j=0; (v->words[j]).wnum; j++) {
	fprintf(modelfl,"%ld:%.8g ",
		(long)(v->words[j]).wnum,
		(double)(v->words[j]).weight);
      }
      fprintf(modelfl,"#%s\n",v->userdefined);
      /* NOTE: this could be made more efficient by summing the
	 alpha's of identical vectors before writing them to the
	 file. */
    }
  }
  fclose(modelfl);
  if(verbosity>=1) {
    printf("done\n");
  }
}
572
573
MODEL *read_model(char *modelfile)
/* Reads a model written by write_model() and returns a newly
   allocated MODEL. Exits on open failure, version mismatch, or a
   parse error in a support vector line. */
{
  FILE *modelfl;
  long i,queryid,slackid;
  double costfactor;
  long max_sv,max_words,ll,wpos;
  char *line,*comment;
  WORD *words;
  char version_buffer[100];
  MODEL *model;

  if(verbosity>=1) {
    printf("Reading model..."); fflush(stdout);
  }

  nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
  max_words+=2;
  ll+=2;

  words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
  line = (char *)my_malloc(sizeof(char)*ll);
  model = (MODEL *)my_malloc(sizeof(MODEL));

  if ((modelfl = fopen (modelfile, "r")) == NULL)
  { perror (modelfile); exit (1); }

  fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
  if(strcmp(version_buffer,VERSION)) {
    perror ("Version of model-file does not match version of svm_classify!");
    exit (1);
  }
  /* header fields; "%*[^\n]\n" skips the trailing comment on each
     line. NOTE(review): fscanf return values are not checked, so a
     truncated file silently yields uninitialized fields. */
  fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
  fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
  fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);

  fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
  fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
  fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->b);

  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
  model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
  model->index=NULL;        /* kernel cache index is rebuilt on demand */
  model->lin_weights=NULL;  /* computed later by add_weight_vector_to_linear_model */

  /* one support vector per remaining line; slot 0 is unused */
  for(i=1;i<model->sv_num;i++) {
    fgets(line,(int)ll,modelfl);
    if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
		       &costfactor,&wpos,max_words,&comment)) {
      printf("\nParsing error while reading model file in SV %ld!\n%s",
	     i,line);
      exit(1);
    }
    model->supvec[i] = create_example(-1,
				      0,0,
				      0.0,
				      create_svector(words,comment,1.0));
  }
  fclose(modelfl);
  free(line);
  free(words);
  if(verbosity>=1) {
    fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
  }
  return(model);
}
643
MODEL *copy_model(MODEL *model)
/* Returns a deep copy of model: scalar fields, alphas, support
   vectors, and (when present) the linear weight vector are
   duplicated. The kernel cache index is NOT copied. */
{
  MODEL *newmodel;
  long i;

  newmodel=(MODEL *)my_malloc(sizeof(MODEL));
  (*newmodel)=(*model);   /* copies all scalar fields; pointer fields
			     are replaced below */
  newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
  newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
  newmodel->index = NULL; /* index is not copied */
  newmodel->supvec[0] = NULL; /* slot 0 is unused by convention */
  newmodel->alpha[0] = 0;
  for(i=1;i<model->sv_num;i++) {
    newmodel->alpha[i]=model->alpha[i];
    /* NOTE(review): the copy's slackid is set to 0 rather than taken
       from the original support vector -- confirm this is intended. */
    newmodel->supvec[i]=create_example(model->supvec[i]->docnum,
				       model->supvec[i]->queryid,0,
				       model->supvec[i]->costfactor,
				       copy_svector(model->supvec[i]->fvec));
  }
  if(model->lin_weights) {
    newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
    for(i=0;i<model->totwords+1;i++)
      newmodel->lin_weights[i]=model->lin_weights[i];
  }
  return(newmodel);
}
670
671 void free_model(MODEL *model, int deep)
672 {
673 long i;
674
675 if(model->supvec) {
676 if(deep) {
677 for(i=1;i<model->sv_num;i++) {
678 free_example(model->supvec[i],1);
679 }
680 }
681 free(model->supvec);
682 }
683 if(model->alpha) free(model->alpha);
684 if(model->index) free(model->index);
685 if(model->lin_weights) free(model->lin_weights);
686 free(model);
687 }
688
689
void read_documents(char *docfile, DOC ***docs, double **label,
		    long int *totwords, long int *totdoc)
/* Reads all training examples from docfile. On return, *docs holds
   the example array, *label the target values, *totwords the highest
   feature number seen, and *totdoc the number of examples read.
   Exits on open or parse errors. */
{
  char *line,*comment;
  WORD *words;
  long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
  long max_words_doc, ll;
  double doc_label,costfactor;
  FILE *docfl;

  if(verbosity>=1) {
    printf("Scanning examples..."); fflush(stdout);
  }
  nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
  max_words_doc+=2;
  ll+=2;
  max_docs+=2;
  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }

  (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs);     /* feature vectors */
  (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
  line = (char *)my_malloc(sizeof(char)*ll);

  if ((docfl = fopen (docfile, "r")) == NULL)
  { perror (docfile); exit (1); }

  words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
  if(verbosity>=1) {
    printf("Reading examples into memory..."); fflush(stdout);
  }
  dnum=0;
  (*totwords)=0;
  while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
    if(line[0] == '#') continue;  /* line contains comments */
    if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
		       &wpos,max_words_doc,&comment)) {
      printf("\nParsing error in line %ld!\n%s",dnum,line);
      exit(1);
    }
    (*label)[dnum]=doc_label;
    /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
    /* tally the label distribution (counts are not reported here) */
    if(doc_label > 0) dpos++;
    if (doc_label < 0) dneg++;
    if (doc_label == 0) dunlab++;
    /* words[wpos-2] is the last real feature (wpos-1 is the
       terminator), so it carries this example's highest feature
       number -- feature numbers are required to be increasing */
    if((wpos>1) && ((words[wpos-2]).wnum>(*totwords)))
      (*totwords)=(words[wpos-2]).wnum;
    if((*totwords) > MAXFEATNUM) {
      printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
      printf("LINE: %s\n",line);
      exit(1);
    }
    (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
				   create_svector(words,comment,1.0));
    /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */
    dnum++;
    if(verbosity>=1) {
      if((dnum % 100) == 0) {
	printf("%ld..",dnum); fflush(stdout);
      }
    }
  }

  fclose(docfl);
  free(line);
  free(words);
  if(verbosity>=1) {
    fprintf(stdout, "OK. (%ld examples read)\n", dnum);
  }
  (*totdoc)=dnum;
}
762
int parse_document(char *line, WORD *words, double *label,
		   long *queryid, long *slackid, double *costfactor,
		   long int *numwords, long int max_words_doc,
		   char **comment)
/* Parses one example line of the form
     <label> [qid:<q>] [sid:<s>] [cost:<c>] <feat>:<val> ... [# comment]
   into words (zero-terminated), *label, *queryid, *slackid and
   *costfactor. *comment points into line (just after '#', or at the
   terminating '\0' if there is no comment). *numwords is set to the
   number of WORDs written, including the terminator.
   Returns 1 on success, 0 on an empty/unreadable line; exits on
   malformed input. NOTE: mutates line (the comment '#' and trailing
   newline are overwritten with '\0'). */
{
  register long wpos,pos;
  long wnum;
  double weight;
  int numread;
  char featurepair[1000],junk[1000];

  (*queryid)=0;
  (*slackid)=0;
  (*costfactor)=1;

  pos=0;
  (*comment)=NULL;
  while(line[pos] ) {      /* cut off comments */
    if((line[pos] == '#') && (!(*comment))) {
      line[pos]=0;
      (*comment)=&(line[pos+1]);
    }
    if(line[pos] == '\n') { /* strip the CR */
      line[pos]=0;
    }
    pos++;
  }
  if(!(*comment)) (*comment)=&(line[pos]);  /* no comment: empty string */
  /* printf("Comment: '%s'\n",(*comment)); */

  wpos=0;
  /* check, that line starts with target value or zero, but not with
     feature pair */
  if(sscanf(line,"%s",featurepair) == EOF) return(0);
  pos=0;
  while((featurepair[pos] != ':') && featurepair[pos]) pos++;
  if(featurepair[pos] == ':') {
    perror ("Line must start with label or 0!!!\n");
    printf("LINE: %s\n",line);
    exit (1);
  }
  /* read the target value */
  if(sscanf(line,"%lf",label) == EOF) return(0);
  /* skip past the label token */
  pos=0;
  while(space_or_null((int)line[pos])) pos++;
  while((!space_or_null((int)line[pos])) && line[pos]) pos++;
  /* scan the remaining whitespace-separated tokens */
  while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
	(numread > 0) &&
	(wpos<max_words_doc)) {
    /* printf("%s\n",featurepair); */
    /* advance pos past the token just scanned */
    while(space_or_null((int)line[pos])) pos++;
    while((!space_or_null((int)line[pos])) && line[pos]) pos++;
    if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
      /* it is the query id */
      (*queryid)=(long)wnum;
    }
    else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
      /* it is the slack id */
      if(wnum > 0)
	(*slackid)=(long)wnum;
      else {
	perror ("Slack-id must be greater or equal to 1!!!\n");
	printf("LINE: %s\n",line);
	exit (1);
      }
    }
    else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
      /* it is the example-dependent cost factor */
      (*costfactor)=(double)weight;
    }
    else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
      /* it is a regular feature */
      if(wnum<=0) {
	perror ("Feature numbers must be larger or equal to 1!!!\n");
	printf("LINE: %s\n",line);
	exit (1);
      }
      if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
	perror ("Features must be in increasing order!!!\n");
	printf("LINE: %s\n",line);
	exit (1);
      }
      (words[wpos]).wnum=wnum;
      (words[wpos]).weight=(FVAL)weight;
      wpos++;
    }
    else {
      perror ("Cannot parse feature/value pair!!!\n");
      printf("'%s' in LINE: %s\n",featurepair,line);
      exit (1);
    }
  }
  (words[wpos]).wnum=0;   /* terminate the word array */
  (*numwords)=wpos+1;
  return(1);
}
859
860 double *read_alphas(char *alphafile,long totdoc)
861 /* reads the alpha vector from a file as written by the
862 write_alphas function */
863 {
864 FILE *fl;
865 double *alpha;
866 long dnum;
867
868 if ((fl = fopen (alphafile, "r")) == NULL)
869 { perror (alphafile); exit (1); }
870
871 alpha = (double *)my_malloc(sizeof(double)*totdoc);
872 if(verbosity>=1) {
873 printf("Reading alphas..."); fflush(stdout);
874 }
875 dnum=0;
876 while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
877 dnum++;
878 }
879 if(dnum != totdoc)
880 { perror ("\nNot enough values in alpha file!"); exit (1); }
881 fclose(fl);
882
883 if(verbosity>=1) {
884 printf("done\n"); fflush(stdout);
885 }
886
887 return(alpha);
888 }
889
void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
/* Grep through file and count number of lines (*nol), maximum number
   of spaces per line (*wol), and longest line (*ll). These are used
   to size buffers for reading the file back. Exits if the file
   cannot be opened. */
{
  FILE *fl;
  int ic;
  char c;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
    { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    c=(char)ic;
    current_length++;
    if(space_or_null((int)c)) {
      current_wol++;
    }
    if(c == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
	(*ll)=current_length;
      }
      if(current_wol>(*wol)) {
	(*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* FIX: account for a final line that is not newline-terminated;
     the original ignored it, which could undersize buffers allocated
     from *ll and *wol downstream. */
  if(current_length>(*ll))
    (*ll)=current_length;
  if(current_wol>(*wol))
    (*wol)=current_wol;
  fclose(fl);
}
926
long minl(long int a, long int b)
/* Returns the smaller of a and b. */
{
  return((a < b) ? a : b);
}
934
long maxl(long int a, long int b)
/* Returns the larger of a and b. */
{
  return((a > b) ? a : b);
}
942
long get_runtime(void)
/* Returns the CPU time consumed so far, in hundredths of a second
   (based on clock()/CLOCKS_PER_SEC). */
{
  clock_t now=clock();

  return((long)((double)now*100.0/(double)CLOCKS_PER_SEC));
}
949
950
# ifdef _MSC_VER

/* MSVC compatibility shim: map the (pre-C99) missing isnan() to the
   Microsoft CRT's _isnan(). */
int isnan(double a)
{
  return(_isnan(a));
}

# endif
959
int space_or_null(int c) {
  /* Token separator test used by the line scanners: the terminating
     '\0' counts as whitespace, everything else defers to isspace(). */
  return (c == 0) ? 1 : isspace(c);
}
965
void *my_malloc(size_t size)
/* malloc() wrapper that terminates the program on allocation
   failure instead of returning NULL. */
{
  void *ptr=malloc(size);

  if(ptr == NULL) {
    perror ("Out of memory!\n");
    exit (1);
  }
  return(ptr);
}
976
void copyright_notice(void)
/* Prints the license notice to stdout. */
{
  fputs("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n",stdout);
  fputs("This software is available for non-commercial use only. It must not\n",stdout);
  fputs("be modified and distributed without prior permission of the author.\n",stdout);
  fputs("The author is not responsible for implications from the use of this\n",stdout);
  fputs("software.\n\n",stdout);
}