Mercurial > hg > camir-aes2014
comparison toolboxes/SVM-light/src/svm_common.c @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
1 /************************************************************************/ | |
2 /* */ | |
3 /* svm_common.c */ | |
4 /* */ | |
5 /* Definitions and functions used in both svm_learn and svm_classify. */ | |
6 /* */ | |
7 /* Author: Thorsten Joachims */ | |
8 /* Date: 02.07.04 */ | |
9 /* */ | |
10 /* Copyright (c) 2004 Thorsten Joachims - All rights reserved */ | |
11 /* */ | |
12 /* This software is available for non-commercial use only. It must */ | |
13 /* not be modified and distributed without prior permission of the */ | |
14 /* author. The author is not responsible for implications from the */ | |
15 /* use of this software. */ | |
16 /* */ | |
17 /************************************************************************/ | |
18 | |
19 # include "ctype.h" | |
20 # include "svm_common.h" | |
21 # include "kernel.h" /* this contains a user supplied kernel */ | |
22 | |
23 long verbosity; /* verbosity level (0-4) */ | |
24 long kernel_cache_statistic; | |
25 | |
26 double classify_example(MODEL *model, DOC *ex) | |
27 /* classifies one example */ | |
28 { | |
29 register long i; | |
30 register double dist; | |
31 | |
32 if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights)) | |
33 return(classify_example_linear(model,ex)); | |
34 | |
35 dist=0; | |
36 for(i=1;i<model->sv_num;i++) { | |
37 dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i]; | |
38 } | |
39 return(dist-model->b); | |
40 } | |
41 | |
42 double classify_example_linear(MODEL *model, DOC *ex) | |
43 /* classifies example for linear kernel */ | |
44 | |
45 /* important: the model must have the linear weight vector computed */ | |
46 /* use: add_weight_vector_to_linear_model(&model); */ | |
47 | |
48 | |
49 /* important: the feature numbers in the example to classify must */ | |
50 /* not be larger than the weight vector! */ | |
51 { | |
52 double sum=0; | |
53 SVECTOR *f; | |
54 | |
55 for(f=ex->fvec;f;f=f->next) | |
56 sum+=f->factor*sprod_ns(model->lin_weights,f); | |
57 return(sum-model->b); | |
58 } | |
59 | |
60 | |
61 CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b) | |
62 /* calculate the kernel function */ | |
63 { | |
64 double sum=0; | |
65 SVECTOR *fa,*fb; | |
66 | |
67 /* in case the constraints are sums of feature vector as represented | |
68 as a list of SVECTOR's with their coefficient factor in the sum, | |
69 take the kernel between all pairs */ | |
70 for(fa=a->fvec;fa;fa=fa->next) { | |
71 for(fb=b->fvec;fb;fb=fb->next) { | |
72 if(fa->kernel_id == fb->kernel_id) | |
73 sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb); | |
74 } | |
75 } | |
76 return(sum); | |
77 } | |
78 | |
79 CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b) | |
80 /* calculate the kernel function between two vectors */ | |
81 { | |
82 kernel_cache_statistic++; | |
83 switch(kernel_parm->kernel_type) { | |
84 case 0: /* linear */ | |
85 return((CFLOAT)sprod_ss(a,b)); | |
86 case 1: /* polynomial */ | |
87 return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree)); | |
88 case 2: /* radial basis function */ | |
89 return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq))); | |
90 case 3: /* sigmoid neural net */ | |
91 return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const)); | |
92 case 4: /* custom-kernel supplied in file kernel.h*/ | |
93 return((CFLOAT)custom_kernel(kernel_parm,a,b)); | |
94 default: printf("Error: Unknown kernel function\n"); exit(1); | |
95 } | |
96 } | |
97 | |
98 | |
99 SVECTOR *create_svector(WORD *words,char *userdefined,double factor) | |
100 { | |
101 SVECTOR *vec; | |
102 long fnum,i; | |
103 | |
104 fnum=0; | |
105 while(words[fnum].wnum) { | |
106 fnum++; | |
107 } | |
108 fnum++; | |
109 vec = (SVECTOR *)my_malloc(sizeof(SVECTOR)); | |
110 vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum)); | |
111 for(i=0;i<fnum;i++) { | |
112 vec->words[i]=words[i]; | |
113 } | |
114 vec->twonorm_sq=sprod_ss(vec,vec); | |
115 | |
116 fnum=0; | |
117 while(userdefined[fnum]) { | |
118 fnum++; | |
119 } | |
120 fnum++; | |
121 vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum)); | |
122 for(i=0;i<fnum;i++) { | |
123 vec->userdefined[i]=userdefined[i]; | |
124 } | |
125 vec->kernel_id=0; | |
126 vec->next=NULL; | |
127 vec->factor=factor; | |
128 return(vec); | |
129 } | |
130 | |
131 SVECTOR *copy_svector(SVECTOR *vec) | |
132 { | |
133 SVECTOR *newvec=NULL; | |
134 if(vec) { | |
135 newvec=create_svector(vec->words,vec->userdefined,vec->factor); | |
136 newvec->next=copy_svector(vec->next); | |
137 } | |
138 return(newvec); | |
139 } | |
140 | |
141 void free_svector(SVECTOR *vec) | |
142 { | |
143 if(vec) { | |
144 free(vec->words); | |
145 if(vec->userdefined) | |
146 free(vec->userdefined); | |
147 free_svector(vec->next); | |
148 free(vec); | |
149 } | |
150 } | |
151 | |
152 double sprod_ss(SVECTOR *a, SVECTOR *b) | |
153 /* compute the inner product of two sparse vectors */ | |
154 { | |
155 register CFLOAT sum=0; | |
156 register WORD *ai,*bj; | |
157 ai=a->words; | |
158 bj=b->words; | |
159 while (ai->wnum && bj->wnum) { | |
160 if(ai->wnum > bj->wnum) { | |
161 bj++; | |
162 } | |
163 else if (ai->wnum < bj->wnum) { | |
164 ai++; | |
165 } | |
166 else { | |
167 sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight); | |
168 ai++; | |
169 bj++; | |
170 } | |
171 } | |
172 return((double)sum); | |
173 } | |
174 | |
175 SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b) | |
176 /* compute the difference a-b of two sparse vectors */ | |
177 /* Note: SVECTOR lists are not followed, but only the first | |
178 SVECTOR is used */ | |
179 { | |
180 SVECTOR *vec; | |
181 register WORD *sum,*sumi; | |
182 register WORD *ai,*bj; | |
183 long veclength; | |
184 | |
185 ai=a->words; | |
186 bj=b->words; | |
187 veclength=0; | |
188 while (ai->wnum && bj->wnum) { | |
189 if(ai->wnum > bj->wnum) { | |
190 veclength++; | |
191 bj++; | |
192 } | |
193 else if (ai->wnum < bj->wnum) { | |
194 veclength++; | |
195 ai++; | |
196 } | |
197 else { | |
198 veclength++; | |
199 ai++; | |
200 bj++; | |
201 } | |
202 } | |
203 while (bj->wnum) { | |
204 veclength++; | |
205 bj++; | |
206 } | |
207 while (ai->wnum) { | |
208 veclength++; | |
209 ai++; | |
210 } | |
211 veclength++; | |
212 | |
213 sum=(WORD *)my_malloc(sizeof(WORD)*veclength); | |
214 sumi=sum; | |
215 ai=a->words; | |
216 bj=b->words; | |
217 while (ai->wnum && bj->wnum) { | |
218 if(ai->wnum > bj->wnum) { | |
219 (*sumi)=(*bj); | |
220 sumi->weight*=(-1); | |
221 sumi++; | |
222 bj++; | |
223 } | |
224 else if (ai->wnum < bj->wnum) { | |
225 (*sumi)=(*ai); | |
226 sumi++; | |
227 ai++; | |
228 } | |
229 else { | |
230 (*sumi)=(*ai); | |
231 sumi->weight-=bj->weight; | |
232 if(sumi->weight != 0) | |
233 sumi++; | |
234 ai++; | |
235 bj++; | |
236 } | |
237 } | |
238 while (bj->wnum) { | |
239 (*sumi)=(*bj); | |
240 sumi->weight*=(-1); | |
241 sumi++; | |
242 bj++; | |
243 } | |
244 while (ai->wnum) { | |
245 (*sumi)=(*ai); | |
246 sumi++; | |
247 ai++; | |
248 } | |
249 sumi->wnum=0; | |
250 | |
251 vec=create_svector(sum,"",1.0); | |
252 free(sum); | |
253 | |
254 return(vec); | |
255 } | |
256 | |
257 SVECTOR* add_ss(SVECTOR *a, SVECTOR *b) | |
258 /* compute the sum a+b of two sparse vectors */ | |
259 /* Note: SVECTOR lists are not followed, but only the first | |
260 SVECTOR is used */ | |
261 { | |
262 SVECTOR *vec; | |
263 register WORD *sum,*sumi; | |
264 register WORD *ai,*bj; | |
265 long veclength; | |
266 | |
267 ai=a->words; | |
268 bj=b->words; | |
269 veclength=0; | |
270 while (ai->wnum && bj->wnum) { | |
271 if(ai->wnum > bj->wnum) { | |
272 veclength++; | |
273 bj++; | |
274 } | |
275 else if (ai->wnum < bj->wnum) { | |
276 veclength++; | |
277 ai++; | |
278 } | |
279 else { | |
280 veclength++; | |
281 ai++; | |
282 bj++; | |
283 } | |
284 } | |
285 while (bj->wnum) { | |
286 veclength++; | |
287 bj++; | |
288 } | |
289 while (ai->wnum) { | |
290 veclength++; | |
291 ai++; | |
292 } | |
293 veclength++; | |
294 | |
295 /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/ | |
296 | |
297 sum=(WORD *)my_malloc(sizeof(WORD)*veclength); | |
298 sumi=sum; | |
299 ai=a->words; | |
300 bj=b->words; | |
301 while (ai->wnum && bj->wnum) { | |
302 if(ai->wnum > bj->wnum) { | |
303 (*sumi)=(*bj); | |
304 sumi++; | |
305 bj++; | |
306 } | |
307 else if (ai->wnum < bj->wnum) { | |
308 (*sumi)=(*ai); | |
309 sumi++; | |
310 ai++; | |
311 } | |
312 else { | |
313 (*sumi)=(*ai); | |
314 sumi->weight+=bj->weight; | |
315 if(sumi->weight != 0) | |
316 sumi++; | |
317 ai++; | |
318 bj++; | |
319 } | |
320 } | |
321 while (bj->wnum) { | |
322 (*sumi)=(*bj); | |
323 sumi++; | |
324 bj++; | |
325 } | |
326 while (ai->wnum) { | |
327 (*sumi)=(*ai); | |
328 sumi++; | |
329 ai++; | |
330 } | |
331 sumi->wnum=0; | |
332 | |
333 vec=create_svector(sum,"",1.0); | |
334 free(sum); | |
335 | |
336 return(vec); | |
337 } | |
338 | |
339 SVECTOR* add_list_ss(SVECTOR *a) | |
340 /* computes the linear combination of the SVECTOR list weighted | |
341 by the factor of each SVECTOR */ | |
342 { | |
343 SVECTOR *scaled,*oldsum,*sum,*f; | |
344 WORD empty[2]; | |
345 | |
346 if(a){ | |
347 sum=smult_s(a,a->factor); | |
348 for(f=a->next;f;f=f->next) { | |
349 scaled=smult_s(f,f->factor); | |
350 oldsum=sum; | |
351 sum=add_ss(sum,scaled); | |
352 free_svector(oldsum); | |
353 free_svector(scaled); | |
354 } | |
355 sum->factor=1.0; | |
356 } | |
357 else { | |
358 empty[0].wnum=0; | |
359 sum=create_svector(empty,"",1.0); | |
360 } | |
361 return(sum); | |
362 } | |
363 | |
364 void append_svector_list(SVECTOR *a, SVECTOR *b) | |
365 /* appends SVECTOR b to the end of SVECTOR a. */ | |
366 { | |
367 SVECTOR *f; | |
368 | |
369 for(f=a;f->next;f=f->next); /* find end of first vector list */ | |
370 f->next=b; /* append the two vector lists */ | |
371 } | |
372 | |
373 SVECTOR* smult_s(SVECTOR *a, double factor) | |
374 /* scale sparse vector a by factor */ | |
375 { | |
376 SVECTOR *vec; | |
377 register WORD *sum,*sumi; | |
378 register WORD *ai; | |
379 long veclength; | |
380 | |
381 ai=a->words; | |
382 veclength=0; | |
383 while (ai->wnum) { | |
384 veclength++; | |
385 ai++; | |
386 } | |
387 veclength++; | |
388 | |
389 sum=(WORD *)my_malloc(sizeof(WORD)*veclength); | |
390 sumi=sum; | |
391 ai=a->words; | |
392 while (ai->wnum) { | |
393 (*sumi)=(*ai); | |
394 sumi->weight*=factor; | |
395 if(sumi->weight != 0) | |
396 sumi++; | |
397 ai++; | |
398 } | |
399 sumi->wnum=0; | |
400 | |
401 vec=create_svector(sum,a->userdefined,a->factor); | |
402 free(sum); | |
403 | |
404 return(vec); | |
405 } | |
406 | |
407 int featvec_eq(SVECTOR *a, SVECTOR *b) | |
408 /* tests two sparse vectors for equality */ | |
409 { | |
410 register WORD *ai,*bj; | |
411 ai=a->words; | |
412 bj=b->words; | |
413 while (ai->wnum && bj->wnum) { | |
414 if(ai->wnum > bj->wnum) { | |
415 if((CFLOAT)(bj->weight) != 0) | |
416 return(0); | |
417 bj++; | |
418 } | |
419 else if (ai->wnum < bj->wnum) { | |
420 if((CFLOAT)(ai->weight) != 0) | |
421 return(0); | |
422 ai++; | |
423 } | |
424 else { | |
425 if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight)) | |
426 return(0); | |
427 ai++; | |
428 bj++; | |
429 } | |
430 } | |
431 return(1); | |
432 } | |
433 | |
434 double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm) | |
435 /* compute length of weight vector */ | |
436 { | |
437 register long i,j; | |
438 register double sum=0,alphai; | |
439 register DOC *supveci; | |
440 | |
441 for(i=1;i<model->sv_num;i++) { | |
442 alphai=model->alpha[i]; | |
443 supveci=model->supvec[i]; | |
444 for(j=1;j<model->sv_num;j++) { | |
445 sum+=alphai*model->alpha[j] | |
446 *kernel(kernel_parm,supveci,model->supvec[j]); | |
447 } | |
448 } | |
449 return(sqrt(sum)); | |
450 } | |
451 | |
452 void clear_vector_n(double *vec, long int n) | |
453 { | |
454 register long i; | |
455 for(i=0;i<=n;i++) vec[i]=0; | |
456 } | |
457 | |
458 void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor) | |
459 { | |
460 register WORD *ai; | |
461 ai=vec_s->words; | |
462 while (ai->wnum) { | |
463 vec_n[ai->wnum]+=(faktor*ai->weight); | |
464 ai++; | |
465 } | |
466 } | |
467 | |
468 double sprod_ns(double *vec_n, SVECTOR *vec_s) | |
469 { | |
470 register double sum=0; | |
471 register WORD *ai; | |
472 ai=vec_s->words; | |
473 while (ai->wnum) { | |
474 sum+=(vec_n[ai->wnum]*ai->weight); | |
475 ai++; | |
476 } | |
477 return(sum); | |
478 } | |
479 | |
480 void add_weight_vector_to_linear_model(MODEL *model) | |
481 /* compute weight vector in linear case and add to model */ | |
482 { | |
483 long i; | |
484 SVECTOR *f; | |
485 | |
486 model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1)); | |
487 clear_vector_n(model->lin_weights,model->totwords); | |
488 for(i=1;i<model->sv_num;i++) { | |
489 for(f=(model->supvec[i])->fvec;f;f=f->next) | |
490 add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]); | |
491 } | |
492 } | |
493 | |
494 | |
495 DOC *create_example(long docnum, long queryid, long slackid, | |
496 double costfactor, SVECTOR *fvec) | |
497 { | |
498 DOC *example; | |
499 example = (DOC *)my_malloc(sizeof(DOC)); | |
500 example->docnum=docnum; | |
501 example->queryid=queryid; | |
502 example->slackid=slackid; | |
503 example->costfactor=costfactor; | |
504 example->fvec=fvec; | |
505 return(example); | |
506 } | |
507 | |
508 void free_example(DOC *example, long deep) | |
509 { | |
510 if(example) { | |
511 if(deep) { | |
512 if(example->fvec) | |
513 free_svector(example->fvec); | |
514 } | |
515 free(example); | |
516 } | |
517 } | |
518 | |
519 void write_model(char *modelfile, MODEL *model) | |
520 { | |
521 FILE *modelfl; | |
522 long j,i,sv_num; | |
523 SVECTOR *v; | |
524 | |
525 if(verbosity>=1) { | |
526 printf("Writing model file..."); fflush(stdout); | |
527 } | |
528 if ((modelfl = fopen (modelfile, "w")) == NULL) | |
529 { perror (modelfile); exit (1); } | |
530 fprintf(modelfl,"SVM-light Version %s\n",VERSION); | |
531 fprintf(modelfl,"%ld # kernel type\n", | |
532 model->kernel_parm.kernel_type); | |
533 fprintf(modelfl,"%ld # kernel parameter -d \n", | |
534 model->kernel_parm.poly_degree); | |
535 fprintf(modelfl,"%.8g # kernel parameter -g \n", | |
536 model->kernel_parm.rbf_gamma); | |
537 fprintf(modelfl,"%.8g # kernel parameter -s \n", | |
538 model->kernel_parm.coef_lin); | |
539 fprintf(modelfl,"%.8g # kernel parameter -r \n", | |
540 model->kernel_parm.coef_const); | |
541 fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom); | |
542 fprintf(modelfl,"%ld # highest feature index \n",model->totwords); | |
543 fprintf(modelfl,"%ld # number of training documents \n",model->totdoc); | |
544 | |
545 sv_num=1; | |
546 for(i=1;i<model->sv_num;i++) { | |
547 for(v=model->supvec[i]->fvec;v;v=v->next) | |
548 sv_num++; | |
549 } | |
550 fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num); | |
551 fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b); | |
552 | |
553 for(i=1;i<model->sv_num;i++) { | |
554 for(v=model->supvec[i]->fvec;v;v=v->next) { | |
555 fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor); | |
556 for (j=0; (v->words[j]).wnum; j++) { | |
557 fprintf(modelfl,"%ld:%.8g ", | |
558 (long)(v->words[j]).wnum, | |
559 (double)(v->words[j]).weight); | |
560 } | |
561 fprintf(modelfl,"#%s\n",v->userdefined); | |
562 /* NOTE: this could be made more efficient by summing the | |
563 alpha's of identical vectors before writing them to the | |
564 file. */ | |
565 } | |
566 } | |
567 fclose(modelfl); | |
568 if(verbosity>=1) { | |
569 printf("done\n"); | |
570 } | |
571 } | |
572 | |
573 | |
574 MODEL *read_model(char *modelfile) | |
575 { | |
576 FILE *modelfl; | |
577 long i,queryid,slackid; | |
578 double costfactor; | |
579 long max_sv,max_words,ll,wpos; | |
580 char *line,*comment; | |
581 WORD *words; | |
582 char version_buffer[100]; | |
583 MODEL *model; | |
584 | |
585 if(verbosity>=1) { | |
586 printf("Reading model..."); fflush(stdout); | |
587 } | |
588 | |
589 nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */ | |
590 max_words+=2; | |
591 ll+=2; | |
592 | |
593 words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10)); | |
594 line = (char *)my_malloc(sizeof(char)*ll); | |
595 model = (MODEL *)my_malloc(sizeof(MODEL)); | |
596 | |
597 if ((modelfl = fopen (modelfile, "r")) == NULL) | |
598 { perror (modelfile); exit (1); } | |
599 | |
600 fscanf(modelfl,"SVM-light Version %s\n",version_buffer); | |
601 if(strcmp(version_buffer,VERSION)) { | |
602 perror ("Version of model-file does not match version of svm_classify!"); | |
603 exit (1); | |
604 } | |
605 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type); | |
606 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree); | |
607 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma); | |
608 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin); | |
609 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const); | |
610 fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom); | |
611 | |
612 fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords); | |
613 fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc); | |
614 fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num); | |
615 fscanf(modelfl,"%lf%*[^\n]\n", &model->b); | |
616 | |
617 model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); | |
618 model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); | |
619 model->index=NULL; | |
620 model->lin_weights=NULL; | |
621 | |
622 for(i=1;i<model->sv_num;i++) { | |
623 fgets(line,(int)ll,modelfl); | |
624 if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, | |
625 &costfactor,&wpos,max_words,&comment)) { | |
626 printf("\nParsing error while reading model file in SV %ld!\n%s", | |
627 i,line); | |
628 exit(1); | |
629 } | |
630 model->supvec[i] = create_example(-1, | |
631 0,0, | |
632 0.0, | |
633 create_svector(words,comment,1.0)); | |
634 } | |
635 fclose(modelfl); | |
636 free(line); | |
637 free(words); | |
638 if(verbosity>=1) { | |
639 fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1)); | |
640 } | |
641 return(model); | |
642 } | |
643 | |
644 MODEL *copy_model(MODEL *model) | |
645 { | |
646 MODEL *newmodel; | |
647 long i; | |
648 | |
649 newmodel=(MODEL *)my_malloc(sizeof(MODEL)); | |
650 (*newmodel)=(*model); | |
651 newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); | |
652 newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); | |
653 newmodel->index = NULL; /* index is not copied */ | |
654 newmodel->supvec[0] = NULL; | |
655 newmodel->alpha[0] = 0; | |
656 for(i=1;i<model->sv_num;i++) { | |
657 newmodel->alpha[i]=model->alpha[i]; | |
658 newmodel->supvec[i]=create_example(model->supvec[i]->docnum, | |
659 model->supvec[i]->queryid,0, | |
660 model->supvec[i]->costfactor, | |
661 copy_svector(model->supvec[i]->fvec)); | |
662 } | |
663 if(model->lin_weights) { | |
664 newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1)); | |
665 for(i=0;i<model->totwords+1;i++) | |
666 newmodel->lin_weights[i]=model->lin_weights[i]; | |
667 } | |
668 return(newmodel); | |
669 } | |
670 | |
671 void free_model(MODEL *model, int deep) | |
672 { | |
673 long i; | |
674 | |
675 if(model->supvec) { | |
676 if(deep) { | |
677 for(i=1;i<model->sv_num;i++) { | |
678 free_example(model->supvec[i],1); | |
679 } | |
680 } | |
681 free(model->supvec); | |
682 } | |
683 if(model->alpha) free(model->alpha); | |
684 if(model->index) free(model->index); | |
685 if(model->lin_weights) free(model->lin_weights); | |
686 free(model); | |
687 } | |
688 | |
689 | |
690 void read_documents(char *docfile, DOC ***docs, double **label, | |
691 long int *totwords, long int *totdoc) | |
692 { | |
693 char *line,*comment; | |
694 WORD *words; | |
695 long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs; | |
696 long max_words_doc, ll; | |
697 double doc_label,costfactor; | |
698 FILE *docfl; | |
699 | |
700 if(verbosity>=1) { | |
701 printf("Scanning examples..."); fflush(stdout); | |
702 } | |
703 nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */ | |
704 max_words_doc+=2; | |
705 ll+=2; | |
706 max_docs+=2; | |
707 if(verbosity>=1) { | |
708 printf("done\n"); fflush(stdout); | |
709 } | |
710 | |
711 (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */ | |
712 (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */ | |
713 line = (char *)my_malloc(sizeof(char)*ll); | |
714 | |
715 if ((docfl = fopen (docfile, "r")) == NULL) | |
716 { perror (docfile); exit (1); } | |
717 | |
718 words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10)); | |
719 if(verbosity>=1) { | |
720 printf("Reading examples into memory..."); fflush(stdout); | |
721 } | |
722 dnum=0; | |
723 (*totwords)=0; | |
724 while((!feof(docfl)) && fgets(line,(int)ll,docfl)) { | |
725 if(line[0] == '#') continue; /* line contains comments */ | |
726 if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor, | |
727 &wpos,max_words_doc,&comment)) { | |
728 printf("\nParsing error in line %ld!\n%s",dnum,line); | |
729 exit(1); | |
730 } | |
731 (*label)[dnum]=doc_label; | |
732 /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */ | |
733 if(doc_label > 0) dpos++; | |
734 if (doc_label < 0) dneg++; | |
735 if (doc_label == 0) dunlab++; | |
736 if((wpos>1) && ((words[wpos-2]).wnum>(*totwords))) | |
737 (*totwords)=(words[wpos-2]).wnum; | |
738 if((*totwords) > MAXFEATNUM) { | |
739 printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n"); | |
740 printf("LINE: %s\n",line); | |
741 exit(1); | |
742 } | |
743 (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor, | |
744 create_svector(words,comment,1.0)); | |
745 /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */ | |
746 dnum++; | |
747 if(verbosity>=1) { | |
748 if((dnum % 100) == 0) { | |
749 printf("%ld..",dnum); fflush(stdout); | |
750 } | |
751 } | |
752 } | |
753 | |
754 fclose(docfl); | |
755 free(line); | |
756 free(words); | |
757 if(verbosity>=1) { | |
758 fprintf(stdout, "OK. (%ld examples read)\n", dnum); | |
759 } | |
760 (*totdoc)=dnum; | |
761 } | |
762 | |
763 int parse_document(char *line, WORD *words, double *label, | |
764 long *queryid, long *slackid, double *costfactor, | |
765 long int *numwords, long int max_words_doc, | |
766 char **comment) | |
767 { | |
768 register long wpos,pos; | |
769 long wnum; | |
770 double weight; | |
771 int numread; | |
772 char featurepair[1000],junk[1000]; | |
773 | |
774 (*queryid)=0; | |
775 (*slackid)=0; | |
776 (*costfactor)=1; | |
777 | |
778 pos=0; | |
779 (*comment)=NULL; | |
780 while(line[pos] ) { /* cut off comments */ | |
781 if((line[pos] == '#') && (!(*comment))) { | |
782 line[pos]=0; | |
783 (*comment)=&(line[pos+1]); | |
784 } | |
785 if(line[pos] == '\n') { /* strip the CR */ | |
786 line[pos]=0; | |
787 } | |
788 pos++; | |
789 } | |
790 if(!(*comment)) (*comment)=&(line[pos]); | |
791 /* printf("Comment: '%s'\n",(*comment)); */ | |
792 | |
793 wpos=0; | |
794 /* check, that line starts with target value or zero, but not with | |
795 feature pair */ | |
796 if(sscanf(line,"%s",featurepair) == EOF) return(0); | |
797 pos=0; | |
798 while((featurepair[pos] != ':') && featurepair[pos]) pos++; | |
799 if(featurepair[pos] == ':') { | |
800 perror ("Line must start with label or 0!!!\n"); | |
801 printf("LINE: %s\n",line); | |
802 exit (1); | |
803 } | |
804 /* read the target value */ | |
805 if(sscanf(line,"%lf",label) == EOF) return(0); | |
806 pos=0; | |
807 while(space_or_null((int)line[pos])) pos++; | |
808 while((!space_or_null((int)line[pos])) && line[pos]) pos++; | |
809 while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) && | |
810 (numread > 0) && | |
811 (wpos<max_words_doc)) { | |
812 /* printf("%s\n",featurepair); */ | |
813 while(space_or_null((int)line[pos])) pos++; | |
814 while((!space_or_null((int)line[pos])) && line[pos]) pos++; | |
815 if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) { | |
816 /* it is the query id */ | |
817 (*queryid)=(long)wnum; | |
818 } | |
819 else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) { | |
820 /* it is the slack id */ | |
821 if(wnum > 0) | |
822 (*slackid)=(long)wnum; | |
823 else { | |
824 perror ("Slack-id must be greater or equal to 1!!!\n"); | |
825 printf("LINE: %s\n",line); | |
826 exit (1); | |
827 } | |
828 } | |
829 else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) { | |
830 /* it is the example-dependent cost factor */ | |
831 (*costfactor)=(double)weight; | |
832 } | |
833 else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) { | |
834 /* it is a regular feature */ | |
835 if(wnum<=0) { | |
836 perror ("Feature numbers must be larger or equal to 1!!!\n"); | |
837 printf("LINE: %s\n",line); | |
838 exit (1); | |
839 } | |
840 if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) { | |
841 perror ("Features must be in increasing order!!!\n"); | |
842 printf("LINE: %s\n",line); | |
843 exit (1); | |
844 } | |
845 (words[wpos]).wnum=wnum; | |
846 (words[wpos]).weight=(FVAL)weight; | |
847 wpos++; | |
848 } | |
849 else { | |
850 perror ("Cannot parse feature/value pair!!!\n"); | |
851 printf("'%s' in LINE: %s\n",featurepair,line); | |
852 exit (1); | |
853 } | |
854 } | |
855 (words[wpos]).wnum=0; | |
856 (*numwords)=wpos+1; | |
857 return(1); | |
858 } | |
859 | |
860 double *read_alphas(char *alphafile,long totdoc) | |
861 /* reads the alpha vector from a file as written by the | |
862 write_alphas function */ | |
863 { | |
864 FILE *fl; | |
865 double *alpha; | |
866 long dnum; | |
867 | |
868 if ((fl = fopen (alphafile, "r")) == NULL) | |
869 { perror (alphafile); exit (1); } | |
870 | |
871 alpha = (double *)my_malloc(sizeof(double)*totdoc); | |
872 if(verbosity>=1) { | |
873 printf("Reading alphas..."); fflush(stdout); | |
874 } | |
875 dnum=0; | |
876 while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) { | |
877 dnum++; | |
878 } | |
879 if(dnum != totdoc) | |
880 { perror ("\nNot enough values in alpha file!"); exit (1); } | |
881 fclose(fl); | |
882 | |
883 if(verbosity>=1) { | |
884 printf("done\n"); fflush(stdout); | |
885 } | |
886 | |
887 return(alpha); | |
888 } | |
889 | |
890 void nol_ll(char *file, long int *nol, long int *wol, long int *ll) | |
891 /* Grep through file and count number of lines, maximum number of | |
892 spaces per line, and longest line. */ | |
893 { | |
894 FILE *fl; | |
895 int ic; | |
896 char c; | |
897 long current_length,current_wol; | |
898 | |
899 if ((fl = fopen (file, "r")) == NULL) | |
900 { perror (file); exit (1); } | |
901 current_length=0; | |
902 current_wol=0; | |
903 (*ll)=0; | |
904 (*nol)=1; | |
905 (*wol)=0; | |
906 while((ic=getc(fl)) != EOF) { | |
907 c=(char)ic; | |
908 current_length++; | |
909 if(space_or_null((int)c)) { | |
910 current_wol++; | |
911 } | |
912 if(c == '\n') { | |
913 (*nol)++; | |
914 if(current_length>(*ll)) { | |
915 (*ll)=current_length; | |
916 } | |
917 if(current_wol>(*wol)) { | |
918 (*wol)=current_wol; | |
919 } | |
920 current_length=0; | |
921 current_wol=0; | |
922 } | |
923 } | |
924 fclose(fl); | |
925 } | |
926 | |
927 long minl(long int a, long int b) | |
928 { | |
929 if(a<b) | |
930 return(a); | |
931 else | |
932 return(b); | |
933 } | |
934 | |
935 long maxl(long int a, long int b) | |
936 { | |
937 if(a>b) | |
938 return(a); | |
939 else | |
940 return(b); | |
941 } | |
942 | |
943 long get_runtime(void) | |
944 { | |
945 clock_t start; | |
946 start = clock(); | |
947 return((long)((double)start*100.0/(double)CLOCKS_PER_SEC)); | |
948 } | |
949 | |
950 | |
951 # ifdef _MSC_VER | |
952 | |
953 int isnan(double a) | |
954 { | |
955 return(_isnan(a)); | |
956 } | |
957 | |
958 # endif | |
959 | |
960 int space_or_null(int c) { | |
961 if (c==0) | |
962 return 1; | |
963 return isspace(c); | |
964 } | |
965 | |
966 void *my_malloc(size_t size) | |
967 { | |
968 void *ptr; | |
969 ptr=(void *)malloc(size); | |
970 if(!ptr) { | |
971 perror ("Out of memory!\n"); | |
972 exit (1); | |
973 } | |
974 return(ptr); | |
975 } | |
976 | |
977 void copyright_notice(void) | |
978 { | |
979 printf("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n"); | |
980 printf("This software is available for non-commercial use only. It must not\n"); | |
981 printf("be modified and distributed without prior permission of the author.\n"); | |
982 printf("The author is not responsible for implications from the use of this\n"); | |
983 printf("software.\n\n"); | |
984 } |