wolffd@0
|
1 /************************************************************************/
|
wolffd@0
|
2 /* */
|
wolffd@0
|
3 /* svm_common.c */
|
wolffd@0
|
4 /* */
|
wolffd@0
|
5 /* Definitions and functions used in both svm_learn and svm_classify. */
|
wolffd@0
|
6 /* */
|
wolffd@0
|
7 /* Author: Thorsten Joachims */
|
wolffd@0
|
8 /* Date: 02.07.04 */
|
wolffd@0
|
9 /* */
|
wolffd@0
|
10 /* Copyright (c) 2004 Thorsten Joachims - All rights reserved */
|
wolffd@0
|
11 /* */
|
wolffd@0
|
12 /* This software is available for non-commercial use only. It must */
|
wolffd@0
|
13 /* not be modified and distributed without prior permission of the */
|
wolffd@0
|
14 /* author. The author is not responsible for implications from the */
|
wolffd@0
|
15 /* use of this software. */
|
wolffd@0
|
16 /* */
|
wolffd@0
|
17 /************************************************************************/
|
wolffd@0
|
18
|
wolffd@0
|
19 # include "ctype.h"
|
wolffd@0
|
20 # include "svm_common.h"
|
wolffd@0
|
21 # include "kernel.h" /* this contains a user supplied kernel */
|
wolffd@0
|
22
|
wolffd@0
|
/* Global settings and counters shared by the functions in this file. */
long verbosity;              /* verbosity level (0-4) */
long kernel_cache_statistic; /* incremented once per single_kernel() call */
|
wolffd@0
|
25
|
wolffd@0
|
26 double classify_example(MODEL *model, DOC *ex)
|
wolffd@0
|
27 /* classifies one example */
|
wolffd@0
|
28 {
|
wolffd@0
|
29 register long i;
|
wolffd@0
|
30 register double dist;
|
wolffd@0
|
31
|
wolffd@0
|
32 if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights))
|
wolffd@0
|
33 return(classify_example_linear(model,ex));
|
wolffd@0
|
34
|
wolffd@0
|
35 dist=0;
|
wolffd@0
|
36 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
37 dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
|
wolffd@0
|
38 }
|
wolffd@0
|
39 return(dist-model->b);
|
wolffd@0
|
40 }
|
wolffd@0
|
41
|
wolffd@0
|
42 double classify_example_linear(MODEL *model, DOC *ex)
|
wolffd@0
|
43 /* classifies example for linear kernel */
|
wolffd@0
|
44
|
wolffd@0
|
45 /* important: the model must have the linear weight vector computed */
|
wolffd@0
|
46 /* use: add_weight_vector_to_linear_model(&model); */
|
wolffd@0
|
47
|
wolffd@0
|
48
|
wolffd@0
|
49 /* important: the feature numbers in the example to classify must */
|
wolffd@0
|
50 /* not be larger than the weight vector! */
|
wolffd@0
|
51 {
|
wolffd@0
|
52 double sum=0;
|
wolffd@0
|
53 SVECTOR *f;
|
wolffd@0
|
54
|
wolffd@0
|
55 for(f=ex->fvec;f;f=f->next)
|
wolffd@0
|
56 sum+=f->factor*sprod_ns(model->lin_weights,f);
|
wolffd@0
|
57 return(sum-model->b);
|
wolffd@0
|
58 }
|
wolffd@0
|
59
|
wolffd@0
|
60
|
wolffd@0
|
61 CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
|
wolffd@0
|
62 /* calculate the kernel function */
|
wolffd@0
|
63 {
|
wolffd@0
|
64 double sum=0;
|
wolffd@0
|
65 SVECTOR *fa,*fb;
|
wolffd@0
|
66
|
wolffd@0
|
67 /* in case the constraints are sums of feature vector as represented
|
wolffd@0
|
68 as a list of SVECTOR's with their coefficient factor in the sum,
|
wolffd@0
|
69 take the kernel between all pairs */
|
wolffd@0
|
70 for(fa=a->fvec;fa;fa=fa->next) {
|
wolffd@0
|
71 for(fb=b->fvec;fb;fb=fb->next) {
|
wolffd@0
|
72 if(fa->kernel_id == fb->kernel_id)
|
wolffd@0
|
73 sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb);
|
wolffd@0
|
74 }
|
wolffd@0
|
75 }
|
wolffd@0
|
76 return(sum);
|
wolffd@0
|
77 }
|
wolffd@0
|
78
|
wolffd@0
|
79 CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b)
|
wolffd@0
|
80 /* calculate the kernel function between two vectors */
|
wolffd@0
|
81 {
|
wolffd@0
|
82 kernel_cache_statistic++;
|
wolffd@0
|
83 switch(kernel_parm->kernel_type) {
|
wolffd@0
|
84 case 0: /* linear */
|
wolffd@0
|
85 return((CFLOAT)sprod_ss(a,b));
|
wolffd@0
|
86 case 1: /* polynomial */
|
wolffd@0
|
87 return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
|
wolffd@0
|
88 case 2: /* radial basis function */
|
wolffd@0
|
89 return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
|
wolffd@0
|
90 case 3: /* sigmoid neural net */
|
wolffd@0
|
91 return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const));
|
wolffd@0
|
92 case 4: /* custom-kernel supplied in file kernel.h*/
|
wolffd@0
|
93 return((CFLOAT)custom_kernel(kernel_parm,a,b));
|
wolffd@0
|
94 default: printf("Error: Unknown kernel function\n"); exit(1);
|
wolffd@0
|
95 }
|
wolffd@0
|
96 }
|
wolffd@0
|
97
|
wolffd@0
|
98
|
wolffd@0
|
99 SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
|
wolffd@0
|
100 {
|
wolffd@0
|
101 SVECTOR *vec;
|
wolffd@0
|
102 long fnum,i;
|
wolffd@0
|
103
|
wolffd@0
|
104 fnum=0;
|
wolffd@0
|
105 while(words[fnum].wnum) {
|
wolffd@0
|
106 fnum++;
|
wolffd@0
|
107 }
|
wolffd@0
|
108 fnum++;
|
wolffd@0
|
109 vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
|
wolffd@0
|
110 vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum));
|
wolffd@0
|
111 for(i=0;i<fnum;i++) {
|
wolffd@0
|
112 vec->words[i]=words[i];
|
wolffd@0
|
113 }
|
wolffd@0
|
114 vec->twonorm_sq=sprod_ss(vec,vec);
|
wolffd@0
|
115
|
wolffd@0
|
116 fnum=0;
|
wolffd@0
|
117 while(userdefined[fnum]) {
|
wolffd@0
|
118 fnum++;
|
wolffd@0
|
119 }
|
wolffd@0
|
120 fnum++;
|
wolffd@0
|
121 vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum));
|
wolffd@0
|
122 for(i=0;i<fnum;i++) {
|
wolffd@0
|
123 vec->userdefined[i]=userdefined[i];
|
wolffd@0
|
124 }
|
wolffd@0
|
125 vec->kernel_id=0;
|
wolffd@0
|
126 vec->next=NULL;
|
wolffd@0
|
127 vec->factor=factor;
|
wolffd@0
|
128 return(vec);
|
wolffd@0
|
129 }
|
wolffd@0
|
130
|
wolffd@0
|
131 SVECTOR *copy_svector(SVECTOR *vec)
|
wolffd@0
|
132 {
|
wolffd@0
|
133 SVECTOR *newvec=NULL;
|
wolffd@0
|
134 if(vec) {
|
wolffd@0
|
135 newvec=create_svector(vec->words,vec->userdefined,vec->factor);
|
wolffd@0
|
136 newvec->next=copy_svector(vec->next);
|
wolffd@0
|
137 }
|
wolffd@0
|
138 return(newvec);
|
wolffd@0
|
139 }
|
wolffd@0
|
140
|
wolffd@0
|
141 void free_svector(SVECTOR *vec)
|
wolffd@0
|
142 {
|
wolffd@0
|
143 if(vec) {
|
wolffd@0
|
144 free(vec->words);
|
wolffd@0
|
145 if(vec->userdefined)
|
wolffd@0
|
146 free(vec->userdefined);
|
wolffd@0
|
147 free_svector(vec->next);
|
wolffd@0
|
148 free(vec);
|
wolffd@0
|
149 }
|
wolffd@0
|
150 }
|
wolffd@0
|
151
|
wolffd@0
|
152 double sprod_ss(SVECTOR *a, SVECTOR *b)
|
wolffd@0
|
153 /* compute the inner product of two sparse vectors */
|
wolffd@0
|
154 {
|
wolffd@0
|
155 register CFLOAT sum=0;
|
wolffd@0
|
156 register WORD *ai,*bj;
|
wolffd@0
|
157 ai=a->words;
|
wolffd@0
|
158 bj=b->words;
|
wolffd@0
|
159 while (ai->wnum && bj->wnum) {
|
wolffd@0
|
160 if(ai->wnum > bj->wnum) {
|
wolffd@0
|
161 bj++;
|
wolffd@0
|
162 }
|
wolffd@0
|
163 else if (ai->wnum < bj->wnum) {
|
wolffd@0
|
164 ai++;
|
wolffd@0
|
165 }
|
wolffd@0
|
166 else {
|
wolffd@0
|
167 sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight);
|
wolffd@0
|
168 ai++;
|
wolffd@0
|
169 bj++;
|
wolffd@0
|
170 }
|
wolffd@0
|
171 }
|
wolffd@0
|
172 return((double)sum);
|
wolffd@0
|
173 }
|
wolffd@0
|
174
|
wolffd@0
|
175 SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b)
|
wolffd@0
|
176 /* compute the difference a-b of two sparse vectors */
|
wolffd@0
|
177 /* Note: SVECTOR lists are not followed, but only the first
|
wolffd@0
|
178 SVECTOR is used */
|
wolffd@0
|
179 {
|
wolffd@0
|
180 SVECTOR *vec;
|
wolffd@0
|
181 register WORD *sum,*sumi;
|
wolffd@0
|
182 register WORD *ai,*bj;
|
wolffd@0
|
183 long veclength;
|
wolffd@0
|
184
|
wolffd@0
|
185 ai=a->words;
|
wolffd@0
|
186 bj=b->words;
|
wolffd@0
|
187 veclength=0;
|
wolffd@0
|
188 while (ai->wnum && bj->wnum) {
|
wolffd@0
|
189 if(ai->wnum > bj->wnum) {
|
wolffd@0
|
190 veclength++;
|
wolffd@0
|
191 bj++;
|
wolffd@0
|
192 }
|
wolffd@0
|
193 else if (ai->wnum < bj->wnum) {
|
wolffd@0
|
194 veclength++;
|
wolffd@0
|
195 ai++;
|
wolffd@0
|
196 }
|
wolffd@0
|
197 else {
|
wolffd@0
|
198 veclength++;
|
wolffd@0
|
199 ai++;
|
wolffd@0
|
200 bj++;
|
wolffd@0
|
201 }
|
wolffd@0
|
202 }
|
wolffd@0
|
203 while (bj->wnum) {
|
wolffd@0
|
204 veclength++;
|
wolffd@0
|
205 bj++;
|
wolffd@0
|
206 }
|
wolffd@0
|
207 while (ai->wnum) {
|
wolffd@0
|
208 veclength++;
|
wolffd@0
|
209 ai++;
|
wolffd@0
|
210 }
|
wolffd@0
|
211 veclength++;
|
wolffd@0
|
212
|
wolffd@0
|
213 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
|
wolffd@0
|
214 sumi=sum;
|
wolffd@0
|
215 ai=a->words;
|
wolffd@0
|
216 bj=b->words;
|
wolffd@0
|
217 while (ai->wnum && bj->wnum) {
|
wolffd@0
|
218 if(ai->wnum > bj->wnum) {
|
wolffd@0
|
219 (*sumi)=(*bj);
|
wolffd@0
|
220 sumi->weight*=(-1);
|
wolffd@0
|
221 sumi++;
|
wolffd@0
|
222 bj++;
|
wolffd@0
|
223 }
|
wolffd@0
|
224 else if (ai->wnum < bj->wnum) {
|
wolffd@0
|
225 (*sumi)=(*ai);
|
wolffd@0
|
226 sumi++;
|
wolffd@0
|
227 ai++;
|
wolffd@0
|
228 }
|
wolffd@0
|
229 else {
|
wolffd@0
|
230 (*sumi)=(*ai);
|
wolffd@0
|
231 sumi->weight-=bj->weight;
|
wolffd@0
|
232 if(sumi->weight != 0)
|
wolffd@0
|
233 sumi++;
|
wolffd@0
|
234 ai++;
|
wolffd@0
|
235 bj++;
|
wolffd@0
|
236 }
|
wolffd@0
|
237 }
|
wolffd@0
|
238 while (bj->wnum) {
|
wolffd@0
|
239 (*sumi)=(*bj);
|
wolffd@0
|
240 sumi->weight*=(-1);
|
wolffd@0
|
241 sumi++;
|
wolffd@0
|
242 bj++;
|
wolffd@0
|
243 }
|
wolffd@0
|
244 while (ai->wnum) {
|
wolffd@0
|
245 (*sumi)=(*ai);
|
wolffd@0
|
246 sumi++;
|
wolffd@0
|
247 ai++;
|
wolffd@0
|
248 }
|
wolffd@0
|
249 sumi->wnum=0;
|
wolffd@0
|
250
|
wolffd@0
|
251 vec=create_svector(sum,"",1.0);
|
wolffd@0
|
252 free(sum);
|
wolffd@0
|
253
|
wolffd@0
|
254 return(vec);
|
wolffd@0
|
255 }
|
wolffd@0
|
256
|
wolffd@0
|
257 SVECTOR* add_ss(SVECTOR *a, SVECTOR *b)
|
wolffd@0
|
258 /* compute the sum a+b of two sparse vectors */
|
wolffd@0
|
259 /* Note: SVECTOR lists are not followed, but only the first
|
wolffd@0
|
260 SVECTOR is used */
|
wolffd@0
|
261 {
|
wolffd@0
|
262 SVECTOR *vec;
|
wolffd@0
|
263 register WORD *sum,*sumi;
|
wolffd@0
|
264 register WORD *ai,*bj;
|
wolffd@0
|
265 long veclength;
|
wolffd@0
|
266
|
wolffd@0
|
267 ai=a->words;
|
wolffd@0
|
268 bj=b->words;
|
wolffd@0
|
269 veclength=0;
|
wolffd@0
|
270 while (ai->wnum && bj->wnum) {
|
wolffd@0
|
271 if(ai->wnum > bj->wnum) {
|
wolffd@0
|
272 veclength++;
|
wolffd@0
|
273 bj++;
|
wolffd@0
|
274 }
|
wolffd@0
|
275 else if (ai->wnum < bj->wnum) {
|
wolffd@0
|
276 veclength++;
|
wolffd@0
|
277 ai++;
|
wolffd@0
|
278 }
|
wolffd@0
|
279 else {
|
wolffd@0
|
280 veclength++;
|
wolffd@0
|
281 ai++;
|
wolffd@0
|
282 bj++;
|
wolffd@0
|
283 }
|
wolffd@0
|
284 }
|
wolffd@0
|
285 while (bj->wnum) {
|
wolffd@0
|
286 veclength++;
|
wolffd@0
|
287 bj++;
|
wolffd@0
|
288 }
|
wolffd@0
|
289 while (ai->wnum) {
|
wolffd@0
|
290 veclength++;
|
wolffd@0
|
291 ai++;
|
wolffd@0
|
292 }
|
wolffd@0
|
293 veclength++;
|
wolffd@0
|
294
|
wolffd@0
|
295 /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/
|
wolffd@0
|
296
|
wolffd@0
|
297 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
|
wolffd@0
|
298 sumi=sum;
|
wolffd@0
|
299 ai=a->words;
|
wolffd@0
|
300 bj=b->words;
|
wolffd@0
|
301 while (ai->wnum && bj->wnum) {
|
wolffd@0
|
302 if(ai->wnum > bj->wnum) {
|
wolffd@0
|
303 (*sumi)=(*bj);
|
wolffd@0
|
304 sumi++;
|
wolffd@0
|
305 bj++;
|
wolffd@0
|
306 }
|
wolffd@0
|
307 else if (ai->wnum < bj->wnum) {
|
wolffd@0
|
308 (*sumi)=(*ai);
|
wolffd@0
|
309 sumi++;
|
wolffd@0
|
310 ai++;
|
wolffd@0
|
311 }
|
wolffd@0
|
312 else {
|
wolffd@0
|
313 (*sumi)=(*ai);
|
wolffd@0
|
314 sumi->weight+=bj->weight;
|
wolffd@0
|
315 if(sumi->weight != 0)
|
wolffd@0
|
316 sumi++;
|
wolffd@0
|
317 ai++;
|
wolffd@0
|
318 bj++;
|
wolffd@0
|
319 }
|
wolffd@0
|
320 }
|
wolffd@0
|
321 while (bj->wnum) {
|
wolffd@0
|
322 (*sumi)=(*bj);
|
wolffd@0
|
323 sumi++;
|
wolffd@0
|
324 bj++;
|
wolffd@0
|
325 }
|
wolffd@0
|
326 while (ai->wnum) {
|
wolffd@0
|
327 (*sumi)=(*ai);
|
wolffd@0
|
328 sumi++;
|
wolffd@0
|
329 ai++;
|
wolffd@0
|
330 }
|
wolffd@0
|
331 sumi->wnum=0;
|
wolffd@0
|
332
|
wolffd@0
|
333 vec=create_svector(sum,"",1.0);
|
wolffd@0
|
334 free(sum);
|
wolffd@0
|
335
|
wolffd@0
|
336 return(vec);
|
wolffd@0
|
337 }
|
wolffd@0
|
338
|
wolffd@0
|
339 SVECTOR* add_list_ss(SVECTOR *a)
|
wolffd@0
|
340 /* computes the linear combination of the SVECTOR list weighted
|
wolffd@0
|
341 by the factor of each SVECTOR */
|
wolffd@0
|
342 {
|
wolffd@0
|
343 SVECTOR *scaled,*oldsum,*sum,*f;
|
wolffd@0
|
344 WORD empty[2];
|
wolffd@0
|
345
|
wolffd@0
|
346 if(a){
|
wolffd@0
|
347 sum=smult_s(a,a->factor);
|
wolffd@0
|
348 for(f=a->next;f;f=f->next) {
|
wolffd@0
|
349 scaled=smult_s(f,f->factor);
|
wolffd@0
|
350 oldsum=sum;
|
wolffd@0
|
351 sum=add_ss(sum,scaled);
|
wolffd@0
|
352 free_svector(oldsum);
|
wolffd@0
|
353 free_svector(scaled);
|
wolffd@0
|
354 }
|
wolffd@0
|
355 sum->factor=1.0;
|
wolffd@0
|
356 }
|
wolffd@0
|
357 else {
|
wolffd@0
|
358 empty[0].wnum=0;
|
wolffd@0
|
359 sum=create_svector(empty,"",1.0);
|
wolffd@0
|
360 }
|
wolffd@0
|
361 return(sum);
|
wolffd@0
|
362 }
|
wolffd@0
|
363
|
wolffd@0
|
364 void append_svector_list(SVECTOR *a, SVECTOR *b)
|
wolffd@0
|
365 /* appends SVECTOR b to the end of SVECTOR a. */
|
wolffd@0
|
366 {
|
wolffd@0
|
367 SVECTOR *f;
|
wolffd@0
|
368
|
wolffd@0
|
369 for(f=a;f->next;f=f->next); /* find end of first vector list */
|
wolffd@0
|
370 f->next=b; /* append the two vector lists */
|
wolffd@0
|
371 }
|
wolffd@0
|
372
|
wolffd@0
|
373 SVECTOR* smult_s(SVECTOR *a, double factor)
|
wolffd@0
|
374 /* scale sparse vector a by factor */
|
wolffd@0
|
375 {
|
wolffd@0
|
376 SVECTOR *vec;
|
wolffd@0
|
377 register WORD *sum,*sumi;
|
wolffd@0
|
378 register WORD *ai;
|
wolffd@0
|
379 long veclength;
|
wolffd@0
|
380
|
wolffd@0
|
381 ai=a->words;
|
wolffd@0
|
382 veclength=0;
|
wolffd@0
|
383 while (ai->wnum) {
|
wolffd@0
|
384 veclength++;
|
wolffd@0
|
385 ai++;
|
wolffd@0
|
386 }
|
wolffd@0
|
387 veclength++;
|
wolffd@0
|
388
|
wolffd@0
|
389 sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
|
wolffd@0
|
390 sumi=sum;
|
wolffd@0
|
391 ai=a->words;
|
wolffd@0
|
392 while (ai->wnum) {
|
wolffd@0
|
393 (*sumi)=(*ai);
|
wolffd@0
|
394 sumi->weight*=factor;
|
wolffd@0
|
395 if(sumi->weight != 0)
|
wolffd@0
|
396 sumi++;
|
wolffd@0
|
397 ai++;
|
wolffd@0
|
398 }
|
wolffd@0
|
399 sumi->wnum=0;
|
wolffd@0
|
400
|
wolffd@0
|
401 vec=create_svector(sum,a->userdefined,a->factor);
|
wolffd@0
|
402 free(sum);
|
wolffd@0
|
403
|
wolffd@0
|
404 return(vec);
|
wolffd@0
|
405 }
|
wolffd@0
|
406
|
wolffd@0
|
407 int featvec_eq(SVECTOR *a, SVECTOR *b)
|
wolffd@0
|
408 /* tests two sparse vectors for equality */
|
wolffd@0
|
409 {
|
wolffd@0
|
410 register WORD *ai,*bj;
|
wolffd@0
|
411 ai=a->words;
|
wolffd@0
|
412 bj=b->words;
|
wolffd@0
|
413 while (ai->wnum && bj->wnum) {
|
wolffd@0
|
414 if(ai->wnum > bj->wnum) {
|
wolffd@0
|
415 if((CFLOAT)(bj->weight) != 0)
|
wolffd@0
|
416 return(0);
|
wolffd@0
|
417 bj++;
|
wolffd@0
|
418 }
|
wolffd@0
|
419 else if (ai->wnum < bj->wnum) {
|
wolffd@0
|
420 if((CFLOAT)(ai->weight) != 0)
|
wolffd@0
|
421 return(0);
|
wolffd@0
|
422 ai++;
|
wolffd@0
|
423 }
|
wolffd@0
|
424 else {
|
wolffd@0
|
425 if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight))
|
wolffd@0
|
426 return(0);
|
wolffd@0
|
427 ai++;
|
wolffd@0
|
428 bj++;
|
wolffd@0
|
429 }
|
wolffd@0
|
430 }
|
wolffd@0
|
431 return(1);
|
wolffd@0
|
432 }
|
wolffd@0
|
433
|
wolffd@0
|
434 double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
|
wolffd@0
|
435 /* compute length of weight vector */
|
wolffd@0
|
436 {
|
wolffd@0
|
437 register long i,j;
|
wolffd@0
|
438 register double sum=0,alphai;
|
wolffd@0
|
439 register DOC *supveci;
|
wolffd@0
|
440
|
wolffd@0
|
441 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
442 alphai=model->alpha[i];
|
wolffd@0
|
443 supveci=model->supvec[i];
|
wolffd@0
|
444 for(j=1;j<model->sv_num;j++) {
|
wolffd@0
|
445 sum+=alphai*model->alpha[j]
|
wolffd@0
|
446 *kernel(kernel_parm,supveci,model->supvec[j]);
|
wolffd@0
|
447 }
|
wolffd@0
|
448 }
|
wolffd@0
|
449 return(sqrt(sum));
|
wolffd@0
|
450 }
|
wolffd@0
|
451
|
wolffd@0
|
void clear_vector_n(double *vec, long int n)
     /* Sets vec[0] through vec[n] to zero. The upper bound is
        inclusive, so n+1 elements are written. */
{
  register double *cur=vec;
  register long remaining;

  for(remaining=n;remaining>=0;remaining--) {
    *cur++=0;
  }
}
|
wolffd@0
|
457
|
wolffd@0
|
458 void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
|
wolffd@0
|
459 {
|
wolffd@0
|
460 register WORD *ai;
|
wolffd@0
|
461 ai=vec_s->words;
|
wolffd@0
|
462 while (ai->wnum) {
|
wolffd@0
|
463 vec_n[ai->wnum]+=(faktor*ai->weight);
|
wolffd@0
|
464 ai++;
|
wolffd@0
|
465 }
|
wolffd@0
|
466 }
|
wolffd@0
|
467
|
wolffd@0
|
468 double sprod_ns(double *vec_n, SVECTOR *vec_s)
|
wolffd@0
|
469 {
|
wolffd@0
|
470 register double sum=0;
|
wolffd@0
|
471 register WORD *ai;
|
wolffd@0
|
472 ai=vec_s->words;
|
wolffd@0
|
473 while (ai->wnum) {
|
wolffd@0
|
474 sum+=(vec_n[ai->wnum]*ai->weight);
|
wolffd@0
|
475 ai++;
|
wolffd@0
|
476 }
|
wolffd@0
|
477 return(sum);
|
wolffd@0
|
478 }
|
wolffd@0
|
479
|
wolffd@0
|
480 void add_weight_vector_to_linear_model(MODEL *model)
|
wolffd@0
|
481 /* compute weight vector in linear case and add to model */
|
wolffd@0
|
482 {
|
wolffd@0
|
483 long i;
|
wolffd@0
|
484 SVECTOR *f;
|
wolffd@0
|
485
|
wolffd@0
|
486 model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
|
wolffd@0
|
487 clear_vector_n(model->lin_weights,model->totwords);
|
wolffd@0
|
488 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
489 for(f=(model->supvec[i])->fvec;f;f=f->next)
|
wolffd@0
|
490 add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]);
|
wolffd@0
|
491 }
|
wolffd@0
|
492 }
|
wolffd@0
|
493
|
wolffd@0
|
494
|
wolffd@0
|
495 DOC *create_example(long docnum, long queryid, long slackid,
|
wolffd@0
|
496 double costfactor, SVECTOR *fvec)
|
wolffd@0
|
497 {
|
wolffd@0
|
498 DOC *example;
|
wolffd@0
|
499 example = (DOC *)my_malloc(sizeof(DOC));
|
wolffd@0
|
500 example->docnum=docnum;
|
wolffd@0
|
501 example->queryid=queryid;
|
wolffd@0
|
502 example->slackid=slackid;
|
wolffd@0
|
503 example->costfactor=costfactor;
|
wolffd@0
|
504 example->fvec=fvec;
|
wolffd@0
|
505 return(example);
|
wolffd@0
|
506 }
|
wolffd@0
|
507
|
wolffd@0
|
508 void free_example(DOC *example, long deep)
|
wolffd@0
|
509 {
|
wolffd@0
|
510 if(example) {
|
wolffd@0
|
511 if(deep) {
|
wolffd@0
|
512 if(example->fvec)
|
wolffd@0
|
513 free_svector(example->fvec);
|
wolffd@0
|
514 }
|
wolffd@0
|
515 free(example);
|
wolffd@0
|
516 }
|
wolffd@0
|
517 }
|
wolffd@0
|
518
|
wolffd@0
|
519 void write_model(char *modelfile, MODEL *model)
|
wolffd@0
|
520 {
|
wolffd@0
|
521 FILE *modelfl;
|
wolffd@0
|
522 long j,i,sv_num;
|
wolffd@0
|
523 SVECTOR *v;
|
wolffd@0
|
524
|
wolffd@0
|
525 if(verbosity>=1) {
|
wolffd@0
|
526 printf("Writing model file..."); fflush(stdout);
|
wolffd@0
|
527 }
|
wolffd@0
|
528 if ((modelfl = fopen (modelfile, "w")) == NULL)
|
wolffd@0
|
529 { perror (modelfile); exit (1); }
|
wolffd@0
|
530 fprintf(modelfl,"SVM-light Version %s\n",VERSION);
|
wolffd@0
|
531 fprintf(modelfl,"%ld # kernel type\n",
|
wolffd@0
|
532 model->kernel_parm.kernel_type);
|
wolffd@0
|
533 fprintf(modelfl,"%ld # kernel parameter -d \n",
|
wolffd@0
|
534 model->kernel_parm.poly_degree);
|
wolffd@0
|
535 fprintf(modelfl,"%.8g # kernel parameter -g \n",
|
wolffd@0
|
536 model->kernel_parm.rbf_gamma);
|
wolffd@0
|
537 fprintf(modelfl,"%.8g # kernel parameter -s \n",
|
wolffd@0
|
538 model->kernel_parm.coef_lin);
|
wolffd@0
|
539 fprintf(modelfl,"%.8g # kernel parameter -r \n",
|
wolffd@0
|
540 model->kernel_parm.coef_const);
|
wolffd@0
|
541 fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom);
|
wolffd@0
|
542 fprintf(modelfl,"%ld # highest feature index \n",model->totwords);
|
wolffd@0
|
543 fprintf(modelfl,"%ld # number of training documents \n",model->totdoc);
|
wolffd@0
|
544
|
wolffd@0
|
545 sv_num=1;
|
wolffd@0
|
546 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
547 for(v=model->supvec[i]->fvec;v;v=v->next)
|
wolffd@0
|
548 sv_num++;
|
wolffd@0
|
549 }
|
wolffd@0
|
550 fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num);
|
wolffd@0
|
551 fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);
|
wolffd@0
|
552
|
wolffd@0
|
553 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
554 for(v=model->supvec[i]->fvec;v;v=v->next) {
|
wolffd@0
|
555 fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor);
|
wolffd@0
|
556 for (j=0; (v->words[j]).wnum; j++) {
|
wolffd@0
|
557 fprintf(modelfl,"%ld:%.8g ",
|
wolffd@0
|
558 (long)(v->words[j]).wnum,
|
wolffd@0
|
559 (double)(v->words[j]).weight);
|
wolffd@0
|
560 }
|
wolffd@0
|
561 fprintf(modelfl,"#%s\n",v->userdefined);
|
wolffd@0
|
562 /* NOTE: this could be made more efficient by summing the
|
wolffd@0
|
563 alpha's of identical vectors before writing them to the
|
wolffd@0
|
564 file. */
|
wolffd@0
|
565 }
|
wolffd@0
|
566 }
|
wolffd@0
|
567 fclose(modelfl);
|
wolffd@0
|
568 if(verbosity>=1) {
|
wolffd@0
|
569 printf("done\n");
|
wolffd@0
|
570 }
|
wolffd@0
|
571 }
|
wolffd@0
|
572
|
wolffd@0
|
573
|
wolffd@0
|
574 MODEL *read_model(char *modelfile)
|
wolffd@0
|
575 {
|
wolffd@0
|
576 FILE *modelfl;
|
wolffd@0
|
577 long i,queryid,slackid;
|
wolffd@0
|
578 double costfactor;
|
wolffd@0
|
579 long max_sv,max_words,ll,wpos;
|
wolffd@0
|
580 char *line,*comment;
|
wolffd@0
|
581 WORD *words;
|
wolffd@0
|
582 char version_buffer[100];
|
wolffd@0
|
583 MODEL *model;
|
wolffd@0
|
584
|
wolffd@0
|
585 if(verbosity>=1) {
|
wolffd@0
|
586 printf("Reading model..."); fflush(stdout);
|
wolffd@0
|
587 }
|
wolffd@0
|
588
|
wolffd@0
|
589 nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
|
wolffd@0
|
590 max_words+=2;
|
wolffd@0
|
591 ll+=2;
|
wolffd@0
|
592
|
wolffd@0
|
593 words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
|
wolffd@0
|
594 line = (char *)my_malloc(sizeof(char)*ll);
|
wolffd@0
|
595 model = (MODEL *)my_malloc(sizeof(MODEL));
|
wolffd@0
|
596
|
wolffd@0
|
597 if ((modelfl = fopen (modelfile, "r")) == NULL)
|
wolffd@0
|
598 { perror (modelfile); exit (1); }
|
wolffd@0
|
599
|
wolffd@0
|
600 fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
|
wolffd@0
|
601 if(strcmp(version_buffer,VERSION)) {
|
wolffd@0
|
602 perror ("Version of model-file does not match version of svm_classify!");
|
wolffd@0
|
603 exit (1);
|
wolffd@0
|
604 }
|
wolffd@0
|
605 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
|
wolffd@0
|
606 fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
|
wolffd@0
|
607 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
|
wolffd@0
|
608 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
|
wolffd@0
|
609 fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
|
wolffd@0
|
610 fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
|
wolffd@0
|
611
|
wolffd@0
|
612 fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
|
wolffd@0
|
613 fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
|
wolffd@0
|
614 fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
|
wolffd@0
|
615 fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
|
wolffd@0
|
616
|
wolffd@0
|
617 model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
|
wolffd@0
|
618 model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
|
wolffd@0
|
619 model->index=NULL;
|
wolffd@0
|
620 model->lin_weights=NULL;
|
wolffd@0
|
621
|
wolffd@0
|
622 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
623 fgets(line,(int)ll,modelfl);
|
wolffd@0
|
624 if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
|
wolffd@0
|
625 &costfactor,&wpos,max_words,&comment)) {
|
wolffd@0
|
626 printf("\nParsing error while reading model file in SV %ld!\n%s",
|
wolffd@0
|
627 i,line);
|
wolffd@0
|
628 exit(1);
|
wolffd@0
|
629 }
|
wolffd@0
|
630 model->supvec[i] = create_example(-1,
|
wolffd@0
|
631 0,0,
|
wolffd@0
|
632 0.0,
|
wolffd@0
|
633 create_svector(words,comment,1.0));
|
wolffd@0
|
634 }
|
wolffd@0
|
635 fclose(modelfl);
|
wolffd@0
|
636 free(line);
|
wolffd@0
|
637 free(words);
|
wolffd@0
|
638 if(verbosity>=1) {
|
wolffd@0
|
639 fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
|
wolffd@0
|
640 }
|
wolffd@0
|
641 return(model);
|
wolffd@0
|
642 }
|
wolffd@0
|
643
|
wolffd@0
|
644 MODEL *copy_model(MODEL *model)
|
wolffd@0
|
645 {
|
wolffd@0
|
646 MODEL *newmodel;
|
wolffd@0
|
647 long i;
|
wolffd@0
|
648
|
wolffd@0
|
649 newmodel=(MODEL *)my_malloc(sizeof(MODEL));
|
wolffd@0
|
650 (*newmodel)=(*model);
|
wolffd@0
|
651 newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
|
wolffd@0
|
652 newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
|
wolffd@0
|
653 newmodel->index = NULL; /* index is not copied */
|
wolffd@0
|
654 newmodel->supvec[0] = NULL;
|
wolffd@0
|
655 newmodel->alpha[0] = 0;
|
wolffd@0
|
656 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
657 newmodel->alpha[i]=model->alpha[i];
|
wolffd@0
|
658 newmodel->supvec[i]=create_example(model->supvec[i]->docnum,
|
wolffd@0
|
659 model->supvec[i]->queryid,0,
|
wolffd@0
|
660 model->supvec[i]->costfactor,
|
wolffd@0
|
661 copy_svector(model->supvec[i]->fvec));
|
wolffd@0
|
662 }
|
wolffd@0
|
663 if(model->lin_weights) {
|
wolffd@0
|
664 newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
|
wolffd@0
|
665 for(i=0;i<model->totwords+1;i++)
|
wolffd@0
|
666 newmodel->lin_weights[i]=model->lin_weights[i];
|
wolffd@0
|
667 }
|
wolffd@0
|
668 return(newmodel);
|
wolffd@0
|
669 }
|
wolffd@0
|
670
|
wolffd@0
|
671 void free_model(MODEL *model, int deep)
|
wolffd@0
|
672 {
|
wolffd@0
|
673 long i;
|
wolffd@0
|
674
|
wolffd@0
|
675 if(model->supvec) {
|
wolffd@0
|
676 if(deep) {
|
wolffd@0
|
677 for(i=1;i<model->sv_num;i++) {
|
wolffd@0
|
678 free_example(model->supvec[i],1);
|
wolffd@0
|
679 }
|
wolffd@0
|
680 }
|
wolffd@0
|
681 free(model->supvec);
|
wolffd@0
|
682 }
|
wolffd@0
|
683 if(model->alpha) free(model->alpha);
|
wolffd@0
|
684 if(model->index) free(model->index);
|
wolffd@0
|
685 if(model->lin_weights) free(model->lin_weights);
|
wolffd@0
|
686 free(model);
|
wolffd@0
|
687 }
|
wolffd@0
|
688
|
wolffd@0
|
689
|
wolffd@0
|
690 void read_documents(char *docfile, DOC ***docs, double **label,
|
wolffd@0
|
691 long int *totwords, long int *totdoc)
|
wolffd@0
|
692 {
|
wolffd@0
|
693 char *line,*comment;
|
wolffd@0
|
694 WORD *words;
|
wolffd@0
|
695 long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
|
wolffd@0
|
696 long max_words_doc, ll;
|
wolffd@0
|
697 double doc_label,costfactor;
|
wolffd@0
|
698 FILE *docfl;
|
wolffd@0
|
699
|
wolffd@0
|
700 if(verbosity>=1) {
|
wolffd@0
|
701 printf("Scanning examples..."); fflush(stdout);
|
wolffd@0
|
702 }
|
wolffd@0
|
703 nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
|
wolffd@0
|
704 max_words_doc+=2;
|
wolffd@0
|
705 ll+=2;
|
wolffd@0
|
706 max_docs+=2;
|
wolffd@0
|
707 if(verbosity>=1) {
|
wolffd@0
|
708 printf("done\n"); fflush(stdout);
|
wolffd@0
|
709 }
|
wolffd@0
|
710
|
wolffd@0
|
711 (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */
|
wolffd@0
|
712 (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
|
wolffd@0
|
713 line = (char *)my_malloc(sizeof(char)*ll);
|
wolffd@0
|
714
|
wolffd@0
|
715 if ((docfl = fopen (docfile, "r")) == NULL)
|
wolffd@0
|
716 { perror (docfile); exit (1); }
|
wolffd@0
|
717
|
wolffd@0
|
718 words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
|
wolffd@0
|
719 if(verbosity>=1) {
|
wolffd@0
|
720 printf("Reading examples into memory..."); fflush(stdout);
|
wolffd@0
|
721 }
|
wolffd@0
|
722 dnum=0;
|
wolffd@0
|
723 (*totwords)=0;
|
wolffd@0
|
724 while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
|
wolffd@0
|
725 if(line[0] == '#') continue; /* line contains comments */
|
wolffd@0
|
726 if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
|
wolffd@0
|
727 &wpos,max_words_doc,&comment)) {
|
wolffd@0
|
728 printf("\nParsing error in line %ld!\n%s",dnum,line);
|
wolffd@0
|
729 exit(1);
|
wolffd@0
|
730 }
|
wolffd@0
|
731 (*label)[dnum]=doc_label;
|
wolffd@0
|
732 /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
|
wolffd@0
|
733 if(doc_label > 0) dpos++;
|
wolffd@0
|
734 if (doc_label < 0) dneg++;
|
wolffd@0
|
735 if (doc_label == 0) dunlab++;
|
wolffd@0
|
736 if((wpos>1) && ((words[wpos-2]).wnum>(*totwords)))
|
wolffd@0
|
737 (*totwords)=(words[wpos-2]).wnum;
|
wolffd@0
|
738 if((*totwords) > MAXFEATNUM) {
|
wolffd@0
|
739 printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
|
wolffd@0
|
740 printf("LINE: %s\n",line);
|
wolffd@0
|
741 exit(1);
|
wolffd@0
|
742 }
|
wolffd@0
|
743 (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
|
wolffd@0
|
744 create_svector(words,comment,1.0));
|
wolffd@0
|
745 /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */
|
wolffd@0
|
746 dnum++;
|
wolffd@0
|
747 if(verbosity>=1) {
|
wolffd@0
|
748 if((dnum % 100) == 0) {
|
wolffd@0
|
749 printf("%ld..",dnum); fflush(stdout);
|
wolffd@0
|
750 }
|
wolffd@0
|
751 }
|
wolffd@0
|
752 }
|
wolffd@0
|
753
|
wolffd@0
|
754 fclose(docfl);
|
wolffd@0
|
755 free(line);
|
wolffd@0
|
756 free(words);
|
wolffd@0
|
757 if(verbosity>=1) {
|
wolffd@0
|
758 fprintf(stdout, "OK. (%ld examples read)\n", dnum);
|
wolffd@0
|
759 }
|
wolffd@0
|
760 (*totdoc)=dnum;
|
wolffd@0
|
761 }
|
wolffd@0
|
762
|
wolffd@0
|
763 int parse_document(char *line, WORD *words, double *label,
|
wolffd@0
|
764 long *queryid, long *slackid, double *costfactor,
|
wolffd@0
|
765 long int *numwords, long int max_words_doc,
|
wolffd@0
|
766 char **comment)
|
wolffd@0
|
767 {
|
wolffd@0
|
768 register long wpos,pos;
|
wolffd@0
|
769 long wnum;
|
wolffd@0
|
770 double weight;
|
wolffd@0
|
771 int numread;
|
wolffd@0
|
772 char featurepair[1000],junk[1000];
|
wolffd@0
|
773
|
wolffd@0
|
774 (*queryid)=0;
|
wolffd@0
|
775 (*slackid)=0;
|
wolffd@0
|
776 (*costfactor)=1;
|
wolffd@0
|
777
|
wolffd@0
|
778 pos=0;
|
wolffd@0
|
779 (*comment)=NULL;
|
wolffd@0
|
780 while(line[pos] ) { /* cut off comments */
|
wolffd@0
|
781 if((line[pos] == '#') && (!(*comment))) {
|
wolffd@0
|
782 line[pos]=0;
|
wolffd@0
|
783 (*comment)=&(line[pos+1]);
|
wolffd@0
|
784 }
|
wolffd@0
|
785 if(line[pos] == '\n') { /* strip the CR */
|
wolffd@0
|
786 line[pos]=0;
|
wolffd@0
|
787 }
|
wolffd@0
|
788 pos++;
|
wolffd@0
|
789 }
|
wolffd@0
|
790 if(!(*comment)) (*comment)=&(line[pos]);
|
wolffd@0
|
791 /* printf("Comment: '%s'\n",(*comment)); */
|
wolffd@0
|
792
|
wolffd@0
|
793 wpos=0;
|
wolffd@0
|
794 /* check, that line starts with target value or zero, but not with
|
wolffd@0
|
795 feature pair */
|
wolffd@0
|
796 if(sscanf(line,"%s",featurepair) == EOF) return(0);
|
wolffd@0
|
797 pos=0;
|
wolffd@0
|
798 while((featurepair[pos] != ':') && featurepair[pos]) pos++;
|
wolffd@0
|
799 if(featurepair[pos] == ':') {
|
wolffd@0
|
800 perror ("Line must start with label or 0!!!\n");
|
wolffd@0
|
801 printf("LINE: %s\n",line);
|
wolffd@0
|
802 exit (1);
|
wolffd@0
|
803 }
|
wolffd@0
|
804 /* read the target value */
|
wolffd@0
|
805 if(sscanf(line,"%lf",label) == EOF) return(0);
|
wolffd@0
|
806 pos=0;
|
wolffd@0
|
807 while(space_or_null((int)line[pos])) pos++;
|
wolffd@0
|
808 while((!space_or_null((int)line[pos])) && line[pos]) pos++;
|
wolffd@0
|
809 while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
|
wolffd@0
|
810 (numread > 0) &&
|
wolffd@0
|
811 (wpos<max_words_doc)) {
|
wolffd@0
|
812 /* printf("%s\n",featurepair); */
|
wolffd@0
|
813 while(space_or_null((int)line[pos])) pos++;
|
wolffd@0
|
814 while((!space_or_null((int)line[pos])) && line[pos]) pos++;
|
wolffd@0
|
815 if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
|
wolffd@0
|
816 /* it is the query id */
|
wolffd@0
|
817 (*queryid)=(long)wnum;
|
wolffd@0
|
818 }
|
wolffd@0
|
819 else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
|
wolffd@0
|
820 /* it is the slack id */
|
wolffd@0
|
821 if(wnum > 0)
|
wolffd@0
|
822 (*slackid)=(long)wnum;
|
wolffd@0
|
823 else {
|
wolffd@0
|
824 perror ("Slack-id must be greater or equal to 1!!!\n");
|
wolffd@0
|
825 printf("LINE: %s\n",line);
|
wolffd@0
|
826 exit (1);
|
wolffd@0
|
827 }
|
wolffd@0
|
828 }
|
wolffd@0
|
829 else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
|
wolffd@0
|
830 /* it is the example-dependent cost factor */
|
wolffd@0
|
831 (*costfactor)=(double)weight;
|
wolffd@0
|
832 }
|
wolffd@0
|
833 else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
|
wolffd@0
|
834 /* it is a regular feature */
|
wolffd@0
|
835 if(wnum<=0) {
|
wolffd@0
|
836 perror ("Feature numbers must be larger or equal to 1!!!\n");
|
wolffd@0
|
837 printf("LINE: %s\n",line);
|
wolffd@0
|
838 exit (1);
|
wolffd@0
|
839 }
|
wolffd@0
|
840 if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
|
wolffd@0
|
841 perror ("Features must be in increasing order!!!\n");
|
wolffd@0
|
842 printf("LINE: %s\n",line);
|
wolffd@0
|
843 exit (1);
|
wolffd@0
|
844 }
|
wolffd@0
|
845 (words[wpos]).wnum=wnum;
|
wolffd@0
|
846 (words[wpos]).weight=(FVAL)weight;
|
wolffd@0
|
847 wpos++;
|
wolffd@0
|
848 }
|
wolffd@0
|
849 else {
|
wolffd@0
|
850 perror ("Cannot parse feature/value pair!!!\n");
|
wolffd@0
|
851 printf("'%s' in LINE: %s\n",featurepair,line);
|
wolffd@0
|
852 exit (1);
|
wolffd@0
|
853 }
|
wolffd@0
|
854 }
|
wolffd@0
|
855 (words[wpos]).wnum=0;
|
wolffd@0
|
856 (*numwords)=wpos+1;
|
wolffd@0
|
857 return(1);
|
wolffd@0
|
858 }
|
wolffd@0
|
859
|
wolffd@0
|
860 double *read_alphas(char *alphafile,long totdoc)
|
wolffd@0
|
861 /* reads the alpha vector from a file as written by the
|
wolffd@0
|
862 write_alphas function */
|
wolffd@0
|
863 {
|
wolffd@0
|
864 FILE *fl;
|
wolffd@0
|
865 double *alpha;
|
wolffd@0
|
866 long dnum;
|
wolffd@0
|
867
|
wolffd@0
|
868 if ((fl = fopen (alphafile, "r")) == NULL)
|
wolffd@0
|
869 { perror (alphafile); exit (1); }
|
wolffd@0
|
870
|
wolffd@0
|
871 alpha = (double *)my_malloc(sizeof(double)*totdoc);
|
wolffd@0
|
872 if(verbosity>=1) {
|
wolffd@0
|
873 printf("Reading alphas..."); fflush(stdout);
|
wolffd@0
|
874 }
|
wolffd@0
|
875 dnum=0;
|
wolffd@0
|
876 while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
|
wolffd@0
|
877 dnum++;
|
wolffd@0
|
878 }
|
wolffd@0
|
879 if(dnum != totdoc)
|
wolffd@0
|
880 { perror ("\nNot enough values in alpha file!"); exit (1); }
|
wolffd@0
|
881 fclose(fl);
|
wolffd@0
|
882
|
wolffd@0
|
883 if(verbosity>=1) {
|
wolffd@0
|
884 printf("done\n"); fflush(stdout);
|
wolffd@0
|
885 }
|
wolffd@0
|
886
|
wolffd@0
|
887 return(alpha);
|
wolffd@0
|
888 }
|
wolffd@0
|
889
|
wolffd@0
|
void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
     /* Scans through file once and reports
          *nol  one plus the number of newline characters found,
          *wol  the largest number of characters on a single line for
                which space_or_null() is true (the newline included),
          *ll   the length of the longest line (the newline included).
        A last line that is not terminated by a newline is taken into
        account for *wol and *ll as well. The program exits with
        status 1 if the file cannot be opened. */
{
  FILE *fl;
  int ic;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    current_length++;
    /* pass the value returned by getc unchanged: it is already in
       the unsigned char range, whereas narrowing it to char first
       can produce a negative value on signed-char platforms */
    if(space_or_null(ic)) {
      current_wol++;
    }
    if(ic == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
        (*ll)=current_length;
      }
      if(current_wol>(*wol)) {
        (*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* Fold in a final line that ended at EOF without a newline; both
     counters are zero here if the file ended with a newline. */
  if(current_length>(*ll)) {
    (*ll)=current_length;
  }
  if(current_wol>(*wol)) {
    (*wol)=current_wol;
  }
  fclose(fl);
}
|
wolffd@0
|
926
|
wolffd@0
|
long minl(long int a, long int b)
     /* returns the smaller of the two arguments */
{
  long smaller;

  smaller = (a<b) ? a : b;
  return smaller;
}
|
wolffd@0
|
934
|
wolffd@0
|
long maxl(long int a, long int b)
     /* returns the larger of the two arguments */
{
  long larger;

  larger = (a>b) ? a : b;
  return larger;
}
|
wolffd@0
|
942
|
wolffd@0
|
long get_runtime(void)
     /* returns the value of clock() converted to hundredths of a
        second and truncated to a long */
{
  const double ticks = (double)clock();
  const double ticks_per_second = (double)CLOCKS_PER_SEC;

  return (long)(ticks*100.0/ticks_per_second);
}
|
wolffd@0
|
949
|
wolffd@0
|
950
|
wolffd@0
|
/* Compatibility shim for Microsoft compilers: supplies isnan() in
   terms of the runtime's _isnan(). It is only compiled when isnan is
   not already available as a macro, because a macro of that name
   would be expanded inside the definition below and break the
   build. */
# if defined(_MSC_VER) && !defined(isnan)

int isnan(double a)
     /* returns nonzero if a is not a number */
{
  return(_isnan(a));
}

# endif
|
wolffd@0
|
959
|
wolffd@0
|
int space_or_null(int c) {
  /* Returns 1 if c is the NUL character, a nonzero value if c is a
     white-space character according to isspace(), and 0 otherwise. */
  if (c==0)
    return 1;
  /* isspace() is only defined for EOF and for values representable as
     unsigned char. Callers pass plain char values, which may be
     negative, so convert before the lookup. */
  return isspace((unsigned char)c);
}
|
wolffd@0
|
965
|
wolffd@0
|
void *my_malloc(size_t size)
     /* Allocates size bytes with malloc and returns the pointer. On
        failure it prints a message via perror and exits with status
        1, so the result is never NULL. */
{
  void *ptr;

  /* malloc(0) is allowed to return NULL without being out of memory.
     Ask for one byte instead so that an empty request is not reported
     as a fatal error. */
  if(size == 0) {
    size=1;
  }
  ptr=malloc(size);
  if(!ptr) {
    perror ("Out of memory!\n");
    exit (1);
  }
  return(ptr);
}
|
wolffd@0
|
976
|
wolffd@0
|
void copyright_notice(void)
     /* writes the copyright and licence notice to stdout */
{
  static const char *const notice[] = {
    "\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n",
    "This software is available for non-commercial use only. It must not\n",
    "be modified and distributed without prior permission of the author.\n",
    "The author is not responsible for implications from the use of this\n",
    "software.\n\n"
  };
  const size_t count = sizeof(notice)/sizeof(notice[0]);
  size_t k;

  /* none of the strings contains a conversion specification, so
     writing them verbatim produces the same output as printf */
  for(k=0;k<count;k++) {
    fputs(notice[k],stdout);
  }
}
|