/************************************************************************/
/*                                                                      */
/*   svm_common.c                                                       */
/*                                                                      */
/*   Definitions and functions used in both svm_learn and svm_classify.*/
/*                                                                      */
/*   Author: Thorsten Joachims                                          */
/*   Date: 02.07.04                                                     */
/*                                                                      */
/*   Copyright (c) 2004  Thorsten Joachims - All rights reserved        */
/*                                                                      */
/*   This software is available for non-commercial use only. It must   */
/*   not be modified and distributed without prior permission of the   */
/*   author. The author is not responsible for implications from the   */
/*   use of this software.                                              */
/*                                                                      */
/************************************************************************/

# include "ctype.h"
# include "svm_common.h"
# include "kernel.h"           /* this contains a user supplied kernel */

long   verbosity;              /* verbosity level (0-4) */
long   kernel_cache_statistic;

double classify_example(MODEL *model, DOC *ex)
     /* classifies one example */
{
  register long i;
  register double dist;

  if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights))
    return(classify_example_linear(model,ex));

  dist=0;
  for(i=1;i<model->sv_num;i++) {
    dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
  }
  return(dist-model->b);
}

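/* Minimal usage sketch (illustration only, not called by svm_learn or    */
/* svm_classify; compiled only if the placeholder guard macro             */
/* SVM_COMMON_EXAMPLES is defined): load a model, enable the fast linear  */
/* path, and classify one hand-built sparse example. The file name        */
/* "svm_model" is a placeholder.                                          */
# ifdef SVM_COMMON_EXAMPLES
static void example_classify_sketch(void)
{
  WORD   words[3];
  DOC    *doc;
  MODEL  *model;
  double score;

  model=read_model("svm_model");              /* placeholder model file */
  if(model->kernel_parm.kernel_type == LINEAR)
    add_weight_vector_to_linear_model(model); /* enables the linear shortcut
                                                 in classify_example() */

  words[0].wnum=1; words[0].weight=1.0;       /* sparse vector "1:1 3:0.5";
                                                 feature numbers must not
                                                 exceed model->totwords for
                                                 the linear special case  */
  words[1].wnum=3; words[1].weight=0.5;
  words[2].wnum=0;                            /* wnum==0 terminates the list */
  doc=create_example(0,0,0,1.0,create_svector(words,"",1.0));

  score=classify_example(model,doc);          /* sign of score = predicted class */
  printf("score = %f\n",score);

  free_example(doc,1);                        /* deep free, including fvec */
  free_model(model,1);
}
# endif
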
double classify_example_linear(MODEL *model, DOC *ex)
     /* classifies example for linear kernel */

     /* important: the model must have the linear weight vector computed */
     /* use: add_weight_vector_to_linear_model(&model); */

     /* important: the feature numbers in the example to classify must */
     /*            not be larger than the weight vector!               */
{
  double sum=0;
  SVECTOR *f;

  for(f=ex->fvec;f;f=f->next)
    sum+=f->factor*sprod_ns(model->lin_weights,f);
  return(sum-model->b);
}


CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b)
     /* calculate the kernel function */
{
  double sum=0;
  SVECTOR *fa,*fb;

  /* in case the constraints are sums of feature vectors as represented
     as a list of SVECTOR's with their coefficient factor in the sum,
     take the kernel between all pairs */
  for(fa=a->fvec;fa;fa=fa->next) {
    for(fb=b->fvec;fb;fb=fb->next) {
      if(fa->kernel_id == fb->kernel_id)
        sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb);
    }
  }
  return(sum);
}

CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b)
     /* calculate the kernel function between two vectors */
{
  kernel_cache_statistic++;
  switch(kernel_parm->kernel_type) {
    case 0: /* linear */
            return((CFLOAT)sprod_ss(a,b));
    case 1: /* polynomial */
            return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree));
    case 2: /* radial basis function */
            return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
    case 3: /* sigmoid neural net */
            return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const));
    case 4: /* custom-kernel supplied in file kernel.h */
            return((CFLOAT)custom_kernel(kernel_parm,a,b));
    default: printf("Error: Unknown kernel function\n"); exit(1);
  }
}


SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
{
  SVECTOR *vec;
  long    fnum,i;

  fnum=0;
  while(words[fnum].wnum) {
    fnum++;
  }
  fnum++;
  vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
  vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum));
  for(i=0;i<fnum;i++) {
    vec->words[i]=words[i];
  }
  vec->twonorm_sq=sprod_ss(vec,vec);

  fnum=0;
  while(userdefined[fnum]) {
    fnum++;
  }
  fnum++;
  vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum));
  for(i=0;i<fnum;i++) {
    vec->userdefined[i]=userdefined[i];
  }
  vec->kernel_id=0;
  vec->next=NULL;
  vec->factor=factor;
  return(vec);
}

SVECTOR *copy_svector(SVECTOR *vec)
{
  SVECTOR *newvec=NULL;
  if(vec) {
    newvec=create_svector(vec->words,vec->userdefined,vec->factor);
    newvec->next=copy_svector(vec->next);
  }
  return(newvec);
}

void free_svector(SVECTOR *vec)
{
  if(vec) {
    free(vec->words);
    if(vec->userdefined)
      free(vec->userdefined);
    free_svector(vec->next);
    free(vec);
  }
}

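/* Illustrative sketch (not part of the original code; compiled only if   */
/* SVM_COMMON_EXAMPLES is defined): a DOC whose fvec is a *list* of       */
/* SVECTORs represents a weighted sum of feature vectors. kernel() then   */
/* returns the sum of factor_a*factor_b*single_kernel() over all pairs    */
/* with matching kernel_id, as the comment in kernel() describes.         */
# ifdef SVM_COMMON_EXAMPLES
static void example_svector_list_sketch(KERNEL_PARM *kernel_parm)
{
  WORD  w1[2],w2[2];
  DOC   *a,*b;

  w1[0].wnum=1; w1[0].weight=2.0; w1[1].wnum=0;
  w2[0].wnum=2; w2[0].weight=1.0; w2[1].wnum=0;

  /* a stands for 0.5*x1 + 0.5*x2, encoded as a two-element SVECTOR list */
  a=create_example(0,0,0,1.0,create_svector(w1,"",0.5));
  append_svector_list(a->fvec,create_svector(w2,"",0.5));
  /* b is the single vector x1 */
  b=create_example(1,0,0,1.0,create_svector(w1,"",1.0));

  /* kernel(a,b) = 0.5*K(x1,x1) + 0.5*K(x2,x1), whatever kernel_parm selects */
  printf("K(a,b) = %f\n",(double)kernel(kernel_parm,a,b));

  free_example(a,1);
  free_example(b,1);
}
# endif
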
double sprod_ss(SVECTOR *a, SVECTOR *b)
     /* compute the inner product of two sparse vectors */
{
  register CFLOAT sum=0;
  register WORD *ai,*bj;
  ai=a->words;
  bj=b->words;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      ai++;
    }
    else {
      sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight);
      ai++;
      bj++;
    }
  }
  return((double)sum);
}

SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b)
     /* compute the difference a-b of two sparse vectors */
     /* Note: SVECTOR lists are not followed, but only the first
        SVECTOR is used */
{
  SVECTOR *vec;
  register WORD *sum,*sumi;
  register WORD *ai,*bj;
  long veclength;

  ai=a->words;
  bj=b->words;
  veclength=0;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      veclength++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      veclength++;
      ai++;
    }
    else {
      veclength++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {
    veclength++;
    bj++;
  }
  while (ai->wnum) {
    veclength++;
    ai++;
  }
  veclength++;

  sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
  sumi=sum;
  ai=a->words;
  bj=b->words;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      (*sumi)=(*bj);
      sumi->weight*=(-1);
      sumi++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      (*sumi)=(*ai);
      sumi++;
      ai++;
    }
    else {
      (*sumi)=(*ai);
      sumi->weight-=bj->weight;
      if(sumi->weight != 0)
        sumi++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {
    (*sumi)=(*bj);
    sumi->weight*=(-1);
    sumi++;
    bj++;
  }
  while (ai->wnum) {
    (*sumi)=(*ai);
    sumi++;
    ai++;
  }
  sumi->wnum=0;

  vec=create_svector(sum,"",1.0);
  free(sum);

  return(vec);
}

SVECTOR* add_ss(SVECTOR *a, SVECTOR *b)
     /* compute the sum a+b of two sparse vectors */
     /* Note: SVECTOR lists are not followed, but only the first
        SVECTOR is used */
{
  SVECTOR *vec;
  register WORD *sum,*sumi;
  register WORD *ai,*bj;
  long veclength;

  ai=a->words;
  bj=b->words;
  veclength=0;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      veclength++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      veclength++;
      ai++;
    }
    else {
      veclength++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {
    veclength++;
    bj++;
  }
  while (ai->wnum) {
    veclength++;
    ai++;
  }
  veclength++;

  /*** is veclength=lengthSequence(a)+lengthSequence(b)? ***/

  sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
  sumi=sum;
  ai=a->words;
  bj=b->words;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      (*sumi)=(*bj);
      sumi++;
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      (*sumi)=(*ai);
      sumi++;
      ai++;
    }
    else {
      (*sumi)=(*ai);
      sumi->weight+=bj->weight;
      if(sumi->weight != 0)
        sumi++;
      ai++;
      bj++;
    }
  }
  while (bj->wnum) {
    (*sumi)=(*bj);
    sumi++;
    bj++;
  }
  while (ai->wnum) {
    (*sumi)=(*ai);
    sumi++;
    ai++;
  }
  sumi->wnum=0;

  vec=create_svector(sum,"",1.0);
  free(sum);

  return(vec);
}

SVECTOR* add_list_ss(SVECTOR *a)
     /* computes the linear combination of the SVECTOR list weighted
        by the factor of each SVECTOR */
{
  SVECTOR *scaled,*oldsum,*sum,*f;
  WORD    empty[2];

  if(a){
    sum=smult_s(a,a->factor);
    for(f=a->next;f;f=f->next) {
      scaled=smult_s(f,f->factor);
      oldsum=sum;
      sum=add_ss(sum,scaled);
      free_svector(oldsum);
      free_svector(scaled);
    }
    sum->factor=1.0;
  }
  else {
    empty[0].wnum=0;
    sum=create_svector(empty,"",1.0);
  }
  return(sum);
}

void append_svector_list(SVECTOR *a, SVECTOR *b)
     /* appends SVECTOR b to the end of SVECTOR a. */
{
  SVECTOR *f;

  for(f=a;f->next;f=f->next);  /* find end of first vector list */
  f->next=b;                   /* append the two vector lists */
}

SVECTOR* smult_s(SVECTOR *a, double factor)
     /* scale sparse vector a by factor */
{
  SVECTOR *vec;
  register WORD *sum,*sumi;
  register WORD *ai;
  long veclength;

  ai=a->words;
  veclength=0;
  while (ai->wnum) {
    veclength++;
    ai++;
  }
  veclength++;

  sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
  sumi=sum;
  ai=a->words;
  while (ai->wnum) {
    (*sumi)=(*ai);
    sumi->weight*=factor;
    if(sumi->weight != 0)
      sumi++;
    ai++;
  }
  sumi->wnum=0;

  vec=create_svector(sum,a->userdefined,a->factor);
  free(sum);

  return(vec);
}

int featvec_eq(SVECTOR *a, SVECTOR *b)
     /* tests two sparse vectors for equality */
{
  register WORD *ai,*bj;
  ai=a->words;
  bj=b->words;
  while (ai->wnum && bj->wnum) {
    if(ai->wnum > bj->wnum) {
      if((CFLOAT)(bj->weight) != 0)
        return(0);
      bj++;
    }
    else if (ai->wnum < bj->wnum) {
      if((CFLOAT)(ai->weight) != 0)
        return(0);
      ai++;
    }
    else {
      if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight))
        return(0);
      ai++;
      bj++;
    }
  }
  return(1);
}

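/* Illustrative sketch (not part of the original code; compiled only if   */
/* SVM_COMMON_EXAMPLES is defined): combining the sparse-vector helpers   */
/* above. add_list_ss() flattens a factor-weighted SVECTOR list into a    */
/* single vector; sub_ss() drops entries that cancel to zero.             */
# ifdef SVM_COMMON_EXAMPLES
static void example_sparse_arithmetic_sketch(void)
{
  WORD    wa[3],wb[2];
  SVECTOR *a,*b,*diff,*flat;

  wa[0].wnum=1; wa[0].weight=1.0;
  wa[1].wnum=2; wa[1].weight=2.0;
  wa[2].wnum=0;
  wb[0].wnum=2; wb[0].weight=2.0;
  wb[1].wnum=0;

  a=create_svector(wa,"",1.0);
  b=create_svector(wb,"",1.0);

  diff=sub_ss(a,b);              /* a-b = (1:1); the cancelled 2:0 is dropped */
  printf("<a-b,a-b> = %f\n",sprod_ss(diff,diff));     /* prints 1.000000 */

  a->factor=2.0;                 /* build the list 2*a + 1*b ...            */
  append_svector_list(a,b);
  flat=add_list_ss(a);           /* ... and flatten it to (1:2, 2:6)        */
  printf("||2a+b||^2 = %f\n",sprod_ss(flat,flat));    /* prints 40.000000 */

  free_svector(a);               /* also frees b, which is linked into a */
  free_svector(diff);
  free_svector(flat);
}
# endif
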
double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm)
     /* compute length of weight vector */
{
  register long i,j;
  register double sum=0,alphai;
  register DOC *supveci;

  for(i=1;i<model->sv_num;i++) {
    alphai=model->alpha[i];
    supveci=model->supvec[i];
    for(j=1;j<model->sv_num;j++) {
      sum+=alphai*model->alpha[j]
           *kernel(kernel_parm,supveci,model->supvec[j]);
    }
  }
  return(sqrt(sum));
}

void clear_vector_n(double *vec, long int n)
{
  register long i;
  for(i=0;i<=n;i++) vec[i]=0;
}

void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
{
  register WORD *ai;
  ai=vec_s->words;
  while (ai->wnum) {
    vec_n[ai->wnum]+=(faktor*ai->weight);
    ai++;
  }
}

double sprod_ns(double *vec_n, SVECTOR *vec_s)
{
  register double sum=0;
  register WORD *ai;
  ai=vec_s->words;
  while (ai->wnum) {
    sum+=(vec_n[ai->wnum]*ai->weight);
    ai++;
  }
  return(sum);
}

void add_weight_vector_to_linear_model(MODEL *model)
     /* compute weight vector in linear case and add to model */
{
  long i;
  SVECTOR *f;

  model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
  clear_vector_n(model->lin_weights,model->totwords);
  for(i=1;i<model->sv_num;i++) {
    for(f=(model->supvec[i])->fvec;f;f=f->next)
      add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]);
  }
}


DOC *create_example(long docnum, long queryid, long slackid,
                    double costfactor, SVECTOR *fvec)
{
  DOC *example;
  example = (DOC *)my_malloc(sizeof(DOC));
  example->docnum=docnum;
  example->queryid=queryid;
  example->slackid=slackid;
  example->costfactor=costfactor;
  example->fvec=fvec;
  return(example);
}

void free_example(DOC *example, long deep)
{
  if(example) {
    if(deep) {
      if(example->fvec)
        free_svector(example->fvec);
    }
    free(example);
  }
}

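/* Illustrative sketch (not part of the original code; compiled only if   */
/* SVM_COMMON_EXAMPLES is defined): the dense-vector helpers above index  */
/* by feature number starting at 1, so a dense array needs totwords+1     */
/* entries, exactly as add_weight_vector_to_linear_model() allocates for  */
/* model->lin_weights.                                                    */
# ifdef SVM_COMMON_EXAMPLES
static void example_dense_sparse_sketch(void)
{
  long    totwords=5;            /* highest feature number in use */
  double  *dense;
  WORD    w[3];
  SVECTOR *v;

  w[0].wnum=2; w[0].weight=1.5;
  w[1].wnum=5; w[1].weight=-1.0;
  w[2].wnum=0;
  v=create_svector(w,"",1.0);

  dense=(double *)my_malloc(sizeof(double)*(totwords+1));
  clear_vector_n(dense,totwords);                /* zeroes entries 0..totwords */
  add_vector_ns(dense,v,2.0);                    /* dense += 2.0*v */
  printf("<dense,v> = %f\n",sprod_ns(dense,v));  /* prints 6.500000 */

  free(dense);
  free_svector(v);
}
# endif
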
\n",model->totdoc); wolffd@0: wolffd@0: sv_num=1; wolffd@0: for(i=1;isv_num;i++) { wolffd@0: for(v=model->supvec[i]->fvec;v;v=v->next) wolffd@0: sv_num++; wolffd@0: } wolffd@0: fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num); wolffd@0: fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b); wolffd@0: wolffd@0: for(i=1;isv_num;i++) { wolffd@0: for(v=model->supvec[i]->fvec;v;v=v->next) { wolffd@0: fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor); wolffd@0: for (j=0; (v->words[j]).wnum; j++) { wolffd@0: fprintf(modelfl,"%ld:%.8g ", wolffd@0: (long)(v->words[j]).wnum, wolffd@0: (double)(v->words[j]).weight); wolffd@0: } wolffd@0: fprintf(modelfl,"#%s\n",v->userdefined); wolffd@0: /* NOTE: this could be made more efficient by summing the wolffd@0: alpha's of identical vectors before writing them to the wolffd@0: file. */ wolffd@0: } wolffd@0: } wolffd@0: fclose(modelfl); wolffd@0: if(verbosity>=1) { wolffd@0: printf("done\n"); wolffd@0: } wolffd@0: } wolffd@0: wolffd@0: wolffd@0: MODEL *read_model(char *modelfile) wolffd@0: { wolffd@0: FILE *modelfl; wolffd@0: long i,queryid,slackid; wolffd@0: double costfactor; wolffd@0: long max_sv,max_words,ll,wpos; wolffd@0: char *line,*comment; wolffd@0: WORD *words; wolffd@0: char version_buffer[100]; wolffd@0: MODEL *model; wolffd@0: wolffd@0: if(verbosity>=1) { wolffd@0: printf("Reading model..."); fflush(stdout); wolffd@0: } wolffd@0: wolffd@0: nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */ wolffd@0: max_words+=2; wolffd@0: ll+=2; wolffd@0: wolffd@0: words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10)); wolffd@0: line = (char *)my_malloc(sizeof(char)*ll); wolffd@0: model = (MODEL *)my_malloc(sizeof(MODEL)); wolffd@0: wolffd@0: if ((modelfl = fopen (modelfile, "r")) == NULL) wolffd@0: { perror (modelfile); exit (1); } wolffd@0: wolffd@0: fscanf(modelfl,"SVM-light Version %s\n",version_buffer); wolffd@0: if(strcmp(version_buffer,VERSION)) { wolffd@0: perror ("Version of model-file does not match version of svm_classify!"); wolffd@0: exit (1); wolffd@0: } wolffd@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type); wolffd@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree); wolffd@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma); wolffd@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin); wolffd@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const); wolffd@0: fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom); wolffd@0: wolffd@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords); wolffd@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc); wolffd@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num); wolffd@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->b); wolffd@0: wolffd@0: model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); wolffd@0: model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); wolffd@0: model->index=NULL; wolffd@0: model->lin_weights=NULL; wolffd@0: wolffd@0: for(i=1;isv_num;i++) { wolffd@0: fgets(line,(int)ll,modelfl); wolffd@0: if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, wolffd@0: &costfactor,&wpos,max_words,&comment)) { wolffd@0: printf("\nParsing error while reading model file in SV %ld!\n%s", wolffd@0: i,line); wolffd@0: exit(1); wolffd@0: } wolffd@0: model->supvec[i] = create_example(-1, wolffd@0: 0,0, wolffd@0: 0.0, wolffd@0: create_svector(words,comment,1.0)); wolffd@0: } wolffd@0: fclose(modelfl); wolffd@0: free(line); 
MODEL *read_model(char *modelfile)
{
  FILE *modelfl;
  long i,queryid,slackid;
  double costfactor;
  long max_sv,max_words,ll,wpos;
  char *line,*comment;
  WORD *words;
  char version_buffer[100];
  MODEL *model;

  if(verbosity>=1) {
    printf("Reading model..."); fflush(stdout);
  }

  nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
  max_words+=2;
  ll+=2;

  words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
  line = (char *)my_malloc(sizeof(char)*ll);
  model = (MODEL *)my_malloc(sizeof(MODEL));

  if ((modelfl = fopen (modelfile, "r")) == NULL)
  { perror (modelfile); exit (1); }

  fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
  if(strcmp(version_buffer,VERSION)) {
    perror ("Version of model-file does not match version of svm_classify!");
    exit (1);
  }
  fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);
  fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
  fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);

  fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
  fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
  fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
  fscanf(modelfl,"%lf%*[^\n]\n", &model->b);

  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
  model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
  model->index=NULL;
  model->lin_weights=NULL;

  for(i=1;i<model->sv_num;i++) {
    fgets(line,(int)ll,modelfl);
    if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
                       &costfactor,&wpos,max_words,&comment)) {
      printf("\nParsing error while reading model file in SV %ld!\n%s",
             i,line);
      exit(1);
    }
    model->supvec[i] = create_example(-1,
                                      0,0,
                                      0.0,
                                      create_svector(words,comment,1.0));
  }
  fclose(modelfl);
  free(line);
  free(words);
  if(verbosity>=1) {
    fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
  }
  return(model);
}

MODEL *copy_model(MODEL *model)
{
  MODEL *newmodel;
  long  i;

  newmodel=(MODEL *)my_malloc(sizeof(MODEL));
  (*newmodel)=(*model);
  newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
  newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
  newmodel->index = NULL; /* index is not copied */
  newmodel->supvec[0] = NULL;
  newmodel->alpha[0] = 0;
  for(i=1;i<model->sv_num;i++) {
    newmodel->alpha[i]=model->alpha[i];
    newmodel->supvec[i]=create_example(model->supvec[i]->docnum,
                                       model->supvec[i]->queryid,0,
                                       model->supvec[i]->costfactor,
                                       copy_svector(model->supvec[i]->fvec));
  }
  if(model->lin_weights) {
    newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
    for(i=0;i<model->totwords+1;i++)
      newmodel->lin_weights[i]=model->lin_weights[i];
  }
  return(newmodel);
}

void free_model(MODEL *model, int deep)
{
  long i;

  if(model->supvec) {
    if(deep) {
      for(i=1;i<model->sv_num;i++) {
        free_example(model->supvec[i],1);
      }
    }
    free(model->supvec);
  }
  if(model->alpha) free(model->alpha);
  if(model->index) free(model->index);
  if(model->lin_weights) free(model->lin_weights);
  free(model);
}

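/* Illustrative sketch (not part of the original code; compiled only if   */
/* SVM_COMMON_EXAMPLES is defined): typical use of read_documents() to    */
/* load a training file. "train.dat" is a placeholder name; the docs and  */
/* label arrays are allocated inside the call and freed by the caller.    */
# ifdef SVM_COMMON_EXAMPLES
static void example_read_documents_sketch(void)
{
  DOC    **docs;
  double *label;
  long   totwords,totdoc,i;

  read_documents("train.dat",&docs,&label,&totwords,&totdoc);
  printf("%ld examples, highest feature number %ld\n",totdoc,totwords);
  for(i=0;i<totdoc;i++) {
    /* label[i] is the target value, docs[i]->fvec the sparse vector */
    free_example(docs[i],1);
  }
  free(docs);
  free(label);
}
# endif
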
void read_documents(char *docfile, DOC ***docs, double **label,
                    long int *totwords, long int *totdoc)
{
  char *line,*comment;
  WORD *words;
  long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
  long max_words_doc, ll;
  double doc_label,costfactor;
  FILE *docfl;

  if(verbosity>=1) {
    printf("Scanning examples..."); fflush(stdout);
  }
  nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
  max_words_doc+=2;
  ll+=2;
  max_docs+=2;
  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }

  (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs);     /* feature vectors */
  (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
  line = (char *)my_malloc(sizeof(char)*ll);

  if ((docfl = fopen (docfile, "r")) == NULL)
  { perror (docfile); exit (1); }

  words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
  if(verbosity>=1) {
    printf("Reading examples into memory..."); fflush(stdout);
  }
  dnum=0;
  (*totwords)=0;
  while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
    if(line[0] == '#') continue;  /* line contains comments */
    if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
                       &wpos,max_words_doc,&comment)) {
      printf("\nParsing error in line %ld!\n%s",dnum,line);
      exit(1);
    }
    (*label)[dnum]=doc_label;
    /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
    if(doc_label > 0) dpos++;
    if (doc_label < 0) dneg++;
    if (doc_label == 0) dunlab++;
    if((wpos>1) && ((words[wpos-2]).wnum>(*totwords)))
      (*totwords)=(words[wpos-2]).wnum;
    if((*totwords) > MAXFEATNUM) {
      printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
      printf("LINE: %s\n",line);
      exit(1);
    }
    (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
                                   create_svector(words,comment,1.0));
    /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */
    dnum++;
    if(verbosity>=1) {
      if((dnum % 100) == 0) {
        printf("%ld..",dnum); fflush(stdout);
      }
    }
  }

  fclose(docfl);
  free(line);
  free(words);
  if(verbosity>=1) {
    fprintf(stdout, "OK. (%ld examples read)\n", dnum);
  }
  (*totdoc)=dnum;
}

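/* Illustrative sketch (not part of the original code; compiled only if   */
/* SVM_COMMON_EXAMPLES is defined): parse_document() below expects one    */
/* example per line in the usual SVM-light format, e.g.                   */
/*                                                                        */
/*     -1 qid:3 cost:2 1:0.5 7:1.25 # optional comment                    */
/*                                                                        */
/* i.e. the target value first, then optional qid:/sid:/cost: fields,     */
/* then feature:value pairs with strictly increasing feature numbers.     */
/* The buffer is modified in place, so it must be writable.               */
# ifdef SVM_COMMON_EXAMPLES
static void example_parse_document_sketch(void)
{
  char   line[]="-1 qid:3 cost:2 1:0.5 7:1.25 # optional comment\n";
  WORD   words[10];
  char   *comment;
  double label,costfactor;
  long   queryid,slackid,numwords;

  if(parse_document(line,words,&label,&queryid,&slackid,&costfactor,
                    &numwords,10,&comment)) {
    printf("label=%f qid=%ld cost=%f first feature %ld:%f comment='%s'\n",
           label,queryid,costfactor,
           (long)words[0].wnum,(double)words[0].weight,comment);
  }
}
# endif
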
int parse_document(char *line, WORD *words, double *label,
                   long *queryid, long *slackid, double *costfactor,
                   long int *numwords, long int max_words_doc,
                   char **comment)
{
  register long wpos,pos;
  long wnum;
  double weight;
  int numread;
  char featurepair[1000],junk[1000];

  (*queryid)=0;
  (*slackid)=0;
  (*costfactor)=1;

  pos=0;
  (*comment)=NULL;
  while(line[pos] ) {       /* cut off comments */
    if((line[pos] == '#') && (!(*comment))) {
      line[pos]=0;
      (*comment)=&(line[pos+1]);
    }
    if(line[pos] == '\n') { /* strip the CR */
      line[pos]=0;
    }
    pos++;
  }
  if(!(*comment)) (*comment)=&(line[pos]);
  /* printf("Comment: '%s'\n",(*comment)); */

  wpos=0;
  /* check, that line starts with target value or zero, but not with
     feature pair */
  if(sscanf(line,"%s",featurepair) == EOF) return(0);
  pos=0;
  while((featurepair[pos] != ':') && featurepair[pos]) pos++;
  if(featurepair[pos] == ':') {
    perror ("Line must start with label or 0!!!\n");
    printf("LINE: %s\n",line);
    exit (1);
  }
  /* read the target value */
  if(sscanf(line,"%lf",label) == EOF) return(0);
  pos=0;
  while(space_or_null((int)line[pos])) pos++;
  while((!space_or_null((int)line[pos])) && line[pos]) pos++;
  while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) &&
        (numread > 0) &&
        (wpos<max_words_doc)) {
    /* advance pos past the feature pair just read */
    while(space_or_null((int)line[pos])) pos++;
    while((!space_or_null((int)line[pos])) && line[pos]) pos++;
    if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
      /* it is the query id */
      (*queryid)=(long)wnum;
    }
    else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
      /* it is the slack id */
      if(wnum > 0)
        (*slackid)=(long)wnum;
      else {
        perror ("Slack-id must be greater or equal to 1!!!\n");
        printf("LINE: %s\n",line);
        exit (1);
      }
    }
    else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
      /* it is the example-dependent cost factor */
      (*costfactor)=(double)weight;
    }
    else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
      /* it is a regular feature */
      if(wnum<=0) {
        perror ("Feature numbers must be larger or equal to 1!!!\n");
        printf("LINE: %s\n",line);
        exit (1);
      }
      if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
        perror ("Features must be in increasing order!!!\n");
        printf("LINE: %s\n",line);
        exit (1);
      }
      (words[wpos]).wnum=wnum;
      (words[wpos]).weight=(FVAL)weight;
      wpos++;
    }
    else {
      perror ("Cannot parse feature/value pair!!!\n");
      printf("'%s' in LINE: %s\n",featurepair,line);
      exit (1);
    }
  }
  (words[wpos]).wnum=0;
  (*numwords)=wpos+1;
  return(1);
}

double *read_alphas(char *alphafile,long totdoc)
     /* reads the alpha vector from a file as written by the
        write_alphas function */
{
  FILE *fl;
  double *alpha;
  long dnum;

  if ((fl = fopen (alphafile, "r")) == NULL)
  { perror (alphafile); exit (1); }

  alpha = (double *)my_malloc(sizeof(double)*totdoc);
  if(verbosity>=1) {
    printf("Reading alphas..."); fflush(stdout);
  }
  dnum=0;
  while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
    dnum++;
  }
  if(dnum != totdoc) {
    perror ("\nNot enough values in alpha file!");
    exit (1);
  }
  fclose(fl);

  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }

  return(alpha);
}

void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
     /* Grep through file and count number of lines, maximum number of
        spaces per line, and longest line. */
{
  FILE *fl;
  int ic;
  char c;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }
  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    c=(char)ic;
    current_length++;
    if(space_or_null((int)c)) {
      current_wol++;
    }
    if(c == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
        (*ll)=current_length;
      }
      if(current_wol>(*wol)) {
        (*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  fclose(fl);
}

long minl(long int a, long int b)
{
  if(a<b)
    return(a);
  else
    return(b);
}

long maxl(long int a, long int b)
{
  if(a>b)
    return(a);
  else
    return(b);
}

long get_runtime(void)
{
  clock_t start;
  start = clock();
  return((long)((double)start*100.0/(double)CLOCKS_PER_SEC));
}


# ifdef _MSC_VER

int isnan(double a)
{
  return(_isnan(a));
}

# endif

int space_or_null(int c) {
  if (c==0)
    return 1;
  return isspace(c);
}

void *my_malloc(size_t size)
{
  void *ptr;
  ptr=(void *)malloc(size);
  if(!ptr) {
    perror ("Out of memory!\n");
    exit (1);
  }
  return(ptr);
}

void copyright_notice(void)
{
  printf("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n");
  printf("This software is available for non-commercial use only. It must not\n");
  printf("be modified and distributed without prior permission of the author.\n");
  printf("The author is not responsible for implications from the use of this\n");
  printf("software.\n\n");
}