Daniel@0: /************************************************************************/ Daniel@0: /* */ Daniel@0: /* svm_common.c */ Daniel@0: /* */ Daniel@0: /* Definitions and functions used in both svm_learn and svm_classify. */ Daniel@0: /* */ Daniel@0: /* Author: Thorsten Joachims */ Daniel@0: /* Date: 02.07.04 */ Daniel@0: /* */ Daniel@0: /* Copyright (c) 2004 Thorsten Joachims - All rights reserved */ Daniel@0: /* */ Daniel@0: /* This software is available for non-commercial use only. It must */ Daniel@0: /* not be modified and distributed without prior permission of the */ Daniel@0: /* author. The author is not responsible for implications from the */ Daniel@0: /* use of this software. */ Daniel@0: /* */ Daniel@0: /************************************************************************/ Daniel@0: Daniel@0: # include "ctype.h" Daniel@0: # include "svm_common.h" Daniel@0: # include "kernel.h" /* this contains a user supplied kernel */ Daniel@0: Daniel@0: long verbosity; /* verbosity level (0-4) */ Daniel@0: long kernel_cache_statistic; Daniel@0: Daniel@0: double classify_example(MODEL *model, DOC *ex) Daniel@0: /* classifies one example */ Daniel@0: { Daniel@0: register long i; Daniel@0: register double dist; Daniel@0: Daniel@0: if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights)) Daniel@0: return(classify_example_linear(model,ex)); Daniel@0: Daniel@0: dist=0; Daniel@0: for(i=1;isv_num;i++) { Daniel@0: dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i]; Daniel@0: } Daniel@0: return(dist-model->b); Daniel@0: } Daniel@0: Daniel@0: double classify_example_linear(MODEL *model, DOC *ex) Daniel@0: /* classifies example for linear kernel */ Daniel@0: Daniel@0: /* important: the model must have the linear weight vector computed */ Daniel@0: /* use: add_weight_vector_to_linear_model(&model); */ Daniel@0: Daniel@0: Daniel@0: /* important: the feature numbers in the example to classify must */ Daniel@0: /* not be larger than the weight vector! */ Daniel@0: { Daniel@0: double sum=0; Daniel@0: SVECTOR *f; Daniel@0: Daniel@0: for(f=ex->fvec;f;f=f->next) Daniel@0: sum+=f->factor*sprod_ns(model->lin_weights,f); Daniel@0: return(sum-model->b); Daniel@0: } Daniel@0: Daniel@0: Daniel@0: CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b) Daniel@0: /* calculate the kernel function */ Daniel@0: { Daniel@0: double sum=0; Daniel@0: SVECTOR *fa,*fb; Daniel@0: Daniel@0: /* in case the constraints are sums of feature vector as represented Daniel@0: as a list of SVECTOR's with their coefficient factor in the sum, Daniel@0: take the kernel between all pairs */ Daniel@0: for(fa=a->fvec;fa;fa=fa->next) { Daniel@0: for(fb=b->fvec;fb;fb=fb->next) { Daniel@0: if(fa->kernel_id == fb->kernel_id) Daniel@0: sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb); Daniel@0: } Daniel@0: } Daniel@0: return(sum); Daniel@0: } Daniel@0: Daniel@0: CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b) Daniel@0: /* calculate the kernel function between two vectors */ Daniel@0: { Daniel@0: kernel_cache_statistic++; Daniel@0: switch(kernel_parm->kernel_type) { Daniel@0: case 0: /* linear */ Daniel@0: return((CFLOAT)sprod_ss(a,b)); Daniel@0: case 1: /* polynomial */ Daniel@0: return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree)); Daniel@0: case 2: /* radial basis function */ Daniel@0: return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq))); Daniel@0: case 3: /* sigmoid neural net */ Daniel@0: return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const)); Daniel@0: case 4: /* custom-kernel supplied in file kernel.h*/ Daniel@0: return((CFLOAT)custom_kernel(kernel_parm,a,b)); Daniel@0: default: printf("Error: Unknown kernel function\n"); exit(1); Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: Daniel@0: SVECTOR *create_svector(WORD *words,char *userdefined,double factor) Daniel@0: { Daniel@0: SVECTOR *vec; Daniel@0: long fnum,i; Daniel@0: Daniel@0: fnum=0; Daniel@0: while(words[fnum].wnum) { Daniel@0: fnum++; Daniel@0: } Daniel@0: fnum++; Daniel@0: vec = (SVECTOR *)my_malloc(sizeof(SVECTOR)); Daniel@0: vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum)); Daniel@0: for(i=0;iwords[i]=words[i]; Daniel@0: } Daniel@0: vec->twonorm_sq=sprod_ss(vec,vec); Daniel@0: Daniel@0: fnum=0; Daniel@0: while(userdefined[fnum]) { Daniel@0: fnum++; Daniel@0: } Daniel@0: fnum++; Daniel@0: vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum)); Daniel@0: for(i=0;iuserdefined[i]=userdefined[i]; Daniel@0: } Daniel@0: vec->kernel_id=0; Daniel@0: vec->next=NULL; Daniel@0: vec->factor=factor; Daniel@0: return(vec); Daniel@0: } Daniel@0: Daniel@0: SVECTOR *copy_svector(SVECTOR *vec) Daniel@0: { Daniel@0: SVECTOR *newvec=NULL; Daniel@0: if(vec) { Daniel@0: newvec=create_svector(vec->words,vec->userdefined,vec->factor); Daniel@0: newvec->next=copy_svector(vec->next); Daniel@0: } Daniel@0: return(newvec); Daniel@0: } Daniel@0: Daniel@0: void free_svector(SVECTOR *vec) Daniel@0: { Daniel@0: if(vec) { Daniel@0: free(vec->words); Daniel@0: if(vec->userdefined) Daniel@0: free(vec->userdefined); Daniel@0: free_svector(vec->next); Daniel@0: free(vec); Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: double sprod_ss(SVECTOR *a, SVECTOR *b) Daniel@0: /* compute the inner product of two sparse vectors */ Daniel@0: { Daniel@0: register CFLOAT sum=0; Daniel@0: register WORD *ai,*bj; Daniel@0: ai=a->words; Daniel@0: bj=b->words; Daniel@0: while (ai->wnum && bj->wnum) { Daniel@0: if(ai->wnum > bj->wnum) { Daniel@0: bj++; Daniel@0: } Daniel@0: else if (ai->wnum < bj->wnum) { Daniel@0: ai++; Daniel@0: } Daniel@0: else { Daniel@0: sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight); Daniel@0: ai++; Daniel@0: bj++; Daniel@0: } Daniel@0: } Daniel@0: return((double)sum); Daniel@0: } Daniel@0: Daniel@0: SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b) Daniel@0: /* compute the difference a-b of two sparse vectors */ Daniel@0: /* Note: SVECTOR lists are not followed, but only the first Daniel@0: SVECTOR is used */ Daniel@0: { Daniel@0: SVECTOR *vec; Daniel@0: register WORD *sum,*sumi; Daniel@0: register WORD *ai,*bj; Daniel@0: long veclength; Daniel@0: Daniel@0: ai=a->words; Daniel@0: bj=b->words; Daniel@0: veclength=0; Daniel@0: while (ai->wnum && bj->wnum) { Daniel@0: if(ai->wnum > bj->wnum) { Daniel@0: veclength++; Daniel@0: bj++; Daniel@0: } Daniel@0: else if (ai->wnum < bj->wnum) { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: } Daniel@0: else { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: bj++; Daniel@0: } Daniel@0: } Daniel@0: while (bj->wnum) { Daniel@0: veclength++; Daniel@0: bj++; Daniel@0: } Daniel@0: while (ai->wnum) { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: } Daniel@0: veclength++; Daniel@0: Daniel@0: sum=(WORD *)my_malloc(sizeof(WORD)*veclength); Daniel@0: sumi=sum; Daniel@0: ai=a->words; Daniel@0: bj=b->words; Daniel@0: while (ai->wnum && bj->wnum) { Daniel@0: if(ai->wnum > bj->wnum) { Daniel@0: (*sumi)=(*bj); Daniel@0: sumi->weight*=(-1); Daniel@0: sumi++; Daniel@0: bj++; Daniel@0: } Daniel@0: else if (ai->wnum < bj->wnum) { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: } Daniel@0: else { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi->weight-=bj->weight; Daniel@0: if(sumi->weight != 0) Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: bj++; Daniel@0: } Daniel@0: } Daniel@0: while (bj->wnum) { Daniel@0: (*sumi)=(*bj); Daniel@0: sumi->weight*=(-1); Daniel@0: sumi++; Daniel@0: bj++; Daniel@0: } Daniel@0: while (ai->wnum) { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: } Daniel@0: sumi->wnum=0; Daniel@0: Daniel@0: vec=create_svector(sum,"",1.0); Daniel@0: free(sum); Daniel@0: Daniel@0: return(vec); Daniel@0: } Daniel@0: Daniel@0: SVECTOR* add_ss(SVECTOR *a, SVECTOR *b) Daniel@0: /* compute the sum a+b of two sparse vectors */ Daniel@0: /* Note: SVECTOR lists are not followed, but only the first Daniel@0: SVECTOR is used */ Daniel@0: { Daniel@0: SVECTOR *vec; Daniel@0: register WORD *sum,*sumi; Daniel@0: register WORD *ai,*bj; Daniel@0: long veclength; Daniel@0: Daniel@0: ai=a->words; Daniel@0: bj=b->words; Daniel@0: veclength=0; Daniel@0: while (ai->wnum && bj->wnum) { Daniel@0: if(ai->wnum > bj->wnum) { Daniel@0: veclength++; Daniel@0: bj++; Daniel@0: } Daniel@0: else if (ai->wnum < bj->wnum) { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: } Daniel@0: else { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: bj++; Daniel@0: } Daniel@0: } Daniel@0: while (bj->wnum) { Daniel@0: veclength++; Daniel@0: bj++; Daniel@0: } Daniel@0: while (ai->wnum) { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: } Daniel@0: veclength++; Daniel@0: Daniel@0: /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/ Daniel@0: Daniel@0: sum=(WORD *)my_malloc(sizeof(WORD)*veclength); Daniel@0: sumi=sum; Daniel@0: ai=a->words; Daniel@0: bj=b->words; Daniel@0: while (ai->wnum && bj->wnum) { Daniel@0: if(ai->wnum > bj->wnum) { Daniel@0: (*sumi)=(*bj); Daniel@0: sumi++; Daniel@0: bj++; Daniel@0: } Daniel@0: else if (ai->wnum < bj->wnum) { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: } Daniel@0: else { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi->weight+=bj->weight; Daniel@0: if(sumi->weight != 0) Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: bj++; Daniel@0: } Daniel@0: } Daniel@0: while (bj->wnum) { Daniel@0: (*sumi)=(*bj); Daniel@0: sumi++; Daniel@0: bj++; Daniel@0: } Daniel@0: while (ai->wnum) { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: } Daniel@0: sumi->wnum=0; Daniel@0: Daniel@0: vec=create_svector(sum,"",1.0); Daniel@0: free(sum); Daniel@0: Daniel@0: return(vec); Daniel@0: } Daniel@0: Daniel@0: SVECTOR* add_list_ss(SVECTOR *a) Daniel@0: /* computes the linear combination of the SVECTOR list weighted Daniel@0: by the factor of each SVECTOR */ Daniel@0: { Daniel@0: SVECTOR *scaled,*oldsum,*sum,*f; Daniel@0: WORD empty[2]; Daniel@0: Daniel@0: if(a){ Daniel@0: sum=smult_s(a,a->factor); Daniel@0: for(f=a->next;f;f=f->next) { Daniel@0: scaled=smult_s(f,f->factor); Daniel@0: oldsum=sum; Daniel@0: sum=add_ss(sum,scaled); Daniel@0: free_svector(oldsum); Daniel@0: free_svector(scaled); Daniel@0: } Daniel@0: sum->factor=1.0; Daniel@0: } Daniel@0: else { Daniel@0: empty[0].wnum=0; Daniel@0: sum=create_svector(empty,"",1.0); Daniel@0: } Daniel@0: return(sum); Daniel@0: } Daniel@0: Daniel@0: void append_svector_list(SVECTOR *a, SVECTOR *b) Daniel@0: /* appends SVECTOR b to the end of SVECTOR a. */ Daniel@0: { Daniel@0: SVECTOR *f; Daniel@0: Daniel@0: for(f=a;f->next;f=f->next); /* find end of first vector list */ Daniel@0: f->next=b; /* append the two vector lists */ Daniel@0: } Daniel@0: Daniel@0: SVECTOR* smult_s(SVECTOR *a, double factor) Daniel@0: /* scale sparse vector a by factor */ Daniel@0: { Daniel@0: SVECTOR *vec; Daniel@0: register WORD *sum,*sumi; Daniel@0: register WORD *ai; Daniel@0: long veclength; Daniel@0: Daniel@0: ai=a->words; Daniel@0: veclength=0; Daniel@0: while (ai->wnum) { Daniel@0: veclength++; Daniel@0: ai++; Daniel@0: } Daniel@0: veclength++; Daniel@0: Daniel@0: sum=(WORD *)my_malloc(sizeof(WORD)*veclength); Daniel@0: sumi=sum; Daniel@0: ai=a->words; Daniel@0: while (ai->wnum) { Daniel@0: (*sumi)=(*ai); Daniel@0: sumi->weight*=factor; Daniel@0: if(sumi->weight != 0) Daniel@0: sumi++; Daniel@0: ai++; Daniel@0: } Daniel@0: sumi->wnum=0; Daniel@0: Daniel@0: vec=create_svector(sum,a->userdefined,a->factor); Daniel@0: free(sum); Daniel@0: Daniel@0: return(vec); Daniel@0: } Daniel@0: Daniel@0: int featvec_eq(SVECTOR *a, SVECTOR *b) Daniel@0: /* tests two sparse vectors for equality */ Daniel@0: { Daniel@0: register WORD *ai,*bj; Daniel@0: ai=a->words; Daniel@0: bj=b->words; Daniel@0: while (ai->wnum && bj->wnum) { Daniel@0: if(ai->wnum > bj->wnum) { Daniel@0: if((CFLOAT)(bj->weight) != 0) Daniel@0: return(0); Daniel@0: bj++; Daniel@0: } Daniel@0: else if (ai->wnum < bj->wnum) { Daniel@0: if((CFLOAT)(ai->weight) != 0) Daniel@0: return(0); Daniel@0: ai++; Daniel@0: } Daniel@0: else { Daniel@0: if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight)) Daniel@0: return(0); Daniel@0: ai++; Daniel@0: bj++; Daniel@0: } Daniel@0: } Daniel@0: return(1); Daniel@0: } Daniel@0: Daniel@0: double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm) Daniel@0: /* compute length of weight vector */ Daniel@0: { Daniel@0: register long i,j; Daniel@0: register double sum=0,alphai; Daniel@0: register DOC *supveci; Daniel@0: Daniel@0: for(i=1;isv_num;i++) { Daniel@0: alphai=model->alpha[i]; Daniel@0: supveci=model->supvec[i]; Daniel@0: for(j=1;jsv_num;j++) { Daniel@0: sum+=alphai*model->alpha[j] Daniel@0: *kernel(kernel_parm,supveci,model->supvec[j]); Daniel@0: } Daniel@0: } Daniel@0: return(sqrt(sum)); Daniel@0: } Daniel@0: Daniel@0: void clear_vector_n(double *vec, long int n) Daniel@0: { Daniel@0: register long i; Daniel@0: for(i=0;i<=n;i++) vec[i]=0; Daniel@0: } Daniel@0: Daniel@0: void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor) Daniel@0: { Daniel@0: register WORD *ai; Daniel@0: ai=vec_s->words; Daniel@0: while (ai->wnum) { Daniel@0: vec_n[ai->wnum]+=(faktor*ai->weight); Daniel@0: ai++; Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: double sprod_ns(double *vec_n, SVECTOR *vec_s) Daniel@0: { Daniel@0: register double sum=0; Daniel@0: register WORD *ai; Daniel@0: ai=vec_s->words; Daniel@0: while (ai->wnum) { Daniel@0: sum+=(vec_n[ai->wnum]*ai->weight); Daniel@0: ai++; Daniel@0: } Daniel@0: return(sum); Daniel@0: } Daniel@0: Daniel@0: void add_weight_vector_to_linear_model(MODEL *model) Daniel@0: /* compute weight vector in linear case and add to model */ Daniel@0: { Daniel@0: long i; Daniel@0: SVECTOR *f; Daniel@0: Daniel@0: model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1)); Daniel@0: clear_vector_n(model->lin_weights,model->totwords); Daniel@0: for(i=1;isv_num;i++) { Daniel@0: for(f=(model->supvec[i])->fvec;f;f=f->next) Daniel@0: add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]); Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: Daniel@0: DOC *create_example(long docnum, long queryid, long slackid, Daniel@0: double costfactor, SVECTOR *fvec) Daniel@0: { Daniel@0: DOC *example; Daniel@0: example = (DOC *)my_malloc(sizeof(DOC)); Daniel@0: example->docnum=docnum; Daniel@0: example->queryid=queryid; Daniel@0: example->slackid=slackid; Daniel@0: example->costfactor=costfactor; Daniel@0: example->fvec=fvec; Daniel@0: return(example); Daniel@0: } Daniel@0: Daniel@0: void free_example(DOC *example, long deep) Daniel@0: { Daniel@0: if(example) { Daniel@0: if(deep) { Daniel@0: if(example->fvec) Daniel@0: free_svector(example->fvec); Daniel@0: } Daniel@0: free(example); Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: void write_model(char *modelfile, MODEL *model) Daniel@0: { Daniel@0: FILE *modelfl; Daniel@0: long j,i,sv_num; Daniel@0: SVECTOR *v; Daniel@0: Daniel@0: if(verbosity>=1) { Daniel@0: printf("Writing model file..."); fflush(stdout); Daniel@0: } Daniel@0: if ((modelfl = fopen (modelfile, "w")) == NULL) Daniel@0: { perror (modelfile); exit (1); } Daniel@0: fprintf(modelfl,"SVM-light Version %s\n",VERSION); Daniel@0: fprintf(modelfl,"%ld # kernel type\n", Daniel@0: model->kernel_parm.kernel_type); Daniel@0: fprintf(modelfl,"%ld # kernel parameter -d \n", Daniel@0: model->kernel_parm.poly_degree); Daniel@0: fprintf(modelfl,"%.8g # kernel parameter -g \n", Daniel@0: model->kernel_parm.rbf_gamma); Daniel@0: fprintf(modelfl,"%.8g # kernel parameter -s \n", Daniel@0: model->kernel_parm.coef_lin); Daniel@0: fprintf(modelfl,"%.8g # kernel parameter -r \n", Daniel@0: model->kernel_parm.coef_const); Daniel@0: fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom); Daniel@0: fprintf(modelfl,"%ld # highest feature index \n",model->totwords); Daniel@0: fprintf(modelfl,"%ld # number of training documents \n",model->totdoc); Daniel@0: Daniel@0: sv_num=1; Daniel@0: for(i=1;isv_num;i++) { Daniel@0: for(v=model->supvec[i]->fvec;v;v=v->next) Daniel@0: sv_num++; Daniel@0: } Daniel@0: fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num); Daniel@0: fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b); Daniel@0: Daniel@0: for(i=1;isv_num;i++) { Daniel@0: for(v=model->supvec[i]->fvec;v;v=v->next) { Daniel@0: fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor); Daniel@0: for (j=0; (v->words[j]).wnum; j++) { Daniel@0: fprintf(modelfl,"%ld:%.8g ", Daniel@0: (long)(v->words[j]).wnum, Daniel@0: (double)(v->words[j]).weight); Daniel@0: } Daniel@0: fprintf(modelfl,"#%s\n",v->userdefined); Daniel@0: /* NOTE: this could be made more efficient by summing the Daniel@0: alpha's of identical vectors before writing them to the Daniel@0: file. */ Daniel@0: } Daniel@0: } Daniel@0: fclose(modelfl); Daniel@0: if(verbosity>=1) { Daniel@0: printf("done\n"); Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: Daniel@0: MODEL *read_model(char *modelfile) Daniel@0: { Daniel@0: FILE *modelfl; Daniel@0: long i,queryid,slackid; Daniel@0: double costfactor; Daniel@0: long max_sv,max_words,ll,wpos; Daniel@0: char *line,*comment; Daniel@0: WORD *words; Daniel@0: char version_buffer[100]; Daniel@0: MODEL *model; Daniel@0: Daniel@0: if(verbosity>=1) { Daniel@0: printf("Reading model..."); fflush(stdout); Daniel@0: } Daniel@0: Daniel@0: nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */ Daniel@0: max_words+=2; Daniel@0: ll+=2; Daniel@0: Daniel@0: words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10)); Daniel@0: line = (char *)my_malloc(sizeof(char)*ll); Daniel@0: model = (MODEL *)my_malloc(sizeof(MODEL)); Daniel@0: Daniel@0: if ((modelfl = fopen (modelfile, "r")) == NULL) Daniel@0: { perror (modelfile); exit (1); } Daniel@0: Daniel@0: fscanf(modelfl,"SVM-light Version %s\n",version_buffer); Daniel@0: if(strcmp(version_buffer,VERSION)) { Daniel@0: perror ("Version of model-file does not match version of svm_classify!"); Daniel@0: exit (1); Daniel@0: } Daniel@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type); Daniel@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree); Daniel@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma); Daniel@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin); Daniel@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const); Daniel@0: fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom); Daniel@0: Daniel@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords); Daniel@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc); Daniel@0: fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num); Daniel@0: fscanf(modelfl,"%lf%*[^\n]\n", &model->b); Daniel@0: Daniel@0: model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); Daniel@0: model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); Daniel@0: model->index=NULL; Daniel@0: model->lin_weights=NULL; Daniel@0: Daniel@0: for(i=1;isv_num;i++) { Daniel@0: fgets(line,(int)ll,modelfl); Daniel@0: if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid, Daniel@0: &costfactor,&wpos,max_words,&comment)) { Daniel@0: printf("\nParsing error while reading model file in SV %ld!\n%s", Daniel@0: i,line); Daniel@0: exit(1); Daniel@0: } Daniel@0: model->supvec[i] = create_example(-1, Daniel@0: 0,0, Daniel@0: 0.0, Daniel@0: create_svector(words,comment,1.0)); Daniel@0: } Daniel@0: fclose(modelfl); Daniel@0: free(line); Daniel@0: free(words); Daniel@0: if(verbosity>=1) { Daniel@0: fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1)); Daniel@0: } Daniel@0: return(model); Daniel@0: } Daniel@0: Daniel@0: MODEL *copy_model(MODEL *model) Daniel@0: { Daniel@0: MODEL *newmodel; Daniel@0: long i; Daniel@0: Daniel@0: newmodel=(MODEL *)my_malloc(sizeof(MODEL)); Daniel@0: (*newmodel)=(*model); Daniel@0: newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num); Daniel@0: newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num); Daniel@0: newmodel->index = NULL; /* index is not copied */ Daniel@0: newmodel->supvec[0] = NULL; Daniel@0: newmodel->alpha[0] = 0; Daniel@0: for(i=1;isv_num;i++) { Daniel@0: newmodel->alpha[i]=model->alpha[i]; Daniel@0: newmodel->supvec[i]=create_example(model->supvec[i]->docnum, Daniel@0: model->supvec[i]->queryid,0, Daniel@0: model->supvec[i]->costfactor, Daniel@0: copy_svector(model->supvec[i]->fvec)); Daniel@0: } Daniel@0: if(model->lin_weights) { Daniel@0: newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1)); Daniel@0: for(i=0;itotwords+1;i++) Daniel@0: newmodel->lin_weights[i]=model->lin_weights[i]; Daniel@0: } Daniel@0: return(newmodel); Daniel@0: } Daniel@0: Daniel@0: void free_model(MODEL *model, int deep) Daniel@0: { Daniel@0: long i; Daniel@0: Daniel@0: if(model->supvec) { Daniel@0: if(deep) { Daniel@0: for(i=1;isv_num;i++) { Daniel@0: free_example(model->supvec[i],1); Daniel@0: } Daniel@0: } Daniel@0: free(model->supvec); Daniel@0: } Daniel@0: if(model->alpha) free(model->alpha); Daniel@0: if(model->index) free(model->index); Daniel@0: if(model->lin_weights) free(model->lin_weights); Daniel@0: free(model); Daniel@0: } Daniel@0: Daniel@0: Daniel@0: void read_documents(char *docfile, DOC ***docs, double **label, Daniel@0: long int *totwords, long int *totdoc) Daniel@0: { Daniel@0: char *line,*comment; Daniel@0: WORD *words; Daniel@0: long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs; Daniel@0: long max_words_doc, ll; Daniel@0: double doc_label,costfactor; Daniel@0: FILE *docfl; Daniel@0: Daniel@0: if(verbosity>=1) { Daniel@0: printf("Scanning examples..."); fflush(stdout); Daniel@0: } Daniel@0: nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */ Daniel@0: max_words_doc+=2; Daniel@0: ll+=2; Daniel@0: max_docs+=2; Daniel@0: if(verbosity>=1) { Daniel@0: printf("done\n"); fflush(stdout); Daniel@0: } Daniel@0: Daniel@0: (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs); /* feature vectors */ Daniel@0: (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */ Daniel@0: line = (char *)my_malloc(sizeof(char)*ll); Daniel@0: Daniel@0: if ((docfl = fopen (docfile, "r")) == NULL) Daniel@0: { perror (docfile); exit (1); } Daniel@0: Daniel@0: words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10)); Daniel@0: if(verbosity>=1) { Daniel@0: printf("Reading examples into memory..."); fflush(stdout); Daniel@0: } Daniel@0: dnum=0; Daniel@0: (*totwords)=0; Daniel@0: while((!feof(docfl)) && fgets(line,(int)ll,docfl)) { Daniel@0: if(line[0] == '#') continue; /* line contains comments */ Daniel@0: if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor, Daniel@0: &wpos,max_words_doc,&comment)) { Daniel@0: printf("\nParsing error in line %ld!\n%s",dnum,line); Daniel@0: exit(1); Daniel@0: } Daniel@0: (*label)[dnum]=doc_label; Daniel@0: /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */ Daniel@0: if(doc_label > 0) dpos++; Daniel@0: if (doc_label < 0) dneg++; Daniel@0: if (doc_label == 0) dunlab++; Daniel@0: if((wpos>1) && ((words[wpos-2]).wnum>(*totwords))) Daniel@0: (*totwords)=(words[wpos-2]).wnum; Daniel@0: if((*totwords) > MAXFEATNUM) { Daniel@0: printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n"); Daniel@0: printf("LINE: %s\n",line); Daniel@0: exit(1); Daniel@0: } Daniel@0: (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor, Daniel@0: create_svector(words,comment,1.0)); Daniel@0: /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq); */ Daniel@0: dnum++; Daniel@0: if(verbosity>=1) { Daniel@0: if((dnum % 100) == 0) { Daniel@0: printf("%ld..",dnum); fflush(stdout); Daniel@0: } Daniel@0: } Daniel@0: } Daniel@0: Daniel@0: fclose(docfl); Daniel@0: free(line); Daniel@0: free(words); Daniel@0: if(verbosity>=1) { Daniel@0: fprintf(stdout, "OK. (%ld examples read)\n", dnum); Daniel@0: } Daniel@0: (*totdoc)=dnum; Daniel@0: } Daniel@0: Daniel@0: int parse_document(char *line, WORD *words, double *label, Daniel@0: long *queryid, long *slackid, double *costfactor, Daniel@0: long int *numwords, long int max_words_doc, Daniel@0: char **comment) Daniel@0: { Daniel@0: register long wpos,pos; Daniel@0: long wnum; Daniel@0: double weight; Daniel@0: int numread; Daniel@0: char featurepair[1000],junk[1000]; Daniel@0: Daniel@0: (*queryid)=0; Daniel@0: (*slackid)=0; Daniel@0: (*costfactor)=1; Daniel@0: Daniel@0: pos=0; Daniel@0: (*comment)=NULL; Daniel@0: while(line[pos] ) { /* cut off comments */ Daniel@0: if((line[pos] == '#') && (!(*comment))) { Daniel@0: line[pos]=0; Daniel@0: (*comment)=&(line[pos+1]); Daniel@0: } Daniel@0: if(line[pos] == '\n') { /* strip the CR */ Daniel@0: line[pos]=0; Daniel@0: } Daniel@0: pos++; Daniel@0: } Daniel@0: if(!(*comment)) (*comment)=&(line[pos]); Daniel@0: /* printf("Comment: '%s'\n",(*comment)); */ Daniel@0: Daniel@0: wpos=0; Daniel@0: /* check, that line starts with target value or zero, but not with Daniel@0: feature pair */ Daniel@0: if(sscanf(line,"%s",featurepair) == EOF) return(0); Daniel@0: pos=0; Daniel@0: while((featurepair[pos] != ':') && featurepair[pos]) pos++; Daniel@0: if(featurepair[pos] == ':') { Daniel@0: perror ("Line must start with label or 0!!!\n"); Daniel@0: printf("LINE: %s\n",line); Daniel@0: exit (1); Daniel@0: } Daniel@0: /* read the target value */ Daniel@0: if(sscanf(line,"%lf",label) == EOF) return(0); Daniel@0: pos=0; Daniel@0: while(space_or_null((int)line[pos])) pos++; Daniel@0: while((!space_or_null((int)line[pos])) && line[pos]) pos++; Daniel@0: while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) && Daniel@0: (numread > 0) && Daniel@0: (wpos 0) Daniel@0: (*slackid)=(long)wnum; Daniel@0: else { Daniel@0: perror ("Slack-id must be greater or equal to 1!!!\n"); Daniel@0: printf("LINE: %s\n",line); Daniel@0: exit (1); Daniel@0: } Daniel@0: } Daniel@0: else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) { Daniel@0: /* it is the example-dependent cost factor */ Daniel@0: (*costfactor)=(double)weight; Daniel@0: } Daniel@0: else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) { Daniel@0: /* it is a regular feature */ Daniel@0: if(wnum<=0) { Daniel@0: perror ("Feature numbers must be larger or equal to 1!!!\n"); Daniel@0: printf("LINE: %s\n",line); Daniel@0: exit (1); Daniel@0: } Daniel@0: if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) { Daniel@0: perror ("Features must be in increasing order!!!\n"); Daniel@0: printf("LINE: %s\n",line); Daniel@0: exit (1); Daniel@0: } Daniel@0: (words[wpos]).wnum=wnum; Daniel@0: (words[wpos]).weight=(FVAL)weight; Daniel@0: wpos++; Daniel@0: } Daniel@0: else { Daniel@0: perror ("Cannot parse feature/value pair!!!\n"); Daniel@0: printf("'%s' in LINE: %s\n",featurepair,line); Daniel@0: exit (1); Daniel@0: } Daniel@0: } Daniel@0: (words[wpos]).wnum=0; Daniel@0: (*numwords)=wpos+1; Daniel@0: return(1); Daniel@0: } Daniel@0: Daniel@0: double *read_alphas(char *alphafile,long totdoc) Daniel@0: /* reads the alpha vector from a file as written by the Daniel@0: write_alphas function */ Daniel@0: { Daniel@0: FILE *fl; Daniel@0: double *alpha; Daniel@0: long dnum; Daniel@0: Daniel@0: if ((fl = fopen (alphafile, "r")) == NULL) Daniel@0: { perror (alphafile); exit (1); } Daniel@0: Daniel@0: alpha = (double *)my_malloc(sizeof(double)*totdoc); Daniel@0: if(verbosity>=1) { Daniel@0: printf("Reading alphas..."); fflush(stdout); Daniel@0: } Daniel@0: dnum=0; Daniel@0: while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum=1) { Daniel@0: printf("done\n"); fflush(stdout); Daniel@0: } Daniel@0: Daniel@0: return(alpha); Daniel@0: } Daniel@0: Daniel@0: void nol_ll(char *file, long int *nol, long int *wol, long int *ll) Daniel@0: /* Grep through file and count number of lines, maximum number of Daniel@0: spaces per line, and longest line. */ Daniel@0: { Daniel@0: FILE *fl; Daniel@0: int ic; Daniel@0: char c; Daniel@0: long current_length,current_wol; Daniel@0: Daniel@0: if ((fl = fopen (file, "r")) == NULL) Daniel@0: { perror (file); exit (1); } Daniel@0: current_length=0; Daniel@0: current_wol=0; Daniel@0: (*ll)=0; Daniel@0: (*nol)=1; Daniel@0: (*wol)=0; Daniel@0: while((ic=getc(fl)) != EOF) { Daniel@0: c=(char)ic; Daniel@0: current_length++; Daniel@0: if(space_or_null((int)c)) { Daniel@0: current_wol++; Daniel@0: } Daniel@0: if(c == '\n') { Daniel@0: (*nol)++; Daniel@0: if(current_length>(*ll)) { Daniel@0: (*ll)=current_length; Daniel@0: } Daniel@0: if(current_wol>(*wol)) { Daniel@0: (*wol)=current_wol; Daniel@0: } Daniel@0: current_length=0; Daniel@0: current_wol=0; Daniel@0: } Daniel@0: } Daniel@0: fclose(fl); Daniel@0: } Daniel@0: Daniel@0: long minl(long int a, long int b) Daniel@0: { Daniel@0: if(ab) Daniel@0: return(a); Daniel@0: else Daniel@0: return(b); Daniel@0: } Daniel@0: Daniel@0: long get_runtime(void) Daniel@0: { Daniel@0: clock_t start; Daniel@0: start = clock(); Daniel@0: return((long)((double)start*100.0/(double)CLOCKS_PER_SEC)); Daniel@0: } Daniel@0: Daniel@0: Daniel@0: # ifdef _MSC_VER Daniel@0: Daniel@0: int isnan(double a) Daniel@0: { Daniel@0: return(_isnan(a)); Daniel@0: } Daniel@0: Daniel@0: # endif Daniel@0: Daniel@0: int space_or_null(int c) { Daniel@0: if (c==0) Daniel@0: return 1; Daniel@0: return isspace(c); Daniel@0: } Daniel@0: Daniel@0: void *my_malloc(size_t size) Daniel@0: { Daniel@0: void *ptr; Daniel@0: ptr=(void *)malloc(size); Daniel@0: if(!ptr) { Daniel@0: perror ("Out of memory!\n"); Daniel@0: exit (1); Daniel@0: } Daniel@0: return(ptr); Daniel@0: } Daniel@0: Daniel@0: void copyright_notice(void) Daniel@0: { Daniel@0: printf("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n"); Daniel@0: printf("This software is available for non-commercial use only. It must not\n"); Daniel@0: printf("be modified and distributed without prior permission of the author.\n"); Daniel@0: printf("The author is not responsible for implications from the use of this\n"); Daniel@0: printf("software.\n\n"); Daniel@0: }