diff toolboxes/SVM-light/src/svm_common.c @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/toolboxes/SVM-light/src/svm_common.c	Tue Feb 10 15:05:51 2015 +0000
@@ -0,0 +1,984 @@
+/************************************************************************/
+/*                                                                      */
+/*   svm_common.c                                                       */
+/*                                                                      */
+/*   Definitions and functions used in both svm_learn and svm_classify. */
+/*                                                                      */
+/*   Author: Thorsten Joachims                                          */
+/*   Date: 02.07.04                                                     */
+/*                                                                      */
+/*   Copyright (c) 2004  Thorsten Joachims - All rights reserved        */
+/*                                                                      */
+/*   This software is available for non-commercial use only. It must    */
+/*   not be modified and distributed without prior permission of the    */
+/*   author. The author is not responsible for implications from the    */
+/*   use of this software.                                              */
+/*                                                                      */
+/************************************************************************/
+
+# include "ctype.h"
+# include "svm_common.h"
+# include "kernel.h"           /* this contains a user supplied kernel */
+
+long   verbosity;              /* verbosity level (0-4) */
+long   kernel_cache_statistic;
+
+double classify_example(MODEL *model, DOC *ex) 
+     /* classifies one example */
+{
+  register long i;
+  register double dist;
+
+  if((model->kernel_parm.kernel_type == LINEAR) && (model->lin_weights))
+    return(classify_example_linear(model,ex));
+	   
+  dist=0;
+  for(i=1;i<model->sv_num;i++) {  
+    dist+=kernel(&model->kernel_parm,model->supvec[i],ex)*model->alpha[i];
+  }
+  return(dist-model->b);
+}
+
+double classify_example_linear(MODEL *model, DOC *ex) 
+     /* classifies example for linear kernel */
+     
+     /* important: the model must have the linear weight vector computed */
+     /* use: add_weight_vector_to_linear_model(&model); */
+
+
+     /* important: the feature numbers in the example to classify must */
+     /*            not be larger than the weight vector!               */
+{
+  double sum=0;
+  SVECTOR *f;
+
+  for(f=ex->fvec;f;f=f->next)  
+    sum+=f->factor*sprod_ns(model->lin_weights,f);
+  return(sum-model->b);
+}
+
+
+CFLOAT kernel(KERNEL_PARM *kernel_parm, DOC *a, DOC *b) 
+     /* calculate the kernel function */
+{
+  double sum=0;
+  SVECTOR *fa,*fb;
+
+  /* in case the constraints are sums of feature vector as represented
+     as a list of SVECTOR's with their coefficient factor in the sum,
+     take the kernel between all pairs */ 
+  for(fa=a->fvec;fa;fa=fa->next) { 
+    for(fb=b->fvec;fb;fb=fb->next) {
+      if(fa->kernel_id == fb->kernel_id)
+	sum+=fa->factor*fb->factor*single_kernel(kernel_parm,fa,fb);
+    }
+  }
+  return(sum);
+}
+
+CFLOAT single_kernel(KERNEL_PARM *kernel_parm, SVECTOR *a, SVECTOR *b) 
+     /* calculate the kernel function between two vectors */
+{
+  kernel_cache_statistic++;
+  switch(kernel_parm->kernel_type) {
+    case 0: /* linear */ 
+            return((CFLOAT)sprod_ss(a,b)); 
+    case 1: /* polynomial */
+            return((CFLOAT)pow(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const,(double)kernel_parm->poly_degree)); 
+    case 2: /* radial basis function */
+            return((CFLOAT)exp(-kernel_parm->rbf_gamma*(a->twonorm_sq-2*sprod_ss(a,b)+b->twonorm_sq)));
+    case 3: /* sigmoid neural net */
+            return((CFLOAT)tanh(kernel_parm->coef_lin*sprod_ss(a,b)+kernel_parm->coef_const)); 
+    case 4: /* custom-kernel supplied in file kernel.h*/
+            return((CFLOAT)custom_kernel(kernel_parm,a,b)); 
+    default: printf("Error: Unknown kernel function\n"); exit(1);
+  }
+}
+
+
+SVECTOR *create_svector(WORD *words,char *userdefined,double factor)
+{
+  SVECTOR *vec;
+  long    fnum,i;
+
+  fnum=0;
+  while(words[fnum].wnum) {
+    fnum++;
+  }
+  fnum++;
+  vec = (SVECTOR *)my_malloc(sizeof(SVECTOR));
+  vec->words = (WORD *)my_malloc(sizeof(WORD)*(fnum));
+  for(i=0;i<fnum;i++) { 
+      vec->words[i]=words[i];
+  }
+  vec->twonorm_sq=sprod_ss(vec,vec);
+
+  fnum=0;
+  while(userdefined[fnum]) {
+    fnum++;
+  }
+  fnum++;
+  vec->userdefined = (char *)my_malloc(sizeof(char)*(fnum));
+  for(i=0;i<fnum;i++) { 
+      vec->userdefined[i]=userdefined[i];
+  }
+  vec->kernel_id=0;
+  vec->next=NULL;
+  vec->factor=factor;
+  return(vec);
+}
+
+SVECTOR *copy_svector(SVECTOR *vec)
+{
+  SVECTOR *newvec=NULL;
+  if(vec) {
+    newvec=create_svector(vec->words,vec->userdefined,vec->factor);
+    newvec->next=copy_svector(vec->next);
+  }
+  return(newvec);
+}
+    
+void free_svector(SVECTOR *vec)
+{
+  if(vec) {
+    free(vec->words);
+    if(vec->userdefined)
+      free(vec->userdefined);
+    free_svector(vec->next);
+    free(vec);
+  }
+}
+
+double sprod_ss(SVECTOR *a, SVECTOR *b) 
+     /* compute the inner product of two sparse vectors */
+{
+    register CFLOAT sum=0;
+    register WORD *ai,*bj;
+    ai=a->words;
+    bj=b->words;
+    while (ai->wnum && bj->wnum) {
+      if(ai->wnum > bj->wnum) {
+	bj++;
+      }
+      else if (ai->wnum < bj->wnum) {
+	ai++;
+      }
+      else {
+	sum+=(CFLOAT)(ai->weight) * (CFLOAT)(bj->weight);
+	ai++;
+	bj++;
+      }
+    }
+    return((double)sum);
+}
+
+SVECTOR* sub_ss(SVECTOR *a, SVECTOR *b) 
+     /* compute the difference a-b of two sparse vectors */
+     /* Note: SVECTOR lists are not followed, but only the first
+	SVECTOR is used */
+{
+    SVECTOR *vec;
+    register WORD *sum,*sumi;
+    register WORD *ai,*bj;
+    long veclength;
+  
+    ai=a->words;
+    bj=b->words;
+    veclength=0;
+    while (ai->wnum && bj->wnum) {
+      if(ai->wnum > bj->wnum) {
+	veclength++;
+	bj++;
+      }
+      else if (ai->wnum < bj->wnum) {
+	veclength++;
+	ai++;
+      }
+      else {
+	veclength++;
+	ai++;
+	bj++;
+      }
+    }
+    while (bj->wnum) {
+      veclength++;
+      bj++;
+    }
+    while (ai->wnum) {
+      veclength++;
+      ai++;
+    }
+    veclength++;
+
+    sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
+    sumi=sum;
+    ai=a->words;
+    bj=b->words;
+    while (ai->wnum && bj->wnum) {
+      if(ai->wnum > bj->wnum) {
+	(*sumi)=(*bj);
+	sumi->weight*=(-1);
+	sumi++;
+	bj++;
+      }
+      else if (ai->wnum < bj->wnum) {
+	(*sumi)=(*ai);
+	sumi++;
+	ai++;
+      }
+      else {
+	(*sumi)=(*ai);
+	sumi->weight-=bj->weight;
+	if(sumi->weight != 0)
+	  sumi++;
+	ai++;
+	bj++;
+      }
+    }
+    while (bj->wnum) {
+      (*sumi)=(*bj);
+      sumi->weight*=(-1);
+      sumi++;
+      bj++;
+    }
+    while (ai->wnum) {
+      (*sumi)=(*ai);
+      sumi++;
+      ai++;
+    }
+    sumi->wnum=0;
+
+    vec=create_svector(sum,"",1.0);
+    free(sum);
+
+    return(vec);
+}
+
+SVECTOR* add_ss(SVECTOR *a, SVECTOR *b) 
+     /* compute the sum a+b of two sparse vectors */
+     /* Note: SVECTOR lists are not followed, but only the first
+	SVECTOR is used */
+{
+    SVECTOR *vec;
+    register WORD *sum,*sumi;
+    register WORD *ai,*bj;
+    long veclength;
+  
+    ai=a->words;
+    bj=b->words;
+    veclength=0;
+    while (ai->wnum && bj->wnum) {
+      if(ai->wnum > bj->wnum) {
+	veclength++;
+	bj++;
+      }
+      else if (ai->wnum < bj->wnum) {
+	veclength++;
+	ai++;
+      }
+      else {
+	veclength++;
+	ai++;
+	bj++;
+      }
+    }
+    while (bj->wnum) {
+      veclength++;
+      bj++;
+    }
+    while (ai->wnum) {
+      veclength++;
+      ai++;
+    }
+    veclength++;
+
+    /*** is veclength=lengSequence(a)+lengthSequence(b)? ***/
+
+    sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
+    sumi=sum;
+    ai=a->words;
+    bj=b->words;
+    while (ai->wnum && bj->wnum) {
+      if(ai->wnum > bj->wnum) {
+	(*sumi)=(*bj);
+	sumi++;
+	bj++;
+      }
+      else if (ai->wnum < bj->wnum) {
+	(*sumi)=(*ai);
+	sumi++;
+	ai++;
+      }
+      else {
+	(*sumi)=(*ai);
+	sumi->weight+=bj->weight;
+	if(sumi->weight != 0)
+	  sumi++;
+	ai++;
+	bj++;
+      }
+    }
+    while (bj->wnum) {
+      (*sumi)=(*bj);
+      sumi++;
+      bj++;
+    }
+    while (ai->wnum) {
+      (*sumi)=(*ai);
+      sumi++;
+      ai++;
+    }
+    sumi->wnum=0;
+
+    vec=create_svector(sum,"",1.0);
+    free(sum);
+
+    return(vec);
+}
+
+SVECTOR* add_list_ss(SVECTOR *a) 
+     /* computes the linear combination of the SVECTOR list weighted
+	by the factor of each SVECTOR */
+{
+  SVECTOR *scaled,*oldsum,*sum,*f;
+  WORD    empty[2];
+    
+  if(a){
+    sum=smult_s(a,a->factor);
+    for(f=a->next;f;f=f->next) {
+      scaled=smult_s(f,f->factor);
+      oldsum=sum;
+      sum=add_ss(sum,scaled);
+      free_svector(oldsum);
+      free_svector(scaled);
+    }
+    sum->factor=1.0;
+  }
+  else {
+    empty[0].wnum=0;
+    sum=create_svector(empty,"",1.0);
+  }
+  return(sum);
+}
+
+void append_svector_list(SVECTOR *a, SVECTOR *b) 
+     /* appends SVECTOR b to the end of SVECTOR a. */
+{
+    SVECTOR *f;
+    
+    for(f=a;f->next;f=f->next);  /* find end of first vector list */
+    f->next=b;                   /* append the two vector lists */
+}
+
+SVECTOR* smult_s(SVECTOR *a, double factor) 
+     /* scale sparse vector a by factor */
+{
+    SVECTOR *vec;
+    register WORD *sum,*sumi;
+    register WORD *ai;
+    long veclength;
+  
+    ai=a->words;
+    veclength=0;
+    while (ai->wnum) {
+      veclength++;
+      ai++;
+    }
+    veclength++;
+
+    sum=(WORD *)my_malloc(sizeof(WORD)*veclength);
+    sumi=sum;
+    ai=a->words;
+    while (ai->wnum) {
+	(*sumi)=(*ai);
+	sumi->weight*=factor;
+	if(sumi->weight != 0)
+	  sumi++;
+	ai++;
+    }
+    sumi->wnum=0;
+
+    vec=create_svector(sum,a->userdefined,a->factor);
+    free(sum);
+
+    return(vec);
+}
+
+int featvec_eq(SVECTOR *a, SVECTOR *b)
+     /* tests two sparse vectors for equality */
+{
+    register WORD *ai,*bj;
+    ai=a->words;
+    bj=b->words;
+    while (ai->wnum && bj->wnum) {
+      if(ai->wnum > bj->wnum) {
+	if((CFLOAT)(bj->weight) != 0)
+	  return(0);
+	bj++;
+      }
+      else if (ai->wnum < bj->wnum) {
+	if((CFLOAT)(ai->weight) != 0)
+	  return(0);
+	ai++;
+      }
+      else {
+	if((CFLOAT)(ai->weight) != (CFLOAT)(bj->weight)) 
+	  return(0);
+	ai++;
+	bj++;
+      }
+    }
+    return(1);
+}
+
+double model_length_s(MODEL *model, KERNEL_PARM *kernel_parm) 
+     /* compute length of weight vector */
+{
+  register long i,j;
+  register double sum=0,alphai;
+  register DOC *supveci;
+
+  for(i=1;i<model->sv_num;i++) {  
+    alphai=model->alpha[i];
+    supveci=model->supvec[i];
+    for(j=1;j<model->sv_num;j++) {
+      sum+=alphai*model->alpha[j]
+	   *kernel(kernel_parm,supveci,model->supvec[j]);
+    }
+  }
+  return(sqrt(sum));
+}
+
+void clear_vector_n(double *vec, long int n)
+{
+  register long i;
+  for(i=0;i<=n;i++) vec[i]=0;
+}
+
+void add_vector_ns(double *vec_n, SVECTOR *vec_s, double faktor)
+{
+  register WORD *ai;
+  ai=vec_s->words;
+  while (ai->wnum) {
+    vec_n[ai->wnum]+=(faktor*ai->weight);
+    ai++;
+  }
+}
+
+double sprod_ns(double *vec_n, SVECTOR *vec_s)
+{
+  register double sum=0;
+  register WORD *ai;
+  ai=vec_s->words;
+  while (ai->wnum) {
+    sum+=(vec_n[ai->wnum]*ai->weight);
+    ai++;
+  }
+  return(sum);
+}
+
+void add_weight_vector_to_linear_model(MODEL *model)
+     /* compute weight vector in linear case and add to model */
+{
+  long i;
+  SVECTOR *f;
+
+  model->lin_weights=(double *)my_malloc(sizeof(double)*(model->totwords+1));
+  clear_vector_n(model->lin_weights,model->totwords);
+  for(i=1;i<model->sv_num;i++) {
+    for(f=(model->supvec[i])->fvec;f;f=f->next)  
+      add_vector_ns(model->lin_weights,f,f->factor*model->alpha[i]);
+  }
+}
+
+
+DOC *create_example(long docnum, long queryid, long slackid, 
+		    double costfactor, SVECTOR *fvec)
+{
+  DOC *example;
+  example = (DOC *)my_malloc(sizeof(DOC));
+  example->docnum=docnum;
+  example->queryid=queryid;
+  example->slackid=slackid;
+  example->costfactor=costfactor;
+  example->fvec=fvec;
+  return(example);
+}
+
+void free_example(DOC *example, long deep)
+{
+  if(example) {
+    if(deep) {
+      if(example->fvec)
+	free_svector(example->fvec);
+    }
+    free(example);
+  }
+}
+
+void write_model(char *modelfile, MODEL *model)
+{
+  FILE *modelfl;
+  long j,i,sv_num;
+  SVECTOR *v;
+
+  if(verbosity>=1) {
+    printf("Writing model file..."); fflush(stdout);
+  }
+  if ((modelfl = fopen (modelfile, "w")) == NULL)
+  { perror (modelfile); exit (1); }
+  fprintf(modelfl,"SVM-light Version %s\n",VERSION);
+  fprintf(modelfl,"%ld # kernel type\n",
+	  model->kernel_parm.kernel_type);
+  fprintf(modelfl,"%ld # kernel parameter -d \n",
+	  model->kernel_parm.poly_degree);
+  fprintf(modelfl,"%.8g # kernel parameter -g \n",
+	  model->kernel_parm.rbf_gamma);
+  fprintf(modelfl,"%.8g # kernel parameter -s \n",
+	  model->kernel_parm.coef_lin);
+  fprintf(modelfl,"%.8g # kernel parameter -r \n",
+	  model->kernel_parm.coef_const);
+  fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom);
+  fprintf(modelfl,"%ld # highest feature index \n",model->totwords);
+  fprintf(modelfl,"%ld # number of training documents \n",model->totdoc);
+ 
+  sv_num=1;
+  for(i=1;i<model->sv_num;i++) {
+    for(v=model->supvec[i]->fvec;v;v=v->next) 
+      sv_num++;
+  }
+  fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num);
+  fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);
+
+  for(i=1;i<model->sv_num;i++) {
+    for(v=model->supvec[i]->fvec;v;v=v->next) {
+      fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor);
+      for (j=0; (v->words[j]).wnum; j++) {
+	fprintf(modelfl,"%ld:%.8g ",
+		(long)(v->words[j]).wnum,
+		(double)(v->words[j]).weight);
+      }
+      fprintf(modelfl,"#%s\n",v->userdefined);
+    /* NOTE: this could be made more efficient by summing the
+       alpha's of identical vectors before writing them to the
+       file. */
+    }
+  }
+  fclose(modelfl);
+  if(verbosity>=1) {
+    printf("done\n");
+  }
+}
+
+
+MODEL *read_model(char *modelfile)
+{
+  FILE *modelfl;
+  long i,queryid,slackid;
+  double costfactor;
+  long max_sv,max_words,ll,wpos;
+  char *line,*comment;
+  WORD *words;
+  char version_buffer[100];
+  MODEL *model;
+
+  if(verbosity>=1) {
+    printf("Reading model..."); fflush(stdout);
+  }
+
+  nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
+  max_words+=2;
+  ll+=2;
+
+  words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
+  line = (char *)my_malloc(sizeof(char)*ll);
+  model = (MODEL *)my_malloc(sizeof(MODEL));
+
+  if ((modelfl = fopen (modelfile, "r")) == NULL)
+  { perror (modelfile); exit (1); }
+
+  fscanf(modelfl,"SVM-light Version %s\n",version_buffer);
+  if(strcmp(version_buffer,VERSION)) {
+    perror ("Version of model-file does not match version of svm_classify!"); 
+    exit (1); 
+  }
+  fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type);  
+  fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree);
+  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma);
+  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin);
+  fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const);
+  fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom);
+
+  fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords);
+  fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc);
+  fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num);
+  fscanf(modelfl,"%lf%*[^\n]\n", &model->b);
+
+  model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
+  model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
+  model->index=NULL;
+  model->lin_weights=NULL;
+
+  for(i=1;i<model->sv_num;i++) {
+    fgets(line,(int)ll,modelfl);
+    if(!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
+		       &costfactor,&wpos,max_words,&comment)) {
+      printf("\nParsing error while reading model file in SV %ld!\n%s",
+	     i,line);
+      exit(1);
+    }
+    model->supvec[i] = create_example(-1,
+				      0,0,
+				      0.0,
+				      create_svector(words,comment,1.0));
+  }
+  fclose(modelfl);
+  free(line);
+  free(words);
+  if(verbosity>=1) {
+    fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
+  }
+  return(model);
+}
+
+MODEL *copy_model(MODEL *model)
+{
+  MODEL *newmodel;
+  long  i;
+
+  newmodel=(MODEL *)my_malloc(sizeof(MODEL));
+  (*newmodel)=(*model);
+  newmodel->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
+  newmodel->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
+  newmodel->index = NULL; /* index is not copied */
+  newmodel->supvec[0] = NULL;
+  newmodel->alpha[0] = 0;
+  for(i=1;i<model->sv_num;i++) {
+    newmodel->alpha[i]=model->alpha[i];
+    newmodel->supvec[i]=create_example(model->supvec[i]->docnum,
+				       model->supvec[i]->queryid,0,
+				       model->supvec[i]->costfactor,
+				       copy_svector(model->supvec[i]->fvec));
+  }
+  if(model->lin_weights) {
+    newmodel->lin_weights = (double *)my_malloc(sizeof(double)*(model->totwords+1));
+    for(i=0;i<model->totwords+1;i++) 
+      newmodel->lin_weights[i]=model->lin_weights[i];
+  }
+  return(newmodel);
+}
+
+void free_model(MODEL *model, int deep)
+{
+  long i;
+
+  if(model->supvec) {
+    if(deep) {
+      for(i=1;i<model->sv_num;i++) {
+	free_example(model->supvec[i],1);
+      }
+    }
+    free(model->supvec);
+  }
+  if(model->alpha) free(model->alpha);
+  if(model->index) free(model->index);
+  if(model->lin_weights) free(model->lin_weights);
+  free(model);
+}
+
+
+void read_documents(char *docfile, DOC ***docs, double **label, 
+		    long int *totwords, long int *totdoc)
+{
+  char *line,*comment;
+  WORD *words;
+  long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
+  long max_words_doc, ll;
+  double doc_label,costfactor;
+  FILE *docfl;
+
+  if(verbosity>=1) {
+    printf("Scanning examples..."); fflush(stdout);
+  }
+  nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
+  max_words_doc+=2;
+  ll+=2;
+  max_docs+=2;
+  if(verbosity>=1) {
+    printf("done\n"); fflush(stdout);
+  }
+
+  (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs);    /* feature vectors */
+  (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
+  line = (char *)my_malloc(sizeof(char)*ll);
+
+  if ((docfl = fopen (docfile, "r")) == NULL)
+  { perror (docfile); exit (1); }
+
+  words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
+  if(verbosity>=1) {
+    printf("Reading examples into memory..."); fflush(stdout);
+  }
+  dnum=0;
+  (*totwords)=0;
+  while((!feof(docfl)) && fgets(line,(int)ll,docfl)) {
+    if(line[0] == '#') continue;  /* line contains comments */
+    if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
+		       &wpos,max_words_doc,&comment)) {
+      printf("\nParsing error in line %ld!\n%s",dnum,line);
+      exit(1);
+    }
+    (*label)[dnum]=doc_label;
+    /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
+    if(doc_label > 0) dpos++;
+    if (doc_label < 0) dneg++;
+    if (doc_label == 0) dunlab++;
+    if((wpos>1) && ((words[wpos-2]).wnum>(*totwords))) 
+      (*totwords)=(words[wpos-2]).wnum;
+    if((*totwords) > MAXFEATNUM) {
+      printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
+      printf("LINE: %s\n",line);
+      exit(1);
+    }
+    (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
+				   create_svector(words,comment,1.0));
+    /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq);  */
+    dnum++;  
+    if(verbosity>=1) {
+      if((dnum % 100) == 0) {
+	printf("%ld..",dnum); fflush(stdout);
+      }
+    }
+  } 
+
+  fclose(docfl);
+  free(line);
+  free(words);
+  if(verbosity>=1) {
+    fprintf(stdout, "OK. (%ld examples read)\n", dnum);
+  }
+  (*totdoc)=dnum;
+}
+
+int parse_document(char *line, WORD *words, double *label,
+		   long *queryid, long *slackid, double *costfactor,
+		   long int *numwords, long int max_words_doc,
+		   char **comment)
+{
+  register long wpos,pos;
+  long wnum;
+  double weight;
+  int numread;
+  char featurepair[1000],junk[1000];
+
+  (*queryid)=0;
+  (*slackid)=0;
+  (*costfactor)=1;
+
+  pos=0;
+  (*comment)=NULL;
+  while(line[pos] ) {      /* cut off comments */
+    if((line[pos] == '#') && (!(*comment))) {
+      line[pos]=0;
+      (*comment)=&(line[pos+1]);
+    }
+    if(line[pos] == '\n') { /* strip the CR */
+      line[pos]=0;
+    }
+    pos++;
+  }
+  if(!(*comment)) (*comment)=&(line[pos]);
+  /* printf("Comment: '%s'\n",(*comment)); */
+
+  wpos=0;
+  /* check, that line starts with target value or zero, but not with
+     feature pair */
+  if(sscanf(line,"%s",featurepair) == EOF) return(0);
+  pos=0;
+  while((featurepair[pos] != ':') && featurepair[pos]) pos++;
+  if(featurepair[pos] == ':') {
+	perror ("Line must start with label or 0!!!\n"); 
+	printf("LINE: %s\n",line);
+	exit (1); 
+  }
+  /* read the target value */
+  if(sscanf(line,"%lf",label) == EOF) return(0);
+  pos=0;
+  while(space_or_null((int)line[pos])) pos++;
+  while((!space_or_null((int)line[pos])) && line[pos]) pos++;
+  while(((numread=sscanf(line+pos,"%s",featurepair)) != EOF) && 
+	(numread > 0) && 
+	(wpos<max_words_doc)) {
+    /* printf("%s\n",featurepair); */
+    while(space_or_null((int)line[pos])) pos++;
+    while((!space_or_null((int)line[pos])) && line[pos]) pos++;
+    if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
+      /* it is the query id */
+      (*queryid)=(long)wnum;
+    }
+    else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
+      /* it is the slack id */
+      if(wnum > 0) 
+	(*slackid)=(long)wnum;
+      else {
+	perror ("Slack-id must be greater or equal to 1!!!\n"); 
+	printf("LINE: %s\n",line);
+	exit (1); 
+      }
+    }
+    else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
+      /* it is the example-dependent cost factor */
+      (*costfactor)=(double)weight;
+    }
+    else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
+      /* it is a regular feature */
+      if(wnum<=0) { 
+	perror ("Feature numbers must be larger or equal to 1!!!\n"); 
+	printf("LINE: %s\n",line);
+	exit (1); 
+      }
+      if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) { 
+	perror ("Features must be in increasing order!!!\n"); 
+	printf("LINE: %s\n",line);
+	exit (1); 
+      }
+      (words[wpos]).wnum=wnum;
+      (words[wpos]).weight=(FVAL)weight; 
+      wpos++;
+    }
+    else {
+      perror ("Cannot parse feature/value pair!!!\n"); 
+      printf("'%s' in LINE: %s\n",featurepair,line);
+      exit (1); 
+    }
+  }
+  (words[wpos]).wnum=0;
+  (*numwords)=wpos+1;
+  return(1);
+}
+
+double *read_alphas(char *alphafile,long totdoc)
+     /* reads the alpha vector from a file as written by the
+        write_alphas function */
+{
+  FILE *fl;
+  double *alpha;
+  long dnum;
+
+  if ((fl = fopen (alphafile, "r")) == NULL)
+  { perror (alphafile); exit (1); }
+
+  alpha = (double *)my_malloc(sizeof(double)*totdoc);
+  if(verbosity>=1) {
+    printf("Reading alphas..."); fflush(stdout);
+  }
+  dnum=0;
+  while((!feof(fl)) && fscanf(fl,"%lf\n",&alpha[dnum]) && (dnum<totdoc)) {
+    dnum++;
+  }
+  if(dnum != totdoc)
+  { perror ("\nNot enough values in alpha file!"); exit (1); }
+  fclose(fl);
+
+  if(verbosity>=1) {
+    printf("done\n"); fflush(stdout);
+  }
+
+  return(alpha);
+}
+
+void nol_ll(char *file, long int *nol, long int *wol, long int *ll) 
+     /* Grep through file and count number of lines, maximum number of
+        spaces per line, and longest line. */
+{
+  FILE *fl;
+  int ic;
+  char c;
+  long current_length,current_wol;
+
+  if ((fl = fopen (file, "r")) == NULL)
+  { perror (file); exit (1); }
+  current_length=0;
+  current_wol=0;
+  (*ll)=0;
+  (*nol)=1;
+  (*wol)=0;
+  while((ic=getc(fl)) != EOF) {
+    c=(char)ic;
+    current_length++;
+    if(space_or_null((int)c)) {
+      current_wol++;
+    }
+    if(c == '\n') {
+      (*nol)++;
+      if(current_length>(*ll)) {
+	(*ll)=current_length;
+      }
+      if(current_wol>(*wol)) {
+	(*wol)=current_wol;
+      }
+      current_length=0;
+      current_wol=0;
+    }
+  }
+  fclose(fl);
+}
+
+long minl(long int a, long int b)
+{
+  if(a<b)
+    return(a);
+  else
+    return(b);
+}
+
+long maxl(long int a, long int b)
+{
+  if(a>b)
+    return(a);
+  else
+    return(b);
+}
+
+long get_runtime(void)
+{
+  clock_t start;
+  start = clock();
+  return((long)((double)start*100.0/(double)CLOCKS_PER_SEC));
+}
+
+
+# ifdef _MSC_VER
+
+int isnan(double a)
+{
+  return(_isnan(a));
+}
+
+# endif 
+
+int space_or_null(int c) {
+  if (c==0)
+    return 1;
+  return isspace(c);
+}
+
+void *my_malloc(size_t size)
+{
+  void *ptr;
+  ptr=(void *)malloc(size);
+  if(!ptr) { 
+    perror ("Out of memory!\n"); 
+    exit (1); 
+  }
+  return(ptr);
+}
+
+void copyright_notice(void)
+{
+  printf("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n");
+  printf("This software is available for non-commercial use only. It must not\n");
+  printf("be modified and distributed without prior permission of the author.\n");
+  printf("The author is not responsible for implications from the use of this\n");
+  printf("software.\n\n");
+}