changeset 280:3be15407e814

Merge sampling branch (-r361:405, though I hope that the branch is now finished) onto trunk. API developers take note. Things still to clear up: * whether the threshold distance it currently reports bears any relation to reality; * if not, how to bring it a bit more into alignment; * minor code cleanup issues in sample.cpp; * incorporating --absolute-threshold handling into sampling; * writing suitable test cases.
author mas01cr
date Wed, 02 Jul 2008 14:07:10 +0000
parents d209e8470a60
children f042cbf427b3
files Makefile audioDB.cpp audioDB.h gengetopt.in sample.cpp xthresh.c
diffstat 6 files changed, 253 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Mon Jun 30 17:38:59 2008 +0000
+++ b/Makefile	Wed Jul 02 14:07:10 2008 +0000
@@ -37,10 +37,10 @@
 %.o: %.cpp audioDB.h adb.nsmap cmdline.h reporter.h
 	g++ -c ${CFLAGS} ${GSOAP_INCLUDE} -Wall -Werror $<
 
-OBJS=insert.o create.o common.o dump.o query.o soap.o audioDB.o
+OBJS=insert.o create.o common.o dump.o query.o soap.o sample.o audioDB.o
 
 ${EXECUTABLE}: ${OBJS} soapServer.cpp soapClient.cpp soapC.cpp cmdline.c
-	g++ -o ${EXECUTABLE} ${CFLAGS} ${GSOAP_INCLUDE} $^ ${GSOAP_CPP}
+	g++ -o ${EXECUTABLE} ${CFLAGS} -lgsl -lgslcblas ${GSOAP_INCLUDE} $^ ${GSOAP_CPP}
 
 clean:
 	-rm cmdline.c cmdline.h
@@ -52,3 +52,6 @@
 
 test: ${EXECUTABLE}
 	-sh -c "cd tests && sh ./run-tests.sh"
+
+xthresh: xthresh.c
+	gcc -o $@ -lgsl -lgslcblas $<
--- a/audioDB.cpp	Mon Jun 30 17:38:59 2008 +0000
+++ b/audioDB.cpp	Wed Jul 02 14:07:10 2008 +0000
@@ -37,6 +37,9 @@
       ws_status(dbName,(char*)hostport);
     else
       status(dbName);
+
+  else if(O2_ACTION(COM_SAMPLE))
+    sample(dbName);
   
   else if(O2_ACTION(COM_L2NORM))
     l2norm(dbName);
@@ -210,6 +213,17 @@
     return 0;
   }
 
+  if(args_info.SAMPLE_given) {
+    command = COM_SAMPLE;
+    dbName = args_info.database_arg;
+    sequenceLength = args_info.sequencelength_arg;
+    if(sequenceLength < 1 || sequenceLength > 1000) {
+      error("seqlen out of range: 1 <= seqlen <= 1000");
+    }
+    nsamples = args_info.nsamples_arg;
+    return 0;
+  }
+
   if(args_info.DUMP_given){
     command=COM_DUMP;
     dbName=args_info.database_arg;
--- a/audioDB.h	Mon Jun 30 17:38:59 2008 +0000
+++ b/audioDB.h	Wed Jul 02 14:07:10 2008 +0000
@@ -12,6 +12,7 @@
 #include <assert.h>
 #include <float.h>
 #include <signal.h>
+#include <gsl/gsl_rng.h>
 
 // includes for web services
 #include "soapH.h"
@@ -29,6 +30,7 @@
 #define COM_POWER "--POWER"
 #define COM_DUMP "--DUMP"
 #define COM_SERVER "--SERVER"
+#define COM_SAMPLE "--SAMPLE"
 
 // parameters
 #define COM_CLIENT "--client"
@@ -176,6 +178,8 @@
   // Flags and parameters
   unsigned verbosity;   // how much do we want to know?
 
+  unsigned nsamples;
+
   //off_t size; // given size (for creation)
   unsigned datasize; // size in MB
   unsigned ntracks;
@@ -247,6 +251,8 @@
   void batchinsert(const char* dbName, const char* inFile);
   void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0);
   void status(const char* dbName, adb__statusResponse *adbStatusResponse=0);
+  unsigned random_track(unsigned *propTable, unsigned total, gsl_rng *);
+  void sample(const char *dbName);
   void ws_status(const char*dbName, char* hostport);
   void ws_query(const char*dbName, const char *trackKey, const char* hostport);
   void l2norm(const char* dbName);
@@ -291,6 +297,7 @@
   powerTableLength(0), \
   l2normTableLength(0), \
   verbosity(1), \
+  nsamples(2000), \
   datasize(O2_DEFAULT_DATASIZE), \
   ntracks(O2_DEFAULT_NTRACKS), \
   datadim(O2_DEFAULT_DATADIM), \
--- a/gengetopt.in	Mon Jun 30 17:38:59 2008 +0000
+++ b/gengetopt.in	Wed Jul 02 14:07:10 2008 +0000
@@ -17,15 +17,21 @@
 option "ntracks" - "capacity of database for tracks" int dependon="NEW" default="20000" optional
 option "datadim" - "dimensionality of stored data" int dependon="NEW" default="9" optional
 
-section "Database Maintenance" sectiondesc="Querying, tweaking and dumping databases."
+section "Database Maintenance" sectiondesc="Tweaking and dumping databases."
 
-option "STATUS" S "output database information to stdout." dependon="database" optional
 option "DUMP"   D "output all entries: index key size." dependon="database" optional
 option "output" - "output directory" string dependon="DUMP" default="audioDB.dump" optional
 option "L2NORM" L "unit norm vectors and norm all future inserts." dependon="database" optional
 option "POWER"  P "turn on power flag for database." dependon="database" optional
 
+section "Database Information" sectiondesc="Information about databases."
+
+option "STATUS" S "output database information to stdout." dependon="database" optional
+option "SAMPLE" X "sample statistics for database." dependon="database" optional
+option "nsamples" - "number of pairwise samples to take." dependon="SAMPLE" int typestr="number" default="2000" optional
+
 section "Database Insertion" sectiondesc="The following commands insert feature files, with optional keys and timestamps.\n"
+
 option "INSERT"      I "add feature vectors to an existing database." dependon="features" optional
 option "UPDATE"      U "replace inserted vectors associated with key with new input vectors." dependon="features" dependon="key" dependon="database" optional hidden
 option "features" f "binary series of vectors file {int sz:ieee double[][sz]:eof}." string typestr="filename" dependon="database" optional
@@ -39,6 +45,8 @@
 option "powerList"   W "text file containing list of binary power feature file." string typestr="filename" dependon="database" optional
 option "keyList"     K "text file containing list of unique identifiers associated with --features." string typestr="filename" optional
 
+option "sequencelength" l "length of sequences for sequence search." int typestr="length" default="16" optional
+
 
 section "Database Search" sectiondesc="Thse commands control the retrieval behaviour.\n"
 
@@ -50,7 +58,6 @@
 option "expandfactor" x "time compress/expand factor of result length to query length [1.0 .. 100.0]." double default="1.1" optional hidden
 option "rotate"       o "rotate query vectors for rotationally invariant search." flag off optional hidden
 option "resultlength" r "maximum length of the result list." int typestr="length" default="10" optional
-option "sequencelength" l "length of sequences for sequence search." int typestr="length" default="16" dependon="QUERY" optional
 option "sequencehop"  h "hop size of sequence window for sequence search." int typestr="hop" default="1" dependon="QUERY" optional
 option "absolute-threshold" - "absolute power threshold for consideration of query or target sequence (in Bels)" double dependon="QUERY" optional
 option "relative-threshold" - "relative power threshold between query and target sequence (in Bels)" double dependon="QUERY" optional
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sample.cpp	Wed Jul 02 14:07:10 2008 +0000
@@ -0,0 +1,190 @@
+#include "audioDB.h"
+
+#include <gsl/gsl_sf.h>
+#include <gsl/gsl_rng.h>
+
+static
+double yfun(double d) {
+  return gsl_sf_log(d) - gsl_sf_psi(d);
+}
+
+static
+double yinv(double y) {
+  double a = 1.0e-5;
+  double b = 1000.0;
+
+  double ay = yfun(a);
+  double by = yfun(b);
+
+  double c, cy;
+
+  /* FIXME: simple binary search; there's probably some clever solver
+     in gsl somewhere which is less sucky. */
+  while ((b - a) > 1.0e-5) {
+    c = (a + b) / 2;
+    cy = yfun(c);
+    if (cy > y) {
+      a = c;
+      ay = cy;
+    } else {
+      b = c;
+      by = cy;
+    }
+  }
+
+  return c;
+}
+
+unsigned audioDB::random_track(unsigned *propTable, unsigned total, gsl_rng *rng) {
+  /* FIXME: make this O(1) by using the alias-rejection method, or
+     some other sensible method of sampling from a discrete
+     distribution. */
+  double thing = gsl_rng_uniform(rng);
+  unsigned sofar = 0;
+  for (unsigned int i = 0; i < dbH->numFiles; i++) {
+    sofar += propTable[i];
+    if (thing < ((double) sofar / (double) total)) {
+      return i;
+    }
+  }
+  error("fell through in random_track()");
+
+  /* FIXME: decorate error's declaration so that this isn't necessary */
+  return 0;
+}
+
+void audioDB::sample(const char *dbName) {
+  initTables(dbName, 0);
+
+  gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937);
+
+  /* FIXME: in Real Life we'll want to initialize the RNG using
+     /dev/random or the current time or something, like this:
+
+     unsigned int seed;
+     int fd = open("/dev/urandom", O_RDONLY);
+     read(fd, &seed, 4);
+     
+     gsl_rng_set(rng, seed);
+  */
+
+  // build track offset table (FIXME: cut'n'pasted from query.cpp)
+  off_t *trackOffsetTable = new off_t[dbH->numFiles];
+  unsigned cumTrack=0;
+  for(unsigned int k = 0; k < dbH->numFiles; k++){
+    trackOffsetTable[k] = cumTrack;
+    cumTrack += trackTable[k] * dbH->dim;
+  }
+
+  unsigned *propTable = new unsigned[dbH->numFiles];
+  unsigned total = 0;
+  unsigned count = 0;
+
+  for (unsigned int i = 0; i < dbH->numFiles; i++) {
+    /* what kind of a stupid language doesn't have binary max(), let
+       alone nary? */
+    unsigned int prop = trackTable[i] - sequenceLength + 1;
+    prop = prop > 0 ? prop : 0;
+    if (prop > 0) 
+      count++;
+    propTable[i] = prop;
+    total += prop;
+  }
+
+  if (total == 0) {
+    error("no sequences of this sequence length in the database", dbName);
+  }
+
+  unsigned int vlen = dbH->dim * sequenceLength;
+  double *v1 = new double[vlen];
+  double *v2 = new double[vlen];
+  double v1norm, v2norm, v1v2;
+
+  double sumdist = 0;
+  double sumlogdist = 0;
+
+  for (unsigned int i = 0; i < nsamples;) {
+    unsigned track1 = random_track(propTable, total, rng);
+    unsigned track2 = random_track(propTable, total, rng);
+
+    if(track1 == track2)
+      continue;
+
+    unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
+    unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
+
+    VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
+
+    /* FIXME: this seeking, reading and distance calculation should
+       share more code with the query loop */
+    lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
+    read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
+
+    lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
+    read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
+
+    v1norm = 0;
+    v2norm = 0;
+    v1v2 = 0;
+
+    for (unsigned int j = 0; j < vlen; j++) {
+      v1norm += v1[j]*v1[j];
+      v2norm += v2[j]*v2[j];
+      v1v2 += v1[j]*v2[j];
+    }
+
+    /* FIXME: we must deal with infinities better than this; there
+       could be all sorts of NaNs from arbitrary features.  Best
+       include power thresholds or something... */
+    if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
+
+      VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
+      /* assume normalizedDistance == true for now */
+      /* FIXME: not convinced that the statistics we calculated in
+	 TASLP paper are technically valid for normalizedDistance */
+
+      double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
+      // double dist = v1norm + v2norm - 2*v1v2;
+      
+      VERB_LOG(1, "%f %f\n", dist, log(dist));
+      sumdist += dist;
+      sumlogdist += log(dist);
+      i++;
+    } else {
+      VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
+    }
+  }
+
+  /* FIXME: the mean isn't really what we should be reporting here */
+  unsigned meanN = total / count;
+
+  double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
+  double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
+
+  std::cout << "Summary statistics" << std::endl;
+  std::cout << "number of samples: " << nsamples << std::endl;
+  std::cout << "sum of distances (S): " << sumdist << std::endl;
+  std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
+
+  /* FIXME: we'll also want some more summary statistics based on
+     propTable, for the minimum-of-X estimate */
+  std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
+  std::cout << std::endl;
+  std::cout << "Estimated parameters" << std::endl;
+  std::cout << "sigma^2: " << sigma2 << "; ";
+  std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
+  std::cout << "d: " << d << std::endl;
+
+  double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
+  double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
+    - (2 / d) * gsl_sf_log(meanN)
+    - gsl_sf_log(d/2)
+    - (2 / d) * gsl_sf_log(2 / d)
+    + (2 / d) * gsl_sf_lngamma(d / 2);
+
+  std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
+
+  delete[] propTable;
+  delete[] v1;
+  delete[] v2;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xthresh.c	Wed Jul 02 14:07:10 2008 +0000
@@ -0,0 +1,27 @@
+#include <gsl/gsl_sf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+int main(int argc, char *argv[]) {
+  if(argc != 4) {
+    fprintf(stderr, "Wrong number of arguments: %d\n", argc);
+    exit(1);
+  }
+
+  long int meanN = strtol(argv[1], NULL, 10);
+
+  double d = strtod(argv[2], NULL);
+  double sigma2 = strtod(argv[3], NULL);
+
+  double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
+  double logxthresh = gsl_sf_log(sigma2) + logw
+    - (2 / d) * gsl_sf_log(meanN)
+    - gsl_sf_log(d/2)
+    - (2 / d) * gsl_sf_log(2 / d)
+    + (2 / d) * gsl_sf_lngamma(d / 2);
+
+  printf("w: %f\n", exp(logw));
+  printf("x_thresh: %f\n", exp(logxthresh));
+  exit(0);
+}