changeset 693:b1723ae7675e

begin work on sampling API This is motivated by the need to be able to sample with arbitrary feature data (e.g. from a feature file) against a database, for the JNMR "collections" paper revisions or possible ISMIR paper revisions. That bit doesn't work yet, but the C-ified version of the current functionality (sample db x db and sample key x db) works to the level of anecdotal tests. The general approach is to mirror the _query_spec() API, where a whole heap of knobs and twiddles are available to the user. Unlike in the _query_spec() API, not quite all of the knobs make sense (and even fewer are actually implemented), but the basic idea is the same. I pity the poor chump who will have to document all this.
author mas01cr
date Thu, 22 Apr 2010 21:03:47 +0000
parents 02756c5ca15a
children 55aa1919d735
files audioDB-internals.h audioDB.cpp audioDB.h audioDB_API.h close.cpp common.cpp open.cpp sample.cpp
diffstat 8 files changed, 454 insertions(+), 190 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB-internals.h	Thu Apr 22 15:43:26 2010 +0000
+++ b/audioDB-internals.h	Thu Apr 22 21:03:47 2010 +0000
@@ -6,6 +6,7 @@
 #endif
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/time.h>
 
 #include <stdio.h>
 #include <errno.h>
@@ -21,6 +22,8 @@
 #include <windows.h>
 #endif
 
+#include <gsl/gsl_rng.h>
+
 #include <algorithm>
 #include <iostream>
 #include <map>
@@ -119,6 +122,7 @@
   std::vector<uint32_t> *track_lengths;
   std::vector<off_t> *track_offsets;
   LSH *cached_lsh;
+  gsl_rng *rng;
 };
 
 typedef struct {
@@ -374,6 +378,7 @@
 void audiodb_index_delete_shingles(vector<vector<float> > *);
 void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t);
 int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float);
+int audiodb_sample_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
 
 #define ADB_MAXSTR (512U)
 #define ADB_FILETABLE_ENTRY_SIZE (256U)
--- a/audioDB.cpp	Thu Apr 22 15:43:26 2010 +0000
+++ b/audioDB.cpp	Thu Apr 22 21:03:47 2010 +0000
@@ -147,8 +147,6 @@
     munmap(powerFileNameTable, fileTableLength);
   if(reporter)
     delete reporter;
-  if(rng)
-    gsl_rng_free(rng);
   if(infid>0) {
     close(infid);
     infid = 0;
@@ -846,7 +844,8 @@
     int fd;
     struct stat st;
 
-    /* FIXME: around here there are all sorts of hideous leaks. */
+    /* FIXME: around here error conditions will cause all sorts of
+       hideous leaks. */
     fd = open(inFile, O_RDONLY);
     if(fd < 0) {
       error("failed to open feature file", inFile);
@@ -1010,6 +1009,150 @@
   audiodb_liszt_free_results(adb, results);  
 }
 
+#include <gsl/gsl_sf.h>
+
+static
+double yfun(double d) {
+  return gsl_sf_log(d) - gsl_sf_psi(d);
+}
+
+static
+double yinv(double y) {
+  double a = 1.0e-5;
+  double b = 1000.0;
+
+  double ay = yfun(a);
+  double by = yfun(b);
+
+  double c = 0;
+  double cy;
+
+  /* FIXME: simple binary search; there's probably some clever solver
+     in gsl somewhere which is less sucky. */
+  while ((b - a) > 1.0e-5) {
+    c = (a + b) / 2;
+    cy = yfun(c);
+    if (cy > y) {
+      a = c;
+      ay = cy;
+    } else {
+      b = c;
+      by = cy;
+    }
+  }
+
+  return c;
+}
+
+void audioDB::sample(const char *dbName) {
+  if(!adb) {
+    if(!(adb = audiodb_open(dbName, O_RDONLY))) {
+      error("failed to open database", dbName);
+    }
+  }
+
+  adb_status_t status;
+  if(audiodb_status(adb, &status)) {
+    error("error getting status");
+  }
+
+  double sumdist = 0;
+  double sumlogdist = 0;
+
+  adb_query_results_t *results;
+  adb_query_spec_t spec = {{0},{0},{0}};
+  adb_datum_t datum = {0};
+
+  spec.refine.qhopsize = sequenceHop;
+  spec.refine.ihopsize = sequenceHop;
+  if(sequenceHop != 1) {
+    spec.refine.flags |= ADB_REFINE_HOP_SIZE;
+  }
+
+  if(query_from_key) {
+    datum.key = key;
+    spec.qid.datum = &datum;
+    spec.qid.flags |= ADB_QID_FLAG_EXHAUSTIVE;
+    spec.refine.flags |= ADB_REFINE_EXCLUDE_KEYLIST;
+    spec.refine.exclude.nkeys = 1;
+    spec.refine.exclude.keys = &key;
+  } else if(inFile) {
+    error("sample from feature file not supported (yet)");
+  } else {
+    spec.qid.datum = NULL; /* full db sample */
+  }
+  spec.qid.sequence_length = sequenceLength;
+  spec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE;
+  spec.qid.sequence_start = queryPoint;
+
+  spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
+  spec.params.accumulation = ADB_ACCUMULATION_DB;
+  spec.params.npoints = nsamples;
+
+  if(!(results = audiodb_sample_spec(adb, &spec))) {
+    error("error in audiodb_sample_spec");
+  }
+
+  if(results->nresults != nsamples) {
+    error("mismatch in sample count");
+  }
+
+  for(uint32_t i = 0; i < nsamples; i++) {
+    double d = results->results[i].dist;
+    sumdist += d;
+    sumlogdist += log(d);
+  }
+
+  audiodb_query_free_results(adb, &spec, results);
+
+  unsigned total = 0;
+  unsigned count = 0;
+  adb_liszt_results_t *liszt;
+  if(!(liszt = audiodb_liszt(adb))) {
+    error("liszt failed");
+  }
+  for(uint32_t i = 0; i < liszt->nresults; i++) {
+    unsigned int prop = (liszt->entries[i].nvectors - sequenceLength)/sequenceHop + 1;
+    prop = prop > 0 ? prop : 0;
+    if (prop > 0) {
+      count++;
+    }
+    total += prop;
+  }
+
+  /* FIXME: the mean isn't really what we should be using here; it's
+     more a question of "how many independent sequences of length
+     sequenceLength are there in the database? */
+  unsigned meanN = total / count;
+
+  double sigma2 = sumdist / (sequenceLength * status.dim * nsamples);
+  double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
+
+  std::cout << "Summary statistics" << std::endl;
+  std::cout << "number of samples: " << nsamples << std::endl;
+  std::cout << "sum of distances (S): " << sumdist << std::endl;
+  std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
+
+  /* FIXME: we'll also want some more summary statistics based on
+     propTable, for the minimum-of-X estimate */
+  std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
+  std::cout << std::endl;
+  std::cout << "Estimated parameters" << std::endl;
+  std::cout << "sigma^2: " << sigma2 << "; ";
+  std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
+  std::cout << "d: " << d << std::endl;
+
+  double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
+  double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
+    - (2 / d) * gsl_sf_log(meanN)
+    - gsl_sf_log(d/2)
+    - (2 / d) * gsl_sf_log(2 / d)
+    + (2 / d) * gsl_sf_lngamma(d / 2);
+
+  std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
+}
+
+
 // This entry point is visited once per instance
 // so it is a good place to set any global state variables
 int main(const int argc, const char* argv[]){
--- a/audioDB.h	Thu Apr 22 15:43:26 2010 +0000
+++ b/audioDB.h	Thu Apr 22 21:03:47 2010 +0000
@@ -21,7 +21,6 @@
 #include <assert.h>
 #include <float.h>
 #include <signal.h>
-#include <gsl/gsl_rng.h>
 
 // includes for LSH indexing
 extern "C" {
@@ -204,8 +203,6 @@
   struct adb_header *dbH;
   struct adb *adb;
 
-  gsl_rng *rng;
-  
   char* fileTable;
   unsigned* trackTable;
   double* l2normTable;
@@ -264,7 +261,6 @@
   void error(const char* a, const char* b = "", const char *sysFunc = 0) __attribute__ ((noreturn));
 
   void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata);
-  void initRNG();
   void initDBHeader(const char *dbName);
   void initInputFile(const char *inFile);
   void initTables(const char* dbName, const char* inFile = 0);
@@ -347,7 +343,6 @@
     infid(0),					\
     dbH(0),					\
     adb(0),                                     \
-    rng(0),                                     \
     fileTable(0),				\
     trackTable(0),				\
     l2normTable(0),				\
--- a/audioDB_API.h	Thu Apr 22 15:43:26 2010 +0000
+++ b/audioDB_API.h	Thu Apr 22 21:03:47 2010 +0000
@@ -167,6 +167,9 @@
 adb_liszt_results_t *audiodb_liszt(adb_t *);
 int audiodb_liszt_free_results(adb_t *, adb_liszt_results_t *);
 
+/* sample */
+adb_query_results_t *audiodb_sample_spec(adb_t *, const adb_query_spec_t *);
+
 /* backwards compatibility */
 int audiodb_insert(adb_t *, adb_insert_t *ins);
 int audiodb_batchinsert(adb_t *, adb_insert_t *ins, unsigned int size);
--- a/close.cpp	Thu Apr 22 15:43:26 2010 +0000
+++ b/close.cpp	Thu Apr 22 21:03:47 2010 +0000
@@ -10,6 +10,7 @@
   delete adb->keymap;
   delete adb->track_lengths;
   delete adb->track_offsets;
+  gsl_rng_free(adb->rng);
   if(adb->cached_lsh) {
     delete adb->cached_lsh;
   }
--- a/common.cpp	Thu Apr 22 15:43:26 2010 +0000
+++ b/common.cpp	Thu Apr 22 21:03:47 2010 +0000
@@ -33,29 +33,6 @@
     }
 }
 
-void audioDB::initRNG() {
-  rng = gsl_rng_alloc(gsl_rng_mt19937);
-  if(!rng) {
-    error("could not allocate Random Number Generator");
-  }
-  /* FIXME: maybe we should use a real source of entropy? */
-  uint32_t seed = 0;
-#ifdef WIN32
-  seed = time(NULL);
-#else
-  struct timeval tv;
-  if(gettimeofday(&tv, NULL) == -1) {
-    error("failed to get time of day");
-  }
-  /* usec field should be less than than 2^20.  We want to mix the
-     usec field, highly-variable, into the high bits of the seconds
-     field, which will be static between closely-spaced runs.  -- CSR,
-     2010-01-05 */
-  seed = tv.tv_sec ^ (tv.tv_usec << 12);
-#endif
-  gsl_rng_set(rng, seed);
-}
-
 void audioDB::initDBHeader(const char* dbName) {
   if(!adb) {
     adb = audiodb_open(dbName, forWrite ? O_RDWR : O_RDONLY);
@@ -140,13 +117,6 @@
 }
 
 void audioDB::initTables(const char* dbName, const char* inFile) {
-  /* FIXME: initRNG() really logically belongs in the audioDB
-     contructor.  However, there are of the order of four constructors
-     at the moment, and more to come from API implementation.  Given
-     that duplication, I think this is the least worst place to put
-     it; the assumption is that nothing which doesn't look at a
-     database will need an RNG.  -- CSR, 2008-07-02 */
-  initRNG();
   initDBHeader(dbName);
   if(inFile)
     initInputFile(inFile);
--- a/open.cpp	Thu Apr 22 15:43:26 2010 +0000
+++ b/open.cpp	Thu Apr 22 21:03:47 2010 +0000
@@ -67,6 +67,31 @@
   return 1;
 }
 
+static int audiodb_init_rng(adb_t *adb) {
+  gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937);
+  if(!rng) {
+    return 1;
+  }
+  /* FIXME: maybe we should use a real source of entropy? */
+  uint32_t seed = 0;
+#ifdef WIN32
+  seed = time(NULL);
+#else
+  struct timeval tv;
+  if(gettimeofday(&tv, NULL) == -1) {
+    return 1;
+  }
+  /* usec field should be less than than 2^20.  We want to mix the
+     usec field, highly-variable, into the high bits of the seconds
+     field, which will be static between closely-spaced runs.  -- CSR,
+     2010-01-05 */
+  seed = tv.tv_sec ^ (tv.tv_usec << 12);
+#endif
+  gsl_rng_set(rng, seed);
+  adb->rng = rng;
+  return 0;
+}
+
 adb_t *audiodb_open(const char *path, int flags) {
   adb_t *adb = 0;
   int fd = -1;
@@ -128,6 +153,9 @@
     goto error;
   }
   adb->cached_lsh = 0;
+  if(audiodb_init_rng(adb)) {
+    goto error;
+  }
   return adb;
 
  error:
@@ -143,6 +171,9 @@
     if(adb->track_lengths) {
       delete adb->track_lengths;
     }
+    if(adb->rng) {
+      gsl_rng_free(adb->rng);
+    }
     free(adb);
   }
   if(fd != -1) {
--- a/sample.cpp	Thu Apr 22 15:43:26 2010 +0000
+++ b/sample.cpp	Thu Apr 22 21:03:47 2010 +0000
@@ -1,198 +1,314 @@
-#include "audioDB.h"
+extern "C" {
+#include "audioDB_API.h"
+}
+#include "audioDB-internals.h"
+#include "accumulators.h"
 
-#include <gsl/gsl_sf.h>
-#include <gsl/gsl_rng.h>
+/* 0. if the datum in the spec is sufficiently NULL, do db x db
+   sampling;
 
-static
-double yfun(double d) {
-  return gsl_sf_log(d) - gsl_sf_psi(d);
-}
-
-static
-double yinv(double y) {
-  double a = 1.0e-5;
-  double b = 1000.0;
-
-  double ay = yfun(a);
-  double by = yfun(b);
-
-  double c = 0;
-  double cy;
-
-  /* FIXME: simple binary search; there's probably some clever solver
-     in gsl somewhere which is less sucky. */
-  while ((b - a) > 1.0e-5) {
-    c = (a + b) / 2;
-    cy = yfun(c);
-    if (cy > y) {
-      a = c;
-      ay = cy;
-    } else {
-      b = c;
-      by = cy;
+   1. if accumulation is DATABASE, do basically the current sampling
+   with track1 being the datum; 
+   
+   2. if accumulation is PER_TRACK, sample n points per db track
+   rather than n points overall.
+   
+   3. if accumulation is ONE_TO_ONE, ponder hard.
+*/
+adb_query_results_t *audiodb_sample_spec(adb_t *adb, const adb_query_spec_t *qspec) {
+  adb_qstate_internal_t qstate = {0};
+  /* FIXME: this paragraph is the same as in audiodb_query_spec() */
+  qstate.allowed_keys = new std::set<std::string>;
+  adb_query_results_t *results;
+  if(qspec->refine.flags & ADB_REFINE_INCLUDE_KEYLIST) {
+    for(unsigned int k = 0; k < qspec->refine.include.nkeys; k++) {
+      qstate.allowed_keys->insert(qspec->refine.include.keys[k]);
+    }
+  } else {
+    for(unsigned int k = 0; k < adb->header->numFiles; k++) {
+      qstate.allowed_keys->insert((*adb->keys)[k]);
+    }
+  }
+  if(qspec->refine.flags & ADB_REFINE_EXCLUDE_KEYLIST) {
+    for(unsigned int k = 0; k < qspec->refine.exclude.nkeys; k++) {
+      qstate.allowed_keys->erase(qspec->refine.exclude.keys[k]);
     }
   }
 
-  return c;
+  if(!(qspec->qid.datum)) {
+    switch(qspec->params.distance) {
+    case ADB_DISTANCE_DOT_PRODUCT:
+      qstate.accumulator = new DBAccumulator<adb_result_dist_gt>(qspec->params.npoints);
+      break;
+    case ADB_DISTANCE_EUCLIDEAN_NORMED:
+    case ADB_DISTANCE_EUCLIDEAN:
+      qstate.accumulator = new DBAccumulator<adb_result_dist_lt>(qspec->params.npoints);
+      break;
+    default:
+      goto error;
+    }
+  } else {
+    /* FIXME: this paragraph is the same as in audiodb_query_spec(),
+     apart from only being taken in one branch */
+    switch(qspec->params.distance) {
+    case ADB_DISTANCE_DOT_PRODUCT:
+      switch(qspec->params.accumulation) {
+      case ADB_ACCUMULATION_DB:
+        qstate.accumulator = new DBAccumulator<adb_result_dist_gt>(qspec->params.npoints);
+        break;
+      case ADB_ACCUMULATION_PER_TRACK:
+        qstate.accumulator = new PerTrackAccumulator<adb_result_dist_gt>(qspec->params.npoints, qspec->params.ntracks);
+        break;
+      case ADB_ACCUMULATION_ONE_TO_ONE:
+        qstate.accumulator = new NearestAccumulator<adb_result_dist_gt>();
+        break;
+      default:
+        goto error;
+      }
+      break;
+    case ADB_DISTANCE_EUCLIDEAN_NORMED:
+    case ADB_DISTANCE_EUCLIDEAN:
+      switch(qspec->params.accumulation) {
+      case ADB_ACCUMULATION_DB:
+        qstate.accumulator = new DBAccumulator<adb_result_dist_lt>(qspec->params.npoints);
+        break;
+      case ADB_ACCUMULATION_PER_TRACK:
+        qstate.accumulator = new PerTrackAccumulator<adb_result_dist_lt>(qspec->params.npoints, qspec->params.ntracks);
+        break;
+      case ADB_ACCUMULATION_ONE_TO_ONE:
+        qstate.accumulator = new NearestAccumulator<adb_result_dist_lt>();
+        break;
+      default:
+        goto error;
+      }
+     break;
+    default:
+      goto error;
+    }
+  }
+
+  if(audiodb_sample_loop(adb, qspec, &qstate)) {
+    goto error;
+  }
+
+  results = qstate.accumulator->get_points();
+
+  delete qstate.accumulator;
+  delete qstate.allowed_keys;
+
+  return results;
+
+ error:
+  if(qstate.accumulator)
+    delete qstate.accumulator;
+  if(qstate.allowed_keys)
+    delete qstate.allowed_keys;
+  return NULL;
 }
 
-unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
+uint32_t audiodb_random_track(adb_t *adb, uint32_t *propTable, unsigned total) {
   /* FIXME: make this O(1) by using the alias-rejection method, or
      some other sensible method of sampling from a discrete
      distribution. */
-  double thing = gsl_rng_uniform(rng);
+  double thing = gsl_rng_uniform(adb->rng);
   unsigned sofar = 0;
-  for (unsigned int i = 0; i < dbH->numFiles; i++) {
+  for (unsigned int i = 0; i < adb->header->numFiles; i++) {
     sofar += propTable[i];
     if (thing < ((double) sofar / (double) total)) {
       return i;
     }
   }
-  error("fell through in random_track()");
-
+  /* can't happen */
+  return (uint32_t) -1;
 }
 
-void audioDB::sample(const char *dbName) {
-  initTables(dbName, 0);
-  if(dbH->flags & O2_FLAG_LARGE_ADB){
-    error("error: sample not yet supported for LARGE_ADB");
-  }
-    
-  off_t *trackOffsetTable = new off_t[dbH->numFiles];
-  unsigned cumTrack=0;
-  for(unsigned int k = 0; k < dbH->numFiles; k++){
-    trackOffsetTable[k] = cumTrack;
-    cumTrack += trackTable[k] * dbH->dim;
+int audiodb_sample_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
+  /* FIXME: all eerily reminiscent of audiodb_query_loop() */
+  double *query, *query_data;
+  adb_qpointers_internal_t qpointers = {0}, dbpointers = {0};
+
+  if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
+    /* FIXME: someday */
+    return 1;
   }
 
-  unsigned *propTable = new unsigned[dbH->numFiles];
-  unsigned total = 0;
-  unsigned count = 0;
-
-  for (unsigned int i = 0; i < dbH->numFiles; i++) {
-    /* what kind of a stupid language doesn't have binary max(), let
-       alone nary? */
-    unsigned int prop = trackTable[i] - sequenceLength + 1;
-    prop = prop > 0 ? prop : 0;
-    if (prop > 0) 
-      count++;
-    propTable[i] = prop;
-    total += prop;
+  if(spec->qid.datum) {
+    if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
+      return 1;
+    }
   }
 
-  if (total == 0) {
-    error("no sequences of this sequence length in the database", dbName);
+  /*
+  if(audiodb_set_up_dbpointers(adb, spec, &dbpointers)) {
+    return 1;
+  }
+  */
+
+  /* three cases: 
+
+     1. datum is null:  Select two random tracks and two
+     offsets into those tracks, compute distance, and add to
+     accumulator.
+
+     2. datum is not null and
+
+        (a) ADB_QID_FLAG_EXHAUSTIVE is set: select one random track
+        and offset, and one offset into datum, compute distance, add
+        to accumulator.
+
+        (b) ADB_QID_FLAG_EXHAUSTIVE is not set: select one random
+        track and offset, compute distance with datum, add to
+        accumulator.
+
+     That all sounds simple, right?  Oh, but we also need to take care
+     of refinement: not just hop sizes, but power thresholding, track
+     inclusion, duration ratio (theoretically), radius thresholding
+     (no, that doesn't make sense).
+
+     Also, for ADB_ACCUMULATE_PER_TRACK, we need to do effectively
+     case 2 for each of the allowed keys.
+
+  */
+
+  if(spec->refine.flags & ADB_REFINE_RADIUS) {
+    /* doesn't make sense */
+    return 1;
   }
 
-  unsigned int vlen = dbH->dim * sequenceLength;
-  double *v1 = new double[vlen];
-  double *v2 = new double[vlen];
-  double v1norm, v2norm, v1v2;
+  if(spec->refine.flags & ADB_REFINE_DURATION_RATIO) {
+    /* does make sense in principle but my brain is too small today */
+    return -1;
+  }
 
-  double sumdist = 0;
-  double sumlogdist = 0;
+  if(spec->params.accumulation != ADB_ACCUMULATION_DB) {
+    /* likewise, brain too small */
+    return -1;
+  }
 
-  unsigned key_index = 0;
-  if(query_from_key) {
-    /* naughty use of internals here.  When this is part of the API,
-       it will be a legitimate use of internals.  -- CSR,
-       2010-01-05 */
-    key_index = (*adb->keymap)[key];
-    if(propTable[key_index] == 0) {
-      error("no samples of this length possible for key");
+  uint32_t qhop, ihop;
+  if(spec->refine.flags & ADB_REFINE_HOP_SIZE) {
+    qhop = spec->refine.qhopsize;
+    qhop = qhop ? qhop : 1;
+    ihop = spec->refine.ihopsize;
+    ihop = ihop ? ihop : 1;
+  } else {
+    qhop = 1;
+    ihop = 1;
+  }
+
+  uint32_t total = 0;
+  uint32_t *props = new uint32_t[adb->header->numFiles];
+
+  std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
+  for(uint32_t i = 0; i < adb->header->numFiles; i++) {
+    uint32_t prev = i == 0 ? 0 : props[i-1];
+    if(qstate->allowed_keys->find((*adb->keys)[i]) == keys_end) {
+      props[i] = prev;
+    } else {
+      uint32_t add = (*adb->track_lengths)[i] - spec->qid.sequence_length + 1;
+      props[i] = prev + add > 0 ? add : 0;
+      total += add;
     }
   }
-  for (unsigned int i = 0; i < nsamples;) {
-    unsigned track1 = 0;
-    if(query_from_key) {
-      track1 = key_index;
+
+  double *v1 = NULL;
+  double *v2 = new double[adb->header->dim * spec->qid.sequence_length];
+
+  if(!spec->qid.datum) {
+    v1 = new double[adb->header->dim * spec->qid.sequence_length];
+  }
+
+  uint32_t i1, i2, track1 = 0, track2;
+
+  uint32_t npoints = spec->params.npoints;
+  for(uint32_t nsamples = 0; nsamples < npoints;) {
+
+    if(spec->qid.datum) {
+      if(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) {
+        i1 = qhop * gsl_rng_uniform_int(adb->rng, (qpointers.nvectors - spec->qid.sequence_length + qhop - 1) / qhop);
+        v1 = query + i1 * adb->header->dim;
+      } else {
+        i1 = spec->qid.sequence_start;
+        v1 = query;
+      }
+      track2 = audiodb_random_track(adb, props, total);
+      i2 = ihop * gsl_rng_uniform_int(adb->rng, (props[track2] + ihop - 1) / ihop);
+      off_t offset2 = (*adb->track_offsets)[track2] + i2 * adb->header->dim * sizeof(double);
+      uint32_t length = adb->header->dim * spec->qid.sequence_length * sizeof(double);
+      lseek_set_or_goto_error(adb->fd, adb->header->dataOffset + offset2);
+      read_or_goto_error(adb->fd, v2, length);
     } else {
-      track1 = random_track(propTable, total);
+      /* FIXME: Theoretically we should pass ihop here, but that's a
+         small correction that I'm ignoring for now */
+      track1 = audiodb_random_track(adb, props, total);
+      track2 = audiodb_random_track(adb, props, total);
+      if(track1 == track2) {
+        continue;
+      }
+
+      i1 = ihop * gsl_rng_uniform_int(adb->rng, (props[track1] + ihop - 1) / ihop);
+      i2 = ihop * gsl_rng_uniform_int(adb->rng, (props[track2] + ihop - 1) / ihop);
+
+      off_t offset1 = (*adb->track_offsets)[track1] + i1 * adb->header->dim * sizeof(double);
+      off_t offset2 = (*adb->track_offsets)[track2] + i2 * adb->header->dim * sizeof(double);
+      uint32_t length = adb->header->dim * spec->qid.sequence_length * sizeof(double);
+      lseek_set_or_goto_error(adb->fd, adb->header->dataOffset + offset1);
+      read_or_goto_error(adb->fd, v1, length);
+      lseek_set_or_goto_error(adb->fd, adb->header->dataOffset + offset2);
+      read_or_goto_error(adb->fd, v2, length);
     }
-    unsigned track2 = random_track(propTable, total);
 
-    if(track1 == track2)
-      continue;
+    /* FIXME: replace at least the norms with dbpointers */
+    double v1norm = 0;
+    double v2norm = 0;
+    double v1v2 = 0;
+    double dist = 0;
 
-    unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
-    unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
-
-    VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
-
-    /* FIXME: this seeking, reading and distance calculation should
-       share more code with the query loop */
-    if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
-      error("seek failure", "", "lseek");
-    }
-    CHECKED_READ(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
-
-    if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
-      error("seek failure", "", "lseek");
-    }
-    CHECKED_READ(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
-
-    v1norm = 0;
-    v2norm = 0;
-    v1v2 = 0;
-
+    uint32_t vlen = spec->qid.sequence_length * adb->header->dim;
     for (unsigned int j = 0; j < vlen; j++) {
       v1norm += v1[j]*v1[j];
       v2norm += v2[j]*v2[j];
       v1v2 += v1[j]*v2[j];
     }
 
-    /* FIXME: we must deal with infinities better than this; there
-       could be all sorts of NaNs from arbitrary features.  Best
-       include power thresholds or something... */
+    /* FIXME: do power thresholding test */
     if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
+      switch(spec->params.distance) {
+      case ADB_DISTANCE_EUCLIDEAN_NORMED:
+        dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
+        break;
+      case ADB_DISTANCE_EUCLIDEAN:
+        dist = v1norm + v2norm - 2 * v1v2;
+        break;
+      case ADB_DISTANCE_DOT_PRODUCT:
+        dist = v1v2;
+        break;
+      }
+    } else {
+      continue;
+    }
 
-      VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
-      /* assume normalizedDistance == true for now */
-      /* FIXME: not convinced that the statistics we calculated in
-	 TASLP paper are technically valid for normalizedDistance */
-
-      double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
-      // double dist = v1norm + v2norm - 2*v1v2;
-      
-      VERB_LOG(1, "%f %f\n", dist, log(dist));
-      sumdist += dist;
-      sumlogdist += log(dist);
-      i++;
-    } else {
-      VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
-    }
+    adb_result_t r;
+    r.qkey = spec->qid.datum ? "" : (*adb->keys)[track1].c_str();
+    r.ikey = (*adb->keys)[track2].c_str();
+    r.dist = dist;
+    r.qpos = i1;
+    r.ipos = i2;
+    qstate->accumulator->add_point(&r);
+    nsamples++;
   }
 
-  /* FIXME: the mean isn't really what we should be reporting here */
-  unsigned meanN = total / count;
+  if(!spec->qid.datum) {
+    delete[] v1;
+  }
+  delete[] v2;
 
-  double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
-  double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
+  return 0;
 
-  std::cout << "Summary statistics" << std::endl;
-  std::cout << "number of samples: " << nsamples << std::endl;
-  std::cout << "sum of distances (S): " << sumdist << std::endl;
-  std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
-
-  /* FIXME: we'll also want some more summary statistics based on
-     propTable, for the minimum-of-X estimate */
-  std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
-  std::cout << std::endl;
-  std::cout << "Estimated parameters" << std::endl;
-  std::cout << "sigma^2: " << sigma2 << "; ";
-  std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
-  std::cout << "d: " << d << std::endl;
-
-  double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
-  double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
-    - (2 / d) * gsl_sf_log(meanN)
-    - gsl_sf_log(d/2)
-    - (2 / d) * gsl_sf_log(2 / d)
-    + (2 / d) * gsl_sf_lngamma(d / 2);
-
-  std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
-
-  delete[] propTable;
-  delete[] v1;
+ error:
+  if(!spec->qid.datum) {
+    delete[] v1;
+  }
   delete[] v2;
+  return 17;
 }