diff sample.cpp @ 266:4ffa05f25a00 sampling

Add initial sampling of database distances. Zillions of FIXME comments everywhere.
author mas01cr
date Sat, 14 Jun 2008 17:13:26 +0000
parents
children 30a2a45f2b70
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sample.cpp	Sat Jun 14 17:13:26 2008 +0000
@@ -0,0 +1,120 @@
+#include "audioDB.h"
+
+unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
+  /* FIXME: make this O(1) by using the alias-rejection method, or
+     some other sensible method of sampling from a discrete
+     distribution. */
+  /* FIXME: use a real random number generator, not random() */
+  double thing = random() / (double) RAND_MAX;
+  unsigned sofar = 0;
+  for (unsigned int i = 0; i < dbH->numFiles; i++) {
+    sofar += propTable[i];
+    if (thing < ((double) sofar / (double) total)) {
+      return i;
+    }
+  }
+  error("fell through in random_track()");
+
+  /* FIXME: decorate error's declaration so that this isn't necessary */
+  return 0;
+}
+
+void audioDB::sample(const char *dbName) {
+  initTables(dbName, 0);
+
+  // build track offset table (FIXME: cut'n'pasted from query.cpp)
+  off_t *trackOffsetTable = new off_t[dbH->numFiles];
+  unsigned cumTrack=0;
+  for(unsigned int k = 0; k < dbH->numFiles; k++){
+    trackOffsetTable[k] = cumTrack;
+    cumTrack += trackTable[k] * dbH->dim;
+  }
+
+  unsigned *propTable = new unsigned[dbH->numFiles];
+  unsigned total = 0;
+
+  for (unsigned int i = 0; i < dbH->numFiles; i++) {
+    /* what kind of a stupid language doesn't have binary max(), let
+       alone nary? */
+    unsigned int prop = trackTable[i] - sequenceLength + 1;
+    prop = prop > 0 ? prop : 0;
+    propTable[i] = prop;
+    total += prop;
+  }
+
+  if (total == 0) {
+    error("no sequences of this sequence length in the database", dbName);
+  }
+
+  unsigned int vlen = dbH->dim * sequenceLength;
+  double *v1 = new double[vlen];
+  double *v2 = new double[vlen];
+  double v1norm, v2norm, v1v2;
+
+  double sumdist = 0;
+  double sumlogdist = 0;
+
+  /* 1037 samples for now */
+  for (unsigned int i = 0; i < 1037;) {
+    /* FIXME: in Real Life we'll want to initialize the RNG using
+       /dev/random or the current time or something.  */
+    unsigned track1 = random_track(propTable, total);
+    unsigned track2 = random_track(propTable, total);
+
+    /* FIXME: this uses lower-order bits, which is OK on Linux but not
+       necessarily elsewhere.  Again, use a real random number
+       generator */
+    unsigned i1 = random() % propTable[track1];
+    unsigned i2 = random() % propTable[track2];
+
+    VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
+
+    /* FIXME: this seeking, reading and distance calculation should
+       share more code with the query loop */
+    lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
+    read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
+
+    lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
+    read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
+
+    v1norm = 0;
+    v2norm = 0;
+    v1v2 = 0;
+
+    for (unsigned int j = 0; j < vlen; j++) {
+      v1norm += v1[j]*v1[j];
+      v2norm += v2[j]*v2[j];
+      v1v2 += v1[j]*v2[j];
+    }
+
+    /* FIXME: we must deal with infinities better than this; there
+       could be all sorts of NaNs from arbitrary features.  Best
+       include power thresholds or something... */
+    if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
+
+      VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
+      /* assume normalizedDistance == true for now */
+      /* FIXME: not convinced that the statistics we calculated in
+	 TASLP paper are valid for normalizedDistance */
+      double dist = 2 - v1v2 / sqrt(v1norm * v2norm);
+      VERB_LOG(1, "%f %f\n", dist, log(dist));
+      sumdist += dist;
+      sumlogdist += log(dist);
+      i++;
+    } else {
+      VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2);
+    }
+  }
+
+  std::cout << "Summary statistics" << std::endl;
+  std::cout << "number of samples: " << 1037 << std::endl;
+  std::cout << "sum of distances (S): " << sumdist << std::endl;
+  std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
+
+  /* FIXME: we'll also want some summary statistics based on
+     propTable, for the minimum-of-X estimate */
+
+  delete[] propTable;
+  delete[] v1;
+  delete[] v2;
+}