changeset 271:1f2c7d5e581c sampling

Small cleanups. Includes some commented-out code which would fix (in a hacky way) some of the FIXME notes.
author mas01cr
date Mon, 16 Jun 2008 17:17:11 +0000
parents 9636040ff503
children 5d721f1ead01
files sample.cpp
diffstat 1 files changed, 23 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/sample.cpp	Mon Jun 16 11:59:43 2008 +0000
+++ b/sample.cpp	Mon Jun 16 17:17:11 2008 +0000
@@ -53,6 +53,16 @@
 void audioDB::sample(const char *dbName) {
   initTables(dbName, 0);
 
+  /* FIXME: in Real Life we'll want to initialize the RNG using
+     /dev/urandom or the current time or something, like this:
+
+     unsigned int seed;
+     int fd = open("/dev/urandom", O_RDONLY);
+     read(fd, &seed, sizeof(seed));
+     
+     srandom(seed);
+  */ 
+
   // build track offset table (FIXME: cut'n'pasted from query.cpp)
   off_t *trackOffsetTable = new off_t[dbH->numFiles];
   unsigned cumTrack=0;
@@ -88,14 +98,15 @@
   double sumdist = 0;
   double sumlogdist = 0;
 
-  unsigned int nsamples = 2049;
+  unsigned int nsamples = 20490;
 
   for (unsigned int i = 0; i < nsamples;) {
-    /* FIXME: in Real Life we'll want to initialize the RNG using
-       /dev/random or the current time or something.  */
     unsigned track1 = random_track(propTable, total);
     unsigned track2 = random_track(propTable, total);
 
+    if(track1 == track2)
+      continue;
+
     /* FIXME: this uses lower-order bits, which is OK on Linux but not
        necessarily elsewhere.  Again, use a real random number
        generator */
@@ -130,8 +141,11 @@
       VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
       /* assume normalizedDistance == true for now */
       /* FIXME: not convinced that the statistics we calculated in
-	 TASLP paper are valid for normalizedDistance */
+	 TASLP paper are technically valid for normalizedDistance */
+
       double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
+      // double dist = v1norm + v2norm - 2*v1v2;
+      
       VERB_LOG(1, "%f %f\n", dist, log(dist));
       sumdist += dist;
       sumlogdist += log(dist);
@@ -151,10 +165,14 @@
   std::cout << "number of samples: " << nsamples << std::endl;
   std::cout << "sum of distances (S): " << sumdist << std::endl;
   std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
+
+  /* FIXME: we'll also want some more summary statistics based on
+     propTable, for the minimum-of-X estimate */
   std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
   std::cout << std::endl;
   std::cout << "Estimated parameters" << std::endl;
-  std::cout << "sigma^2: " << sigma2 << std::endl;
+  std::cout << "sigma^2: " << sigma2 << "; ";
+  std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
   std::cout << "d: " << d << std::endl;
 
   double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
@@ -166,9 +184,6 @@
 
   std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
 
-  /* FIXME: we'll also want some summary statistics based on
-     propTable, for the minimum-of-X estimate */
-
   delete[] propTable;
   delete[] v1;
   delete[] v2;