changeset 270:9636040ff503 sampling

Parametrize nsamples (though not on the command-line yet) Compute the mean number of possible sequences in tracks, and display that. Compute the track xthreshold as in TASLP paper. (Not quite getting the same answers, though).
author mas01cr
date Mon, 16 Jun 2008 11:59:43 +0000
parents dcbb30790b30
children 1f2c7d5e581c
files sample.cpp
diffstat 1 files changed, 22 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/sample.cpp	Mon Jun 16 11:57:25 2008 +0000
+++ b/sample.cpp	Mon Jun 16 11:59:43 2008 +0000
@@ -63,12 +63,15 @@
 
   unsigned *propTable = new unsigned[dbH->numFiles];
   unsigned total = 0;
+  unsigned count = 0;
 
   for (unsigned int i = 0; i < dbH->numFiles; i++) {
     /* what kind of a stupid language doesn't have binary max(), let
        alone nary? */
     unsigned int prop = trackTable[i] - sequenceLength + 1;
     prop = prop > 0 ? prop : 0;
+    if (prop > 0) 
+      count++;
     propTable[i] = prop;
     total += prop;
   }
@@ -85,8 +88,9 @@
   double sumdist = 0;
   double sumlogdist = 0;
 
-  /* 1037 samples for now */
-  for (unsigned int i = 0; i < 1037;) {
+  unsigned int nsamples = 2049;
+
+  for (unsigned int i = 0; i < nsamples;) {
     /* FIXME: in Real Life we'll want to initialize the RNG using
        /dev/random or the current time or something.  */
     unsigned track1 = random_track(propTable, total);
@@ -137,18 +141,30 @@
     }
   }
 
-  double sigma2 = (sumdist / (sequenceLength * dbH->dim * 1037));
-  double d = 2 * yinv(log(sumdist/1037) - sumlogdist/1037);
+  /* FIXME: the mean isn't really what we should be reporting here */
+  unsigned meanN = total / count;
+
+  double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
+  double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
 
   std::cout << "Summary statistics" << std::endl;
-  std::cout << "number of samples: " << 1037 << std::endl;
+  std::cout << "number of samples: " << nsamples << std::endl;
   std::cout << "sum of distances (S): " << sumdist << std::endl;
   std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
+  std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
   std::cout << std::endl;
   std::cout << "Estimated parameters" << std::endl;
   std::cout << "sigma^2: " << sigma2 << std::endl;
   std::cout << "d: " << d << std::endl;
-  std::cout << "check: " << yfun(d/2) << std::endl;
+
+  double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
+  double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
+    - (2 / d) * gsl_sf_log(meanN)
+    - gsl_sf_log(d/2)
+    - (2 / d) * gsl_sf_log(2 / d)
+    + (2 / d) * gsl_sf_lngamma(d / 2);
+
+  std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
 
   /* FIXME: we'll also want some summary statistics based on
      propTable, for the minimum-of-X estimate */