Mercurial > hg > audiodb
changeset 270:9636040ff503 sampling
Parametrize nsamples (though not on the command-line yet)
Compute the mean number of possible sequences in tracks, and display
that.
Compute the track xthreshold as in TASLP paper. (Not quite getting the
same answers, though).
author | mas01cr |
---|---|
date | Mon, 16 Jun 2008 11:59:43 +0000 |
parents | dcbb30790b30 |
children | 1f2c7d5e581c |
files | sample.cpp |
diffstat | 1 files changed, 22 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/sample.cpp Mon Jun 16 11:57:25 2008 +0000 +++ b/sample.cpp Mon Jun 16 11:59:43 2008 +0000 @@ -63,12 +63,15 @@ unsigned *propTable = new unsigned[dbH->numFiles]; unsigned total = 0; + unsigned count = 0; for (unsigned int i = 0; i < dbH->numFiles; i++) { /* what kind of a stupid language doesn't have binary max(), let alone nary? */ unsigned int prop = trackTable[i] - sequenceLength + 1; prop = prop > 0 ? prop : 0; + if (prop > 0) + count++; propTable[i] = prop; total += prop; } @@ -85,8 +88,9 @@ double sumdist = 0; double sumlogdist = 0; - /* 1037 samples for now */ - for (unsigned int i = 0; i < 1037;) { + unsigned int nsamples = 2049; + + for (unsigned int i = 0; i < nsamples;) { /* FIXME: in Real Life we'll want to initialize the RNG using /dev/random or the current time or something. */ unsigned track1 = random_track(propTable, total); @@ -137,18 +141,30 @@ } } - double sigma2 = (sumdist / (sequenceLength * dbH->dim * 1037)); - double d = 2 * yinv(log(sumdist/1037) - sumlogdist/1037); + /* FIXME: the mean isn't really what we should be reporting here */ + unsigned meanN = total / count; + + double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples); + double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); std::cout << "Summary statistics" << std::endl; - std::cout << "number of samples: " << 1037 << std::endl; + std::cout << "number of samples: " << nsamples << std::endl; std::cout << "sum of distances (S): " << sumdist << std::endl; std::cout << "sum of log distances (L): " << sumlogdist << std::endl; + std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; std::cout << std::endl; std::cout << "Estimated parameters" << std::endl; std::cout << "sigma^2: " << sigma2 << std::endl; std::cout << "d: " << d << std::endl; - std::cout << "check: " << yfun(d/2) << std::endl; + + double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); + double logxthresh = gsl_sf_log(sumdist / nsamples) + logw + - (2 / d) * gsl_sf_log(meanN) + - gsl_sf_log(d/2) + - (2 / d) * gsl_sf_log(2 / d) + + (2 / d) * gsl_sf_lngamma(d / 2); + + std::cout << "track xthresh: " << exp(logxthresh) << std::endl; /* FIXME: we'll also want some summary statistics based on propTable, for the minimum-of-X estimate */