Mercurial > hg > audiodb
diff audioDB.cpp @ 693:b1723ae7675e
begin work on sampling API
This is motivated by the need to be able to sample with arbitrary feature data
(e.g. from a feature file) against a database, for the JNMR "collections" paper
revisions or possible ISMIR paper revisions. That bit doesn't work yet, but
the C-ified version of the current functionality (sample db x db and
sample key x db) works to the level of anecdotal tests.
The general approach is to mirror the _query_spec() API, where a whole heap
of knobs and twiddles are available to the user. Unlike in the _query_spec()
API, not quite all of the knobs make sense (and even fewer are actually
implemented), but the basic idea is the same.
I pity the poor chump who will have to document all this.
author | mas01cr |
---|---|
date | Thu, 22 Apr 2010 21:03:47 +0000 |
parents | c62041316a44 |
children | 01e25f938b63 |
line wrap: on
line diff
--- a/audioDB.cpp Thu Apr 22 15:43:26 2010 +0000 +++ b/audioDB.cpp Thu Apr 22 21:03:47 2010 +0000 @@ -147,8 +147,6 @@ munmap(powerFileNameTable, fileTableLength); if(reporter) delete reporter; - if(rng) - gsl_rng_free(rng); if(infid>0) { close(infid); infid = 0; @@ -846,7 +844,8 @@ int fd; struct stat st; - /* FIXME: around here there are all sorts of hideous leaks. */ + /* FIXME: around here error conditions will cause all sorts of + hideous leaks. */ fd = open(inFile, O_RDONLY); if(fd < 0) { error("failed to open feature file", inFile); @@ -1010,6 +1009,150 @@ audiodb_liszt_free_results(adb, results); } +#include <gsl/gsl_sf.h> + +static +double yfun(double d) { + return gsl_sf_log(d) - gsl_sf_psi(d); +} + +static +double yinv(double y) { + double a = 1.0e-5; + double b = 1000.0; + + double ay = yfun(a); + double by = yfun(b); + + double c = 0; + double cy; + + /* FIXME: simple binary search; there's probably some clever solver + in gsl somewhere which is less sucky. */ + while ((b - a) > 1.0e-5) { + c = (a + b) / 2; + cy = yfun(c); + if (cy > y) { + a = c; + ay = cy; + } else { + b = c; + by = cy; + } + } + + return c; +} + +void audioDB::sample(const char *dbName) { + if(!adb) { + if(!(adb = audiodb_open(dbName, O_RDONLY))) { + error("failed to open database", dbName); + } + } + + adb_status_t status; + if(audiodb_status(adb, &status)) { + error("error getting status"); + } + + double sumdist = 0; + double sumlogdist = 0; + + adb_query_results_t *results; + adb_query_spec_t spec = {{0},{0},{0}}; + adb_datum_t datum = {0}; + + spec.refine.qhopsize = sequenceHop; + spec.refine.ihopsize = sequenceHop; + if(sequenceHop != 1) { + spec.refine.flags |= ADB_REFINE_HOP_SIZE; + } + + if(query_from_key) { + datum.key = key; + spec.qid.datum = &datum; + spec.qid.flags |= ADB_QID_FLAG_EXHAUSTIVE; + spec.refine.flags |= ADB_REFINE_EXCLUDE_KEYLIST; + spec.refine.exclude.nkeys = 1; + spec.refine.exclude.keys = &key; + } else if(inFile) { + error("sample from feature file not supported (yet)"); + } else { + spec.qid.datum = NULL; /* full db sample */ + } + spec.qid.sequence_length = sequenceLength; + spec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE; + spec.qid.sequence_start = queryPoint; + + spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED; + spec.params.accumulation = ADB_ACCUMULATION_DB; + spec.params.npoints = nsamples; + + if(!(results = audiodb_sample_spec(adb, &spec))) { + error("error in audiodb_sample_spec"); + } + + if(results->nresults != nsamples) { + error("mismatch in sample count"); + } + + for(uint32_t i = 0; i < nsamples; i++) { + double d = results->results[i].dist; + sumdist += d; + sumlogdist += log(d); + } + + audiodb_query_free_results(adb, &spec, results); + + unsigned total = 0; + unsigned count = 0; + adb_liszt_results_t *liszt; + if(!(liszt = audiodb_liszt(adb))) { + error("liszt failed"); + } + for(uint32_t i = 0; i < liszt->nresults; i++) { + unsigned int prop = (liszt->entries[i].nvectors - sequenceLength)/sequenceHop + 1; + prop = prop > 0 ? prop : 0; + if (prop > 0) { + count++; + } + total += prop; + } + + /* FIXME: the mean isn't really what we should be using here; it's + more a question of "how many independent sequences of length + sequenceLength are there in the database? */ + unsigned meanN = total / count; + + double sigma2 = sumdist / (sequenceLength * status.dim * nsamples); + double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); + + std::cout << "Summary statistics" << std::endl; + std::cout << "number of samples: " << nsamples << std::endl; + std::cout << "sum of distances (S): " << sumdist << std::endl; + std::cout << "sum of log distances (L): " << sumlogdist << std::endl; + + /* FIXME: we'll also want some more summary statistics based on + propTable, for the minimum-of-X estimate */ + std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; + std::cout << std::endl; + std::cout << "Estimated parameters" << std::endl; + std::cout << "sigma^2: " << sigma2 << "; "; + std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; + std::cout << "d: " << d << std::endl; + + double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); + double logxthresh = gsl_sf_log(sumdist / nsamples) + logw + - (2 / d) * gsl_sf_log(meanN) + - gsl_sf_log(d/2) + - (2 / d) * gsl_sf_log(2 / d) + + (2 / d) * gsl_sf_lngamma(d / 2); + + std::cout << "track xthresh: " << exp(logxthresh) << std::endl; +} + + // This entry point is visited once per instance // so it is a good place to set any global state variables int main(const int argc, const char* argv[]){