Mercurial > hg > audiodb
changeset 693:b1723ae7675e
begin work on sampling API
This is motivated by the need to be able to sample with arbitrary feature data
(e.g. from a feature file) against a database, for the JNMR "collections" paper
revisions or possible ISMIR paper revisions. That bit doesn't work yet, but
the C-ified version of the current functionality (sample db x db and
sample key x db) works to the level of anecdotal tests.
The general approach is to mirror the _query_spec() API, where a whole heap
of knobs and twiddles are available to the user. Unlike in the _query_spec()
API, not quite all of the knobs make sense (and even fewer are actually
implemented), but the basic idea is the same.
I pity the poor chump who will have to document all this.
author | mas01cr |
---|---|
date | Thu, 22 Apr 2010 21:03:47 +0000 |
parents | 02756c5ca15a |
children | 55aa1919d735 |
files | audioDB-internals.h audioDB.cpp audioDB.h audioDB_API.h close.cpp common.cpp open.cpp sample.cpp |
diffstat | 8 files changed, 454 insertions(+), 190 deletions(-) [+] |
line wrap: on
line diff
--- a/audioDB-internals.h Thu Apr 22 15:43:26 2010 +0000 +++ b/audioDB-internals.h Thu Apr 22 21:03:47 2010 +0000 @@ -6,6 +6,7 @@ #endif #include <sys/stat.h> #include <sys/types.h> +#include <sys/time.h> #include <stdio.h> #include <errno.h> @@ -21,6 +22,8 @@ #include <windows.h> #endif +#include <gsl/gsl_rng.h> + #include <algorithm> #include <iostream> #include <map> @@ -119,6 +122,7 @@ std::vector<uint32_t> *track_lengths; std::vector<off_t> *track_offsets; LSH *cached_lsh; + gsl_rng *rng; }; typedef struct { @@ -374,6 +378,7 @@ void audiodb_index_delete_shingles(vector<vector<float> > *); void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t); int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float); +int audiodb_sample_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); #define ADB_MAXSTR (512U) #define ADB_FILETABLE_ENTRY_SIZE (256U)
--- a/audioDB.cpp Thu Apr 22 15:43:26 2010 +0000 +++ b/audioDB.cpp Thu Apr 22 21:03:47 2010 +0000 @@ -147,8 +147,6 @@ munmap(powerFileNameTable, fileTableLength); if(reporter) delete reporter; - if(rng) - gsl_rng_free(rng); if(infid>0) { close(infid); infid = 0; @@ -846,7 +844,8 @@ int fd; struct stat st; - /* FIXME: around here there are all sorts of hideous leaks. */ + /* FIXME: around here error conditions will cause all sorts of + hideous leaks. */ fd = open(inFile, O_RDONLY); if(fd < 0) { error("failed to open feature file", inFile); @@ -1010,6 +1009,150 @@ audiodb_liszt_free_results(adb, results); } +#include <gsl/gsl_sf.h> + +static +double yfun(double d) { + return gsl_sf_log(d) - gsl_sf_psi(d); +} + +static +double yinv(double y) { + double a = 1.0e-5; + double b = 1000.0; + + double ay = yfun(a); + double by = yfun(b); + + double c = 0; + double cy; + + /* FIXME: simple binary search; there's probably some clever solver + in gsl somewhere which is less sucky. */ + while ((b - a) > 1.0e-5) { + c = (a + b) / 2; + cy = yfun(c); + if (cy > y) { + a = c; + ay = cy; + } else { + b = c; + by = cy; + } + } + + return c; +} + +void audioDB::sample(const char *dbName) { + if(!adb) { + if(!(adb = audiodb_open(dbName, O_RDONLY))) { + error("failed to open database", dbName); + } + } + + adb_status_t status; + if(audiodb_status(adb, &status)) { + error("error getting status"); + } + + double sumdist = 0; + double sumlogdist = 0; + + adb_query_results_t *results; + adb_query_spec_t spec = {{0},{0},{0}}; + adb_datum_t datum = {0}; + + spec.refine.qhopsize = sequenceHop; + spec.refine.ihopsize = sequenceHop; + if(sequenceHop != 1) { + spec.refine.flags |= ADB_REFINE_HOP_SIZE; + } + + if(query_from_key) { + datum.key = key; + spec.qid.datum = &datum; + spec.qid.flags |= ADB_QID_FLAG_EXHAUSTIVE; + spec.refine.flags |= ADB_REFINE_EXCLUDE_KEYLIST; + spec.refine.exclude.nkeys = 1; + spec.refine.exclude.keys = &key; + } else if(inFile) { + error("sample from feature file not supported (yet)"); + } else { + spec.qid.datum = NULL; /* full db sample */ + } + spec.qid.sequence_length = sequenceLength; + spec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE; + spec.qid.sequence_start = queryPoint; + + spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED; + spec.params.accumulation = ADB_ACCUMULATION_DB; + spec.params.npoints = nsamples; + + if(!(results = audiodb_sample_spec(adb, &spec))) { + error("error in audiodb_sample_spec"); + } + + if(results->nresults != nsamples) { + error("mismatch in sample count"); + } + + for(uint32_t i = 0; i < nsamples; i++) { + double d = results->results[i].dist; + sumdist += d; + sumlogdist += log(d); + } + + audiodb_query_free_results(adb, &spec, results); + + unsigned total = 0; + unsigned count = 0; + adb_liszt_results_t *liszt; + if(!(liszt = audiodb_liszt(adb))) { + error("liszt failed"); + } + for(uint32_t i = 0; i < liszt->nresults; i++) { + unsigned int prop = (liszt->entries[i].nvectors - sequenceLength)/sequenceHop + 1; + prop = prop > 0 ? prop : 0; + if (prop > 0) { + count++; + } + total += prop; + } + + /* FIXME: the mean isn't really what we should be using here; it's + more a question of "how many independent sequences of length + sequenceLength are there in the database? */ + unsigned meanN = total / count; + + double sigma2 = sumdist / (sequenceLength * status.dim * nsamples); + double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); + + std::cout << "Summary statistics" << std::endl; + std::cout << "number of samples: " << nsamples << std::endl; + std::cout << "sum of distances (S): " << sumdist << std::endl; + std::cout << "sum of log distances (L): " << sumlogdist << std::endl; + + /* FIXME: we'll also want some more summary statistics based on + propTable, for the minimum-of-X estimate */ + std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; + std::cout << std::endl; + std::cout << "Estimated parameters" << std::endl; + std::cout << "sigma^2: " << sigma2 << "; "; + std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; + std::cout << "d: " << d << std::endl; + + double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); + double logxthresh = gsl_sf_log(sumdist / nsamples) + logw + - (2 / d) * gsl_sf_log(meanN) + - gsl_sf_log(d/2) + - (2 / d) * gsl_sf_log(2 / d) + + (2 / d) * gsl_sf_lngamma(d / 2); + + std::cout << "track xthresh: " << exp(logxthresh) << std::endl; +} + + // This entry point is visited once per instance // so it is a good place to set any global state variables int main(const int argc, const char* argv[]){
--- a/audioDB.h Thu Apr 22 15:43:26 2010 +0000 +++ b/audioDB.h Thu Apr 22 21:03:47 2010 +0000 @@ -21,7 +21,6 @@ #include <assert.h> #include <float.h> #include <signal.h> -#include <gsl/gsl_rng.h> // includes for LSH indexing extern "C" { @@ -204,8 +203,6 @@ struct adb_header *dbH; struct adb *adb; - gsl_rng *rng; - char* fileTable; unsigned* trackTable; double* l2normTable; @@ -264,7 +261,6 @@ void error(const char* a, const char* b = "", const char *sysFunc = 0) __attribute__ ((noreturn)); void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata); - void initRNG(); void initDBHeader(const char *dbName); void initInputFile(const char *inFile); void initTables(const char* dbName, const char* inFile = 0); @@ -347,7 +343,6 @@ infid(0), \ dbH(0), \ adb(0), \ - rng(0), \ fileTable(0), \ trackTable(0), \ l2normTable(0), \
--- a/audioDB_API.h Thu Apr 22 15:43:26 2010 +0000 +++ b/audioDB_API.h Thu Apr 22 21:03:47 2010 +0000 @@ -167,6 +167,9 @@ adb_liszt_results_t *audiodb_liszt(adb_t *); int audiodb_liszt_free_results(adb_t *, adb_liszt_results_t *); +/* sample */ +adb_query_results_t *audiodb_sample_spec(adb_t *, const adb_query_spec_t *); + /* backwards compatibility */ int audiodb_insert(adb_t *, adb_insert_t *ins); int audiodb_batchinsert(adb_t *, adb_insert_t *ins, unsigned int size);
--- a/close.cpp Thu Apr 22 15:43:26 2010 +0000 +++ b/close.cpp Thu Apr 22 21:03:47 2010 +0000 @@ -10,6 +10,7 @@ delete adb->keymap; delete adb->track_lengths; delete adb->track_offsets; + gsl_rng_free(adb->rng); if(adb->cached_lsh) { delete adb->cached_lsh; }
--- a/common.cpp Thu Apr 22 15:43:26 2010 +0000 +++ b/common.cpp Thu Apr 22 21:03:47 2010 +0000 @@ -33,29 +33,6 @@ } } -void audioDB::initRNG() { - rng = gsl_rng_alloc(gsl_rng_mt19937); - if(!rng) { - error("could not allocate Random Number Generator"); - } - /* FIXME: maybe we should use a real source of entropy? */ - uint32_t seed = 0; -#ifdef WIN32 - seed = time(NULL); -#else - struct timeval tv; - if(gettimeofday(&tv, NULL) == -1) { - error("failed to get time of day"); - } - /* usec field should be less than than 2^20. We want to mix the - usec field, highly-variable, into the high bits of the seconds - field, which will be static between closely-spaced runs. -- CSR, - 2010-01-05 */ - seed = tv.tv_sec ^ (tv.tv_usec << 12); -#endif - gsl_rng_set(rng, seed); -} - void audioDB::initDBHeader(const char* dbName) { if(!adb) { adb = audiodb_open(dbName, forWrite ? O_RDWR : O_RDONLY); @@ -140,13 +117,6 @@ } void audioDB::initTables(const char* dbName, const char* inFile) { - /* FIXME: initRNG() really logically belongs in the audioDB - contructor. However, there are of the order of four constructors - at the moment, and more to come from API implementation. Given - that duplication, I think this is the least worst place to put - it; the assumption is that nothing which doesn't look at a - database will need an RNG. -- CSR, 2008-07-02 */ - initRNG(); initDBHeader(dbName); if(inFile) initInputFile(inFile);
--- a/open.cpp Thu Apr 22 15:43:26 2010 +0000 +++ b/open.cpp Thu Apr 22 21:03:47 2010 +0000 @@ -67,6 +67,31 @@ return 1; } +static int audiodb_init_rng(adb_t *adb) { + gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); + if(!rng) { + return 1; + } + /* FIXME: maybe we should use a real source of entropy? */ + uint32_t seed = 0; +#ifdef WIN32 + seed = time(NULL); +#else + struct timeval tv; + if(gettimeofday(&tv, NULL) == -1) { + return 1; + } + /* usec field should be less than than 2^20. We want to mix the + usec field, highly-variable, into the high bits of the seconds + field, which will be static between closely-spaced runs. -- CSR, + 2010-01-05 */ + seed = tv.tv_sec ^ (tv.tv_usec << 12); +#endif + gsl_rng_set(rng, seed); + adb->rng = rng; + return 0; +} + adb_t *audiodb_open(const char *path, int flags) { adb_t *adb = 0; int fd = -1; @@ -128,6 +153,9 @@ goto error; } adb->cached_lsh = 0; + if(audiodb_init_rng(adb)) { + goto error; + } return adb; error: @@ -143,6 +171,9 @@ if(adb->track_lengths) { delete adb->track_lengths; } + if(adb->rng) { + gsl_rng_free(adb->rng); + } free(adb); } if(fd != -1) {
--- a/sample.cpp Thu Apr 22 15:43:26 2010 +0000 +++ b/sample.cpp Thu Apr 22 21:03:47 2010 +0000 @@ -1,198 +1,314 @@ -#include "audioDB.h" +extern "C" { +#include "audioDB_API.h" +} +#include "audioDB-internals.h" +#include "accumulators.h" -#include <gsl/gsl_sf.h> -#include <gsl/gsl_rng.h> +/* 0. if the datum in the spec is sufficiently NULL, do db x db + sampling; -static -double yfun(double d) { - return gsl_sf_log(d) - gsl_sf_psi(d); -} - -static -double yinv(double y) { - double a = 1.0e-5; - double b = 1000.0; - - double ay = yfun(a); - double by = yfun(b); - - double c = 0; - double cy; - - /* FIXME: simple binary search; there's probably some clever solver - in gsl somewhere which is less sucky. */ - while ((b - a) > 1.0e-5) { - c = (a + b) / 2; - cy = yfun(c); - if (cy > y) { - a = c; - ay = cy; - } else { - b = c; - by = cy; + 1. if accumulation is DATABASE, do basically the current sampling + with track1 being the datum; + + 2. if accumulation is PER_TRACK, sample n points per db track + rather than n points overall. + + 3. if accumulation is ONE_TO_ONE, ponder hard. +*/ +adb_query_results_t *audiodb_sample_spec(adb_t *adb, const adb_query_spec_t *qspec) { + adb_qstate_internal_t qstate = {0}; + /* FIXME: this paragraph is the same as in audiodb_query_spec() */ + qstate.allowed_keys = new std::set<std::string>; + adb_query_results_t *results; + if(qspec->refine.flags & ADB_REFINE_INCLUDE_KEYLIST) { + for(unsigned int k = 0; k < qspec->refine.include.nkeys; k++) { + qstate.allowed_keys->insert(qspec->refine.include.keys[k]); + } + } else { + for(unsigned int k = 0; k < adb->header->numFiles; k++) { + qstate.allowed_keys->insert((*adb->keys)[k]); + } + } + if(qspec->refine.flags & ADB_REFINE_EXCLUDE_KEYLIST) { + for(unsigned int k = 0; k < qspec->refine.exclude.nkeys; k++) { + qstate.allowed_keys->erase(qspec->refine.exclude.keys[k]); } } - return c; + if(!(qspec->qid.datum)) { + switch(qspec->params.distance) { + case ADB_DISTANCE_DOT_PRODUCT: + qstate.accumulator = new DBAccumulator<adb_result_dist_gt>(qspec->params.npoints); + break; + case ADB_DISTANCE_EUCLIDEAN_NORMED: + case ADB_DISTANCE_EUCLIDEAN: + qstate.accumulator = new DBAccumulator<adb_result_dist_lt>(qspec->params.npoints); + break; + default: + goto error; + } + } else { + /* FIXME: this paragraph is the same as in audiodb_query_spec(), + apart from only being taken in one branch */ + switch(qspec->params.distance) { + case ADB_DISTANCE_DOT_PRODUCT: + switch(qspec->params.accumulation) { + case ADB_ACCUMULATION_DB: + qstate.accumulator = new DBAccumulator<adb_result_dist_gt>(qspec->params.npoints); + break; + case ADB_ACCUMULATION_PER_TRACK: + qstate.accumulator = new PerTrackAccumulator<adb_result_dist_gt>(qspec->params.npoints, qspec->params.ntracks); + break; + case ADB_ACCUMULATION_ONE_TO_ONE: + qstate.accumulator = new NearestAccumulator<adb_result_dist_gt>(); + break; + default: + goto error; + } + break; + case ADB_DISTANCE_EUCLIDEAN_NORMED: + case ADB_DISTANCE_EUCLIDEAN: + switch(qspec->params.accumulation) { + case ADB_ACCUMULATION_DB: + qstate.accumulator = new DBAccumulator<adb_result_dist_lt>(qspec->params.npoints); + break; + case ADB_ACCUMULATION_PER_TRACK: + qstate.accumulator = new PerTrackAccumulator<adb_result_dist_lt>(qspec->params.npoints, qspec->params.ntracks); + break; + case ADB_ACCUMULATION_ONE_TO_ONE: + qstate.accumulator = new NearestAccumulator<adb_result_dist_lt>(); + break; + default: + goto error; + } + break; + default: + goto error; + } + } + + if(audiodb_sample_loop(adb, qspec, &qstate)) { + goto error; + } + + results = qstate.accumulator->get_points(); + + delete qstate.accumulator; + delete qstate.allowed_keys; + + return results; + + error: + if(qstate.accumulator) + delete qstate.accumulator; + if(qstate.allowed_keys) + delete qstate.allowed_keys; + return NULL; } -unsigned audioDB::random_track(unsigned *propTable, unsigned total) { +uint32_t audiodb_random_track(adb_t *adb, uint32_t *propTable, unsigned total) { /* FIXME: make this O(1) by using the alias-rejection method, or some other sensible method of sampling from a discrete distribution. */ - double thing = gsl_rng_uniform(rng); + double thing = gsl_rng_uniform(adb->rng); unsigned sofar = 0; - for (unsigned int i = 0; i < dbH->numFiles; i++) { + for (unsigned int i = 0; i < adb->header->numFiles; i++) { sofar += propTable[i]; if (thing < ((double) sofar / (double) total)) { return i; } } - error("fell through in random_track()"); - + /* can't happen */ + return (uint32_t) -1; } -void audioDB::sample(const char *dbName) { - initTables(dbName, 0); - if(dbH->flags & O2_FLAG_LARGE_ADB){ - error("error: sample not yet supported for LARGE_ADB"); - } - - off_t *trackOffsetTable = new off_t[dbH->numFiles]; - unsigned cumTrack=0; - for(unsigned int k = 0; k < dbH->numFiles; k++){ - trackOffsetTable[k] = cumTrack; - cumTrack += trackTable[k] * dbH->dim; +int audiodb_sample_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) { + /* FIXME: all eerily reminiscent of audiodb_query_loop() */ + double *query, *query_data; + adb_qpointers_internal_t qpointers = {0}, dbpointers = {0}; + + if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) { + /* FIXME: someday */ + return 1; } - unsigned *propTable = new unsigned[dbH->numFiles]; - unsigned total = 0; - unsigned count = 0; - - for (unsigned int i = 0; i < dbH->numFiles; i++) { - /* what kind of a stupid language doesn't have binary max(), let - alone nary? */ - unsigned int prop = trackTable[i] - sequenceLength + 1; - prop = prop > 0 ? prop : 0; - if (prop > 0) - count++; - propTable[i] = prop; - total += prop; + if(spec->qid.datum) { + if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) { + return 1; + } } - if (total == 0) { - error("no sequences of this sequence length in the database", dbName); + /* + if(audiodb_set_up_dbpointers(adb, spec, &dbpointers)) { + return 1; + } + */ + + /* three cases: + + 1. datum is null: Select two random tracks and two + offsets into those tracks, compute distance, and add to + accumulator. + + 2. datum is not null and + + (a) ADB_QID_FLAG_EXHAUSTIVE is set: select one random track + and offset, and one offset into datum, compute distance, add + to accumulator. + + (b) ADB_QID_FLAG_EXHAUSTIVE is not set: select one random + track and offset, compute distance with datum, add to + accumulator. + + That all sounds simple, right? Oh, but we also need to take care + of refinement: not just hop sizes, but power thresholding, track + inclusion, duration ratio (theoretically), radius thresholding + (no, that doesn't make sense). + + Also, for ADB_ACCUMULATE_PER_TRACK, we need to do effectively + case 2 for each of the allowed keys. + + */ + + if(spec->refine.flags & ADB_REFINE_RADIUS) { + /* doesn't make sense */ + return 1; } - unsigned int vlen = dbH->dim * sequenceLength; - double *v1 = new double[vlen]; - double *v2 = new double[vlen]; - double v1norm, v2norm, v1v2; + if(spec->refine.flags & ADB_REFINE_DURATION_RATIO) { + /* does make sense in principle but my brain is too small today */ + return -1; + } - double sumdist = 0; - double sumlogdist = 0; + if(spec->params.accumulation != ADB_ACCUMULATION_DB) { + /* likewise, brain too small */ + return -1; + } - unsigned key_index = 0; - if(query_from_key) { - /* naughty use of internals here. When this is part of the API, - it will be a legitimate use of internals. -- CSR, - 2010-01-05 */ - key_index = (*adb->keymap)[key]; - if(propTable[key_index] == 0) { - error("no samples of this length possible for key"); + uint32_t qhop, ihop; + if(spec->refine.flags & ADB_REFINE_HOP_SIZE) { + qhop = spec->refine.qhopsize; + qhop = qhop ? qhop : 1; + ihop = spec->refine.ihopsize; + ihop = ihop ? ihop : 1; + } else { + qhop = 1; + ihop = 1; + } + + uint32_t total = 0; + uint32_t *props = new uint32_t[adb->header->numFiles]; + + std::set<std::string>::iterator keys_end = qstate->allowed_keys->end(); + for(uint32_t i = 0; i < adb->header->numFiles; i++) { + uint32_t prev = i == 0 ? 0 : props[i-1]; + if(qstate->allowed_keys->find((*adb->keys)[i]) == keys_end) { + props[i] = prev; + } else { + uint32_t add = (*adb->track_lengths)[i] - spec->qid.sequence_length + 1; + props[i] = prev + add > 0 ? add : 0; + total += add; } } - for (unsigned int i = 0; i < nsamples;) { - unsigned track1 = 0; - if(query_from_key) { - track1 = key_index; + + double *v1 = NULL; + double *v2 = new double[adb->header->dim * spec->qid.sequence_length]; + + if(!spec->qid.datum) { + v1 = new double[adb->header->dim * spec->qid.sequence_length]; + } + + uint32_t i1, i2, track1 = 0, track2; + + uint32_t npoints = spec->params.npoints; + for(uint32_t nsamples = 0; nsamples < npoints;) { + + if(spec->qid.datum) { + if(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) { + i1 = qhop * gsl_rng_uniform_int(adb->rng, (qpointers.nvectors - spec->qid.sequence_length + qhop - 1) / qhop); + v1 = query + i1 * adb->header->dim; + } else { + i1 = spec->qid.sequence_start; + v1 = query; + } + track2 = audiodb_random_track(adb, props, total); + i2 = ihop * gsl_rng_uniform_int(adb->rng, (props[track2] + ihop - 1) / ihop); + off_t offset2 = (*adb->track_offsets)[track2] + i2 * adb->header->dim * sizeof(double); + uint32_t length = adb->header->dim * spec->qid.sequence_length * sizeof(double); + lseek_set_or_goto_error(adb->fd, adb->header->dataOffset + offset2); + read_or_goto_error(adb->fd, v2, length); } else { - track1 = random_track(propTable, total); + /* FIXME: Theoretically we should pass ihop here, but that's a + small correction that I'm ignoring for now */ + track1 = audiodb_random_track(adb, props, total); + track2 = audiodb_random_track(adb, props, total); + if(track1 == track2) { + continue; + } + + i1 = ihop * gsl_rng_uniform_int(adb->rng, (props[track1] + ihop - 1) / ihop); + i2 = ihop * gsl_rng_uniform_int(adb->rng, (props[track2] + ihop - 1) / ihop); + + off_t offset1 = (*adb->track_offsets)[track1] + i1 * adb->header->dim * sizeof(double); + off_t offset2 = (*adb->track_offsets)[track2] + i2 * adb->header->dim * sizeof(double); + uint32_t length = adb->header->dim * spec->qid.sequence_length * sizeof(double); + lseek_set_or_goto_error(adb->fd, adb->header->dataOffset + offset1); + read_or_goto_error(adb->fd, v1, length); + lseek_set_or_goto_error(adb->fd, adb->header->dataOffset + offset2); + read_or_goto_error(adb->fd, v2, length); } - unsigned track2 = random_track(propTable, total); - if(track1 == track2) - continue; + /* FIXME: replace at least the norms with dbpointers */ + double v1norm = 0; + double v2norm = 0; + double v1v2 = 0; + double dist = 0; - unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]); - unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]); - - VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2); - - /* FIXME: this seeking, reading and distance calculation should - share more code with the query loop */ - if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) { - error("seek failure", "", "lseek"); - } - CHECKED_READ(dbfid, v1, dbH->dim * sequenceLength * sizeof(double)); - - if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) { - error("seek failure", "", "lseek"); - } - CHECKED_READ(dbfid, v2, dbH->dim * sequenceLength * sizeof(double)); - - v1norm = 0; - v2norm = 0; - v1v2 = 0; - + uint32_t vlen = spec->qid.sequence_length * adb->header->dim; for (unsigned int j = 0; j < vlen; j++) { v1norm += v1[j]*v1[j]; v2norm += v2[j]*v2[j]; v1v2 += v1[j]*v2[j]; } - /* FIXME: we must deal with infinities better than this; there - could be all sorts of NaNs from arbitrary features. Best - include power thresholds or something... */ + /* FIXME: do power thresholding test */ if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) { + switch(spec->params.distance) { + case ADB_DISTANCE_EUCLIDEAN_NORMED: + dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm); + break; + case ADB_DISTANCE_EUCLIDEAN: + dist = v1norm + v2norm - 2 * v1v2; + break; + case ADB_DISTANCE_DOT_PRODUCT: + dist = v1v2; + break; + } + } else { + continue; + } - VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2); - /* assume normalizedDistance == true for now */ - /* FIXME: not convinced that the statistics we calculated in - TASLP paper are technically valid for normalizedDistance */ - - double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm); - // double dist = v1norm + v2norm - 2*v1v2; - - VERB_LOG(1, "%f %f\n", dist, log(dist)); - sumdist += dist; - sumlogdist += log(dist); - i++; - } else { - VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2); - } + adb_result_t r; + r.qkey = spec->qid.datum ? "" : (*adb->keys)[track1].c_str(); + r.ikey = (*adb->keys)[track2].c_str(); + r.dist = dist; + r.qpos = i1; + r.ipos = i2; + qstate->accumulator->add_point(&r); + nsamples++; } - /* FIXME: the mean isn't really what we should be reporting here */ - unsigned meanN = total / count; + if(!spec->qid.datum) { + delete[] v1; + } + delete[] v2; - double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples); - double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); + return 0; - std::cout << "Summary statistics" << std::endl; - std::cout << "number of samples: " << nsamples << std::endl; - std::cout << "sum of distances (S): " << sumdist << std::endl; - std::cout << "sum of log distances (L): " << sumlogdist << std::endl; - - /* FIXME: we'll also want some more summary statistics based on - propTable, for the minimum-of-X estimate */ - std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; - std::cout << std::endl; - std::cout << "Estimated parameters" << std::endl; - std::cout << "sigma^2: " << sigma2 << "; "; - std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; - std::cout << "d: " << d << std::endl; - - double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); - double logxthresh = gsl_sf_log(sumdist / nsamples) + logw - - (2 / d) * gsl_sf_log(meanN) - - gsl_sf_log(d/2) - - (2 / d) * gsl_sf_log(2 / d) - + (2 / d) * gsl_sf_lngamma(d / 2); - - std::cout << "track xthresh: " << exp(logxthresh) << std::endl; - - delete[] propTable; - delete[] v1; + error: + if(!spec->qid.datum) { + delete[] v1; + } delete[] v2; + return 17; }