Mercurial > hg > audiodb
changeset 280:3be15407e814
Merge sampling branch (-r361:405, though I hope that the branch is now
finished) onto trunk. API developers take note.
Things still to clear up:
* whether the threshold distance it currently reports bears any relation
to reality;
* if not, how to bring it a bit more into alignment;
* minor code cleanup issues in sample.cpp;
* incorporating --absolute-threshold handling into sampling;
* writing suitable test cases.
author | mas01cr |
---|---|
date | Wed, 02 Jul 2008 14:07:10 +0000 |
parents | d209e8470a60 |
children | f042cbf427b3 |
files | Makefile audioDB.cpp audioDB.h gengetopt.in sample.cpp xthresh.c |
diffstat | 6 files changed, 253 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Mon Jun 30 17:38:59 2008 +0000 +++ b/Makefile Wed Jul 02 14:07:10 2008 +0000 @@ -37,10 +37,10 @@ %.o: %.cpp audioDB.h adb.nsmap cmdline.h reporter.h g++ -c ${CFLAGS} ${GSOAP_INCLUDE} -Wall -Werror $< -OBJS=insert.o create.o common.o dump.o query.o soap.o audioDB.o +OBJS=insert.o create.o common.o dump.o query.o soap.o sample.o audioDB.o ${EXECUTABLE}: ${OBJS} soapServer.cpp soapClient.cpp soapC.cpp cmdline.c - g++ -o ${EXECUTABLE} ${CFLAGS} ${GSOAP_INCLUDE} $^ ${GSOAP_CPP} + g++ -o ${EXECUTABLE} ${CFLAGS} -lgsl -lgslcblas ${GSOAP_INCLUDE} $^ ${GSOAP_CPP} clean: -rm cmdline.c cmdline.h @@ -52,3 +52,6 @@ test: ${EXECUTABLE} -sh -c "cd tests && sh ./run-tests.sh" + +xthresh: xthresh.c + gcc -o $@ -lgsl -lgslcblas $<
--- a/audioDB.cpp Mon Jun 30 17:38:59 2008 +0000 +++ b/audioDB.cpp Wed Jul 02 14:07:10 2008 +0000 @@ -37,6 +37,9 @@ ws_status(dbName,(char*)hostport); else status(dbName); + + else if(O2_ACTION(COM_SAMPLE)) + sample(dbName); else if(O2_ACTION(COM_L2NORM)) l2norm(dbName); @@ -210,6 +213,17 @@ return 0; } + if(args_info.SAMPLE_given) { + command = COM_SAMPLE; + dbName = args_info.database_arg; + sequenceLength = args_info.sequencelength_arg; + if(sequenceLength < 1 || sequenceLength > 1000) { + error("seqlen out of range: 1 <= seqlen <= 1000"); + } + nsamples = args_info.nsamples_arg; + return 0; + } + if(args_info.DUMP_given){ command=COM_DUMP; dbName=args_info.database_arg;
--- a/audioDB.h Mon Jun 30 17:38:59 2008 +0000 +++ b/audioDB.h Wed Jul 02 14:07:10 2008 +0000 @@ -12,6 +12,7 @@ #include <assert.h> #include <float.h> #include <signal.h> +#include <gsl/gsl_rng.h> // includes for web services #include "soapH.h" @@ -29,6 +30,7 @@ #define COM_POWER "--POWER" #define COM_DUMP "--DUMP" #define COM_SERVER "--SERVER" +#define COM_SAMPLE "--SAMPLE" // parameters #define COM_CLIENT "--client" @@ -176,6 +178,8 @@ // Flags and parameters unsigned verbosity; // how much do we want to know? + unsigned nsamples; + //off_t size; // given size (for creation) unsigned datasize; // size in MB unsigned ntracks; @@ -247,6 +251,8 @@ void batchinsert(const char* dbName, const char* inFile); void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); void status(const char* dbName, adb__statusResponse *adbStatusResponse=0); + unsigned random_track(unsigned *propTable, unsigned total, gsl_rng *); + void sample(const char *dbName); void ws_status(const char*dbName, char* hostport); void ws_query(const char*dbName, const char *trackKey, const char* hostport); void l2norm(const char* dbName); @@ -291,6 +297,7 @@ powerTableLength(0), \ l2normTableLength(0), \ verbosity(1), \ + nsamples(2000), \ datasize(O2_DEFAULT_DATASIZE), \ ntracks(O2_DEFAULT_NTRACKS), \ datadim(O2_DEFAULT_DATADIM), \
--- a/gengetopt.in Mon Jun 30 17:38:59 2008 +0000 +++ b/gengetopt.in Wed Jul 02 14:07:10 2008 +0000 @@ -17,15 +17,21 @@ option "ntracks" - "capacity of database for tracks" int dependon="NEW" default="20000" optional option "datadim" - "dimensionality of stored data" int dependon="NEW" default="9" optional -section "Database Maintenance" sectiondesc="Querying, tweaking and dumping databases." +section "Database Maintenance" sectiondesc="Tweaking and dumping databases." -option "STATUS" S "output database information to stdout." dependon="database" optional option "DUMP" D "output all entries: index key size." dependon="database" optional option "output" - "output directory" string dependon="DUMP" default="audioDB.dump" optional option "L2NORM" L "unit norm vectors and norm all future inserts." dependon="database" optional option "POWER" P "turn on power flag for database." dependon="database" optional +section "Database Information" sectiondesc="Information about databases." + +option "STATUS" S "output database information to stdout." dependon="database" optional +option "SAMPLE" X "sample statistics for database." dependon="database" optional +option "nsamples" - "number of pairwise samples to take." dependon="SAMPLE" int typestr="number" default="2000" optional + section "Database Insertion" sectiondesc="The following commands insert feature files, with optional keys and timestamps.\n" + option "INSERT" I "add feature vectors to an existing database." dependon="features" optional option "UPDATE" U "replace inserted vectors associated with key with new input vectors." dependon="features" dependon="key" dependon="database" optional hidden option "features" f "binary series of vectors file {int sz:ieee double[][sz]:eof}." string typestr="filename" dependon="database" optional @@ -39,6 +45,8 @@ option "powerList" W "text file containing list of binary power feature file." string typestr="filename" dependon="database" optional option "keyList" K "text file containing list of unique identifiers associated with --features." string typestr="filename" optional +option "sequencelength" l "length of sequences for sequence search." int typestr="length" default="16" optional + section "Database Search" sectiondesc="Thse commands control the retrieval behaviour.\n" @@ -50,7 +58,6 @@ option "expandfactor" x "time compress/expand factor of result length to query length [1.0 .. 100.0]." double default="1.1" optional hidden option "rotate" o "rotate query vectors for rotationally invariant search." flag off optional hidden option "resultlength" r "maximum length of the result list." int typestr="length" default="10" optional -option "sequencelength" l "length of sequences for sequence search." int typestr="length" default="16" dependon="QUERY" optional option "sequencehop" h "hop size of sequence window for sequence search." int typestr="hop" default="1" dependon="QUERY" optional option "absolute-threshold" - "absolute power threshold for consideration of query or target sequence (in Bels)" double dependon="QUERY" optional option "relative-threshold" - "relative power threshold between query and target sequence (in Bels)" double dependon="QUERY" optional
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sample.cpp Wed Jul 02 14:07:10 2008 +0000 @@ -0,0 +1,190 @@ +#include "audioDB.h" + +#include <gsl/gsl_sf.h> +#include <gsl/gsl_rng.h> + +static +double yfun(double d) { + return gsl_sf_log(d) - gsl_sf_psi(d); +} + +static +double yinv(double y) { + double a = 1.0e-5; + double b = 1000.0; + + double ay = yfun(a); + double by = yfun(b); + + double c, cy; + + /* FIXME: simple binary search; there's probably some clever solver + in gsl somewhere which is less sucky. */ + while ((b - a) > 1.0e-5) { + c = (a + b) / 2; + cy = yfun(c); + if (cy > y) { + a = c; + ay = cy; + } else { + b = c; + by = cy; + } + } + + return c; +} + +unsigned audioDB::random_track(unsigned *propTable, unsigned total, gsl_rng *rng) { + /* FIXME: make this O(1) by using the alias-rejection method, or + some other sensible method of sampling from a discrete + distribution. */ + double thing = gsl_rng_uniform(rng); + unsigned sofar = 0; + for (unsigned int i = 0; i < dbH->numFiles; i++) { + sofar += propTable[i]; + if (thing < ((double) sofar / (double) total)) { + return i; + } + } + error("fell through in random_track()"); + + /* FIXME: decorate error's declaration so that this isn't necessary */ + return 0; +} + +void audioDB::sample(const char *dbName) { + initTables(dbName, 0); + + gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); + + /* FIXME: in Real Life we'll want to initialize the RNG using + /dev/random or the current time or something, like this: + + unsigned int seed; + int fd = open("/dev/urandom", O_RDONLY); + read(fd, &seed, 4); + + gsl_rng_set(rng, seed); + */ + + // build track offset table (FIXME: cut'n'pasted from query.cpp) + off_t *trackOffsetTable = new off_t[dbH->numFiles]; + unsigned cumTrack=0; + for(unsigned int k = 0; k < dbH->numFiles; k++){ + trackOffsetTable[k] = cumTrack; + cumTrack += trackTable[k] * dbH->dim; + } + + unsigned *propTable = new unsigned[dbH->numFiles]; + unsigned total = 0; + unsigned count = 0; + + for (unsigned int i = 0; i < dbH->numFiles; i++) { + /* what kind of a stupid language doesn't have binary max(), let + alone nary? */ + unsigned int prop = trackTable[i] - sequenceLength + 1; + prop = prop > 0 ? prop : 0; + if (prop > 0) + count++; + propTable[i] = prop; + total += prop; + } + + if (total == 0) { + error("no sequences of this sequence length in the database", dbName); + } + + unsigned int vlen = dbH->dim * sequenceLength; + double *v1 = new double[vlen]; + double *v2 = new double[vlen]; + double v1norm, v2norm, v1v2; + + double sumdist = 0; + double sumlogdist = 0; + + for (unsigned int i = 0; i < nsamples;) { + unsigned track1 = random_track(propTable, total, rng); + unsigned track2 = random_track(propTable, total, rng); + + if(track1 == track2) + continue; + + unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]); + unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]); + + VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2); + + /* FIXME: this seeking, reading and distance calculation should + share more code with the query loop */ + lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET); + read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double)); + + lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET); + read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double)); + + v1norm = 0; + v2norm = 0; + v1v2 = 0; + + for (unsigned int j = 0; j < vlen; j++) { + v1norm += v1[j]*v1[j]; + v2norm += v2[j]*v2[j]; + v1v2 += v1[j]*v2[j]; + } + + /* FIXME: we must deal with infinities better than this; there + could be all sorts of NaNs from arbitrary features. Best + include power thresholds or something... */ + if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) { + + VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2); + /* assume normalizedDistance == true for now */ + /* FIXME: not convinced that the statistics we calculated in + TASLP paper are technically valid for normalizedDistance */ + + double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm); + // double dist = v1norm + v2norm - 2*v1v2; + + VERB_LOG(1, "%f %f\n", dist, log(dist)); + sumdist += dist; + sumlogdist += log(dist); + i++; + } else { + VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2); + } + } + + /* FIXME: the mean isn't really what we should be reporting here */ + unsigned meanN = total / count; + + double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples); + double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); + + std::cout << "Summary statistics" << std::endl; + std::cout << "number of samples: " << nsamples << std::endl; + std::cout << "sum of distances (S): " << sumdist << std::endl; + std::cout << "sum of log distances (L): " << sumlogdist << std::endl; + + /* FIXME: we'll also want some more summary statistics based on + propTable, for the minimum-of-X estimate */ + std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; + std::cout << std::endl; + std::cout << "Estimated parameters" << std::endl; + std::cout << "sigma^2: " << sigma2 << "; "; + std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; + std::cout << "d: " << d << std::endl; + + double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); + double logxthresh = gsl_sf_log(sumdist / nsamples) + logw + - (2 / d) * gsl_sf_log(meanN) + - gsl_sf_log(d/2) + - (2 / d) * gsl_sf_log(2 / d) + + (2 / d) * gsl_sf_lngamma(d / 2); + + std::cout << "track xthresh: " << exp(logxthresh) << std::endl; + + delete[] propTable; + delete[] v1; + delete[] v2; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xthresh.c Wed Jul 02 14:07:10 2008 +0000 @@ -0,0 +1,27 @@ +#include <gsl/gsl_sf.h> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> + +int main(int argc, char *argv[]) { + if(argc != 4) { + fprintf(stderr, "Wrong number of arguments: %d\n", argc); + exit(1); + } + + long int meanN = strtol(argv[1], NULL, 10); + + double d = strtod(argv[2], NULL); + double sigma2 = strtod(argv[3], NULL); + + double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); + double logxthresh = gsl_sf_log(sigma2) + logw + - (2 / d) * gsl_sf_log(meanN) + - gsl_sf_log(d/2) + - (2 / d) * gsl_sf_log(2 / d) + + (2 / d) * gsl_sf_lngamma(d / 2); + + printf("w: %f\n", exp(logw)); + printf("x_thresh: %f\n", exp(logxthresh)); + exit(0); +}