Mercurial > hg > audiodb
view sample.cpp @ 266:4ffa05f25a00 sampling
Add initial sampling of database distances. Zillions of FIXME comments
everywhere.
author | mas01cr |
---|---|
date | Sat, 14 Jun 2008 17:13:26 +0000 |
parents | |
children | 30a2a45f2b70 |
line wrap: on
line source
#include "audioDB.h" unsigned audioDB::random_track(unsigned *propTable, unsigned total) { /* FIXME: make this O(1) by using the alias-rejection method, or some other sensible method of sampling from a discrete distribution. */ /* FIXME: use a real random number generator, not random() */ double thing = random() / (double) RAND_MAX; unsigned sofar = 0; for (unsigned int i = 0; i < dbH->numFiles; i++) { sofar += propTable[i]; if (thing < ((double) sofar / (double) total)) { return i; } } error("fell through in random_track()"); /* FIXME: decorate error's declaration so that this isn't necessary */ return 0; } void audioDB::sample(const char *dbName) { initTables(dbName, 0); // build track offset table (FIXME: cut'n'pasted from query.cpp) off_t *trackOffsetTable = new off_t[dbH->numFiles]; unsigned cumTrack=0; for(unsigned int k = 0; k < dbH->numFiles; k++){ trackOffsetTable[k] = cumTrack; cumTrack += trackTable[k] * dbH->dim; } unsigned *propTable = new unsigned[dbH->numFiles]; unsigned total = 0; for (unsigned int i = 0; i < dbH->numFiles; i++) { /* what kind of a stupid language doesn't have binary max(), let alone nary? */ unsigned int prop = trackTable[i] - sequenceLength + 1; prop = prop > 0 ? prop : 0; propTable[i] = prop; total += prop; } if (total == 0) { error("no sequences of this sequence length in the database", dbName); } unsigned int vlen = dbH->dim * sequenceLength; double *v1 = new double[vlen]; double *v2 = new double[vlen]; double v1norm, v2norm, v1v2; double sumdist = 0; double sumlogdist = 0; /* 1037 samples for now */ for (unsigned int i = 0; i < 1037;) { /* FIXME: in Real Life we'll want to initialize the RNG using /dev/random or the current time or something. */ unsigned track1 = random_track(propTable, total); unsigned track2 = random_track(propTable, total); /* FIXME: this uses lower-order bits, which is OK on Linux but not necessarily elsewhere. Again, use a real random number generator */ unsigned i1 = random() % propTable[track1]; unsigned i2 = random() % propTable[track2]; VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2); /* FIXME: this seeking, reading and distance calculation should share more code with the query loop */ lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET); read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double)); lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET); read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double)); v1norm = 0; v2norm = 0; v1v2 = 0; for (unsigned int j = 0; j < vlen; j++) { v1norm += v1[j]*v1[j]; v2norm += v2[j]*v2[j]; v1v2 += v1[j]*v2[j]; } /* FIXME: we must deal with infinities better than this; there could be all sorts of NaNs from arbitrary features. Best include power thresholds or something... */ if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) { VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2); /* assume normalizedDistance == true for now */ /* FIXME: not convinced that the statistics we calculated in TASLP paper are valid for normalizedDistance */ double dist = 2 - v1v2 / sqrt(v1norm * v2norm); VERB_LOG(1, "%f %f\n", dist, log(dist)); sumdist += dist; sumlogdist += log(dist); i++; } else { VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2); } } std::cout << "Summary statistics" << std::endl; std::cout << "number of samples: " << 1037 << std::endl; std::cout << "sum of distances (S): " << sumdist << std::endl; std::cout << "sum of log distances (L): " << sumlogdist << std::endl; /* FIXME: we'll also want some summary statistics based on propTable, for the minimum-of-X estimate */ delete[] propTable; delete[] v1; delete[] v2; }