annotate sample.cpp @ 266:4ffa05f25a00 sampling

Add initial sampling of database distances. Zillions of FIXME comments everywhere.
author mas01cr
date Sat, 14 Jun 2008 17:13:26 +0000
parents
children 30a2a45f2b70
rev   line source
mas01cr@266 1 #include "audioDB.h"
mas01cr@266 2
mas01cr@266 3 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@266 4 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@266 5 some other sensible method of sampling from a discrete
mas01cr@266 6 distribution. */
mas01cr@266 7 /* FIXME: use a real random number generator, not random() */
mas01cr@266 8 double thing = random() / (double) RAND_MAX;
mas01cr@266 9 unsigned sofar = 0;
mas01cr@266 10 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@266 11 sofar += propTable[i];
mas01cr@266 12 if (thing < ((double) sofar / (double) total)) {
mas01cr@266 13 return i;
mas01cr@266 14 }
mas01cr@266 15 }
mas01cr@266 16 error("fell through in random_track()");
mas01cr@266 17
mas01cr@266 18 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@266 19 return 0;
mas01cr@266 20 }
mas01cr@266 21
mas01cr@266 22 void audioDB::sample(const char *dbName) {
mas01cr@266 23 initTables(dbName, 0);
mas01cr@266 24
mas01cr@266 25 // build track offset table (FIXME: cut'n'pasted from query.cpp)
mas01cr@266 26 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@266 27 unsigned cumTrack=0;
mas01cr@266 28 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@266 29 trackOffsetTable[k] = cumTrack;
mas01cr@266 30 cumTrack += trackTable[k] * dbH->dim;
mas01cr@266 31 }
mas01cr@266 32
mas01cr@266 33 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@266 34 unsigned total = 0;
mas01cr@266 35
mas01cr@266 36 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@266 37 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@266 38 alone nary? */
mas01cr@266 39 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@266 40 prop = prop > 0 ? prop : 0;
mas01cr@266 41 propTable[i] = prop;
mas01cr@266 42 total += prop;
mas01cr@266 43 }
mas01cr@266 44
mas01cr@266 45 if (total == 0) {
mas01cr@266 46 error("no sequences of this sequence length in the database", dbName);
mas01cr@266 47 }
mas01cr@266 48
mas01cr@266 49 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@266 50 double *v1 = new double[vlen];
mas01cr@266 51 double *v2 = new double[vlen];
mas01cr@266 52 double v1norm, v2norm, v1v2;
mas01cr@266 53
mas01cr@266 54 double sumdist = 0;
mas01cr@266 55 double sumlogdist = 0;
mas01cr@266 56
mas01cr@266 57 /* 1037 samples for now */
mas01cr@266 58 for (unsigned int i = 0; i < 1037;) {
mas01cr@266 59 /* FIXME: in Real Life we'll want to initialize the RNG using
mas01cr@266 60 /dev/random or the current time or something. */
mas01cr@266 61 unsigned track1 = random_track(propTable, total);
mas01cr@266 62 unsigned track2 = random_track(propTable, total);
mas01cr@266 63
mas01cr@266 64 /* FIXME: this uses lower-order bits, which is OK on Linux but not
mas01cr@266 65 necessarily elsewhere. Again, use a real random number
mas01cr@266 66 generator */
mas01cr@266 67 unsigned i1 = random() % propTable[track1];
mas01cr@266 68 unsigned i2 = random() % propTable[track2];
mas01cr@266 69
mas01cr@266 70 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@266 71
mas01cr@266 72 /* FIXME: this seeking, reading and distance calculation should
mas01cr@266 73 share more code with the query loop */
mas01cr@266 74 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@266 75 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@266 76
mas01cr@266 77 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@266 78 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@266 79
mas01cr@266 80 v1norm = 0;
mas01cr@266 81 v2norm = 0;
mas01cr@266 82 v1v2 = 0;
mas01cr@266 83
mas01cr@266 84 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@266 85 v1norm += v1[j]*v1[j];
mas01cr@266 86 v2norm += v2[j]*v2[j];
mas01cr@266 87 v1v2 += v1[j]*v2[j];
mas01cr@266 88 }
mas01cr@266 89
mas01cr@266 90 /* FIXME: we must deal with infinities better than this; there
mas01cr@266 91 could be all sorts of NaNs from arbitrary features. Best
mas01cr@266 92 include power thresholds or something... */
mas01cr@266 93 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@266 94
mas01cr@266 95 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@266 96 /* assume normalizedDistance == true for now */
mas01cr@266 97 /* FIXME: not convinced that the statistics we calculated in
mas01cr@266 98 TASLP paper are valid for normalizedDistance */
mas01cr@266 99 double dist = 2 - v1v2 / sqrt(v1norm * v2norm);
mas01cr@266 100 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@266 101 sumdist += dist;
mas01cr@266 102 sumlogdist += log(dist);
mas01cr@266 103 i++;
mas01cr@266 104 } else {
mas01cr@266 105 VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@266 106 }
mas01cr@266 107 }
mas01cr@266 108
mas01cr@266 109 std::cout << "Summary statistics" << std::endl;
mas01cr@266 110 std::cout << "number of samples: " << 1037 << std::endl;
mas01cr@266 111 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@266 112 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@266 113
mas01cr@266 114 /* FIXME: we'll also want some summary statistics based on
mas01cr@266 115 propTable, for the minimum-of-X estimate */
mas01cr@266 116
mas01cr@266 117 delete[] propTable;
mas01cr@266 118 delete[] v1;
mas01cr@266 119 delete[] v2;
mas01cr@266 120 }