annotate sample.cpp @ 270:9636040ff503 sampling

Parametrize nsamples (though not on the command-line yet) Compute the mean number of possible sequences in tracks, and display that. Compute the track xthreshold as in TASLP paper. (Not quite getting the same answers, though).
author mas01cr
date Mon, 16 Jun 2008 11:59:43 +0000
parents dcbb30790b30
children 1f2c7d5e581c
rev   line source
mas01cr@266 1 #include "audioDB.h"
mas01cr@266 2
mas01cr@268 3 #include <gsl/gsl_sf.h>
mas01cr@268 4
mas01cr@268 5 static double yfun(double d) {
mas01cr@268 6 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@268 7 }
mas01cr@268 8
mas01cr@268 9 static double yinv(double y) {
mas01cr@268 10 double a = 1.0e-5;
mas01cr@268 11 double b = 1000.0;
mas01cr@268 12
mas01cr@268 13 double ay = yfun(a);
mas01cr@268 14 double by = yfun(b);
mas01cr@268 15
mas01cr@268 16 double c, cy;
mas01cr@268 17
mas01cr@268 18 /* FIXME: simple binary search */
mas01cr@268 19 while ((b - a) > 1.0e-5) {
mas01cr@268 20 c = (a + b) / 2;
mas01cr@268 21 cy = yfun(c);
mas01cr@268 22 if (cy > y) {
mas01cr@268 23 a = c;
mas01cr@268 24 ay = cy;
mas01cr@268 25 } else {
mas01cr@268 26 b = c;
mas01cr@268 27 by = cy;
mas01cr@268 28 }
mas01cr@268 29 }
mas01cr@268 30
mas01cr@268 31 return c;
mas01cr@268 32 }
mas01cr@268 33
mas01cr@266 34 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@266 35 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@266 36 some other sensible method of sampling from a discrete
mas01cr@266 37 distribution. */
mas01cr@266 38 /* FIXME: use a real random number generator, not random() */
mas01cr@266 39 double thing = random() / (double) RAND_MAX;
mas01cr@266 40 unsigned sofar = 0;
mas01cr@266 41 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@266 42 sofar += propTable[i];
mas01cr@266 43 if (thing < ((double) sofar / (double) total)) {
mas01cr@266 44 return i;
mas01cr@266 45 }
mas01cr@266 46 }
mas01cr@266 47 error("fell through in random_track()");
mas01cr@266 48
mas01cr@266 49 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@266 50 return 0;
mas01cr@266 51 }
mas01cr@266 52
mas01cr@266 53 void audioDB::sample(const char *dbName) {
mas01cr@266 54 initTables(dbName, 0);
mas01cr@266 55
mas01cr@266 56 // build track offset table (FIXME: cut'n'pasted from query.cpp)
mas01cr@266 57 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@266 58 unsigned cumTrack=0;
mas01cr@266 59 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@266 60 trackOffsetTable[k] = cumTrack;
mas01cr@266 61 cumTrack += trackTable[k] * dbH->dim;
mas01cr@266 62 }
mas01cr@266 63
mas01cr@266 64 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@266 65 unsigned total = 0;
mas01cr@270 66 unsigned count = 0;
mas01cr@266 67
mas01cr@266 68 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@266 69 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@266 70 alone nary? */
mas01cr@266 71 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@266 72 prop = prop > 0 ? prop : 0;
mas01cr@270 73 if (prop > 0)
mas01cr@270 74 count++;
mas01cr@266 75 propTable[i] = prop;
mas01cr@266 76 total += prop;
mas01cr@266 77 }
mas01cr@266 78
mas01cr@266 79 if (total == 0) {
mas01cr@266 80 error("no sequences of this sequence length in the database", dbName);
mas01cr@266 81 }
mas01cr@266 82
mas01cr@266 83 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@266 84 double *v1 = new double[vlen];
mas01cr@266 85 double *v2 = new double[vlen];
mas01cr@266 86 double v1norm, v2norm, v1v2;
mas01cr@266 87
mas01cr@266 88 double sumdist = 0;
mas01cr@266 89 double sumlogdist = 0;
mas01cr@266 90
mas01cr@270 91 unsigned int nsamples = 2049;
mas01cr@270 92
mas01cr@270 93 for (unsigned int i = 0; i < nsamples;) {
mas01cr@266 94 /* FIXME: in Real Life we'll want to initialize the RNG using
mas01cr@266 95 /dev/random or the current time or something. */
mas01cr@266 96 unsigned track1 = random_track(propTable, total);
mas01cr@266 97 unsigned track2 = random_track(propTable, total);
mas01cr@266 98
mas01cr@266 99 /* FIXME: this uses lower-order bits, which is OK on Linux but not
mas01cr@266 100 necessarily elsewhere. Again, use a real random number
mas01cr@266 101 generator */
mas01cr@266 102 unsigned i1 = random() % propTable[track1];
mas01cr@266 103 unsigned i2 = random() % propTable[track2];
mas01cr@266 104
mas01cr@266 105 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@266 106
mas01cr@266 107 /* FIXME: this seeking, reading and distance calculation should
mas01cr@266 108 share more code with the query loop */
mas01cr@266 109 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@266 110 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@266 111
mas01cr@266 112 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@266 113 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@266 114
mas01cr@266 115 v1norm = 0;
mas01cr@266 116 v2norm = 0;
mas01cr@266 117 v1v2 = 0;
mas01cr@266 118
mas01cr@266 119 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@266 120 v1norm += v1[j]*v1[j];
mas01cr@266 121 v2norm += v2[j]*v2[j];
mas01cr@266 122 v1v2 += v1[j]*v2[j];
mas01cr@266 123 }
mas01cr@266 124
mas01cr@266 125 /* FIXME: we must deal with infinities better than this; there
mas01cr@266 126 could be all sorts of NaNs from arbitrary features. Best
mas01cr@266 127 include power thresholds or something... */
mas01cr@266 128 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@266 129
mas01cr@266 130 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@266 131 /* assume normalizedDistance == true for now */
mas01cr@266 132 /* FIXME: not convinced that the statistics we calculated in
mas01cr@266 133 TASLP paper are valid for normalizedDistance */
mas01cr@269 134 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@266 135 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@266 136 sumdist += dist;
mas01cr@266 137 sumlogdist += log(dist);
mas01cr@266 138 i++;
mas01cr@266 139 } else {
mas01cr@266 140 VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@266 141 }
mas01cr@266 142 }
mas01cr@266 143
mas01cr@270 144 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@270 145 unsigned meanN = total / count;
mas01cr@270 146
mas01cr@270 147 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@270 148 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@268 149
mas01cr@266 150 std::cout << "Summary statistics" << std::endl;
mas01cr@270 151 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@266 152 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@266 153 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@270 154 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@268 155 std::cout << std::endl;
mas01cr@268 156 std::cout << "Estimated parameters" << std::endl;
mas01cr@268 157 std::cout << "sigma^2: " << sigma2 << std::endl;
mas01cr@268 158 std::cout << "d: " << d << std::endl;
mas01cr@270 159
mas01cr@270 160 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@270 161 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@270 162 - (2 / d) * gsl_sf_log(meanN)
mas01cr@270 163 - gsl_sf_log(d/2)
mas01cr@270 164 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@270 165 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@270 166
mas01cr@270 167 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@266 168
mas01cr@266 169 /* FIXME: we'll also want some summary statistics based on
mas01cr@266 170 propTable, for the minimum-of-X estimate */
mas01cr@266 171
mas01cr@266 172 delete[] propTable;
mas01cr@266 173 delete[] v1;
mas01cr@266 174 delete[] v2;
mas01cr@266 175 }