annotate sample.cpp @ 284:cacad987d785

Really finish with the sampling branch, this time merging all of it, yes, even the last revision. (The last revision was one that seeded the RNG with the current time, which is helpful when trying to get multiple independent-ish samples from the same database...)
author mas01cr
date Mon, 07 Jul 2008 08:57:06 +0000
parents 0e44de38d675
children b9eff6896943
rev   line source
mas01cr@280 1 #include "audioDB.h"
mas01cr@280 2
mas01cr@280 3 #include <gsl/gsl_sf.h>
mas01cr@280 4 #include <gsl/gsl_rng.h>
mas01cr@280 5
mas01cr@280 6 static
mas01cr@280 7 double yfun(double d) {
mas01cr@280 8 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@280 9 }
mas01cr@280 10
mas01cr@280 11 static
mas01cr@280 12 double yinv(double y) {
mas01cr@280 13 double a = 1.0e-5;
mas01cr@280 14 double b = 1000.0;
mas01cr@280 15
mas01cr@280 16 double ay = yfun(a);
mas01cr@280 17 double by = yfun(b);
mas01cr@280 18
mas01cr@283 19 double c = 0;
mas01cr@283 20 double cy;
mas01cr@280 21
mas01cr@280 22 /* FIXME: simple binary search; there's probably some clever solver
mas01cr@280 23 in gsl somewhere which is less sucky. */
mas01cr@280 24 while ((b - a) > 1.0e-5) {
mas01cr@280 25 c = (a + b) / 2;
mas01cr@280 26 cy = yfun(c);
mas01cr@280 27 if (cy > y) {
mas01cr@280 28 a = c;
mas01cr@280 29 ay = cy;
mas01cr@280 30 } else {
mas01cr@280 31 b = c;
mas01cr@280 32 by = cy;
mas01cr@280 33 }
mas01cr@280 34 }
mas01cr@280 35
mas01cr@280 36 return c;
mas01cr@280 37 }
mas01cr@280 38
mas01cr@284 39 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@280 40 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@280 41 some other sensible method of sampling from a discrete
mas01cr@280 42 distribution. */
mas01cr@280 43 double thing = gsl_rng_uniform(rng);
mas01cr@280 44 unsigned sofar = 0;
mas01cr@280 45 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 46 sofar += propTable[i];
mas01cr@280 47 if (thing < ((double) sofar / (double) total)) {
mas01cr@280 48 return i;
mas01cr@280 49 }
mas01cr@280 50 }
mas01cr@280 51 error("fell through in random_track()");
mas01cr@280 52
mas01cr@280 53 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@280 54 return 0;
mas01cr@280 55 }
mas01cr@280 56
mas01cr@280 57 void audioDB::sample(const char *dbName) {
mas01cr@280 58 initTables(dbName, 0);
mas01cr@280 59
mas01cr@280 60 // build track offset table (FIXME: cut'n'pasted from query.cpp)
mas01cr@280 61 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@280 62 unsigned cumTrack=0;
mas01cr@280 63 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@280 64 trackOffsetTable[k] = cumTrack;
mas01cr@280 65 cumTrack += trackTable[k] * dbH->dim;
mas01cr@280 66 }
mas01cr@280 67
mas01cr@280 68 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@280 69 unsigned total = 0;
mas01cr@280 70 unsigned count = 0;
mas01cr@280 71
mas01cr@280 72 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 73 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@280 74 alone nary? */
mas01cr@280 75 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@280 76 prop = prop > 0 ? prop : 0;
mas01cr@280 77 if (prop > 0)
mas01cr@280 78 count++;
mas01cr@280 79 propTable[i] = prop;
mas01cr@280 80 total += prop;
mas01cr@280 81 }
mas01cr@280 82
mas01cr@280 83 if (total == 0) {
mas01cr@280 84 error("no sequences of this sequence length in the database", dbName);
mas01cr@280 85 }
mas01cr@280 86
mas01cr@280 87 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@280 88 double *v1 = new double[vlen];
mas01cr@280 89 double *v2 = new double[vlen];
mas01cr@280 90 double v1norm, v2norm, v1v2;
mas01cr@280 91
mas01cr@280 92 double sumdist = 0;
mas01cr@280 93 double sumlogdist = 0;
mas01cr@280 94
mas01cr@280 95 for (unsigned int i = 0; i < nsamples;) {
mas01cr@284 96 unsigned track1 = random_track(propTable, total);
mas01cr@284 97 unsigned track2 = random_track(propTable, total);
mas01cr@280 98
mas01cr@280 99 if(track1 == track2)
mas01cr@280 100 continue;
mas01cr@280 101
mas01cr@280 102 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
mas01cr@280 103 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
mas01cr@280 104
mas01cr@280 105 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@280 106
mas01cr@280 107 /* FIXME: this seeking, reading and distance calculation should
mas01cr@280 108 share more code with the query loop */
mas01cr@280 109 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@280 110 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 111
mas01cr@280 112 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@280 113 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 114
mas01cr@280 115 v1norm = 0;
mas01cr@280 116 v2norm = 0;
mas01cr@280 117 v1v2 = 0;
mas01cr@280 118
mas01cr@280 119 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@280 120 v1norm += v1[j]*v1[j];
mas01cr@280 121 v2norm += v2[j]*v2[j];
mas01cr@280 122 v1v2 += v1[j]*v2[j];
mas01cr@280 123 }
mas01cr@280 124
mas01cr@280 125 /* FIXME: we must deal with infinities better than this; there
mas01cr@280 126 could be all sorts of NaNs from arbitrary features. Best
mas01cr@280 127 include power thresholds or something... */
mas01cr@280 128 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@280 129
mas01cr@280 130 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@280 131 /* assume normalizedDistance == true for now */
mas01cr@280 132 /* FIXME: not convinced that the statistics we calculated in
mas01cr@280 133 TASLP paper are technically valid for normalizedDistance */
mas01cr@280 134
mas01cr@280 135 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@280 136 // double dist = v1norm + v2norm - 2*v1v2;
mas01cr@280 137
mas01cr@280 138 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@280 139 sumdist += dist;
mas01cr@280 140 sumlogdist += log(dist);
mas01cr@280 141 i++;
mas01cr@280 142 } else {
mas01cr@280 143 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@280 144 }
mas01cr@280 145 }
mas01cr@280 146
mas01cr@280 147 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@280 148 unsigned meanN = total / count;
mas01cr@280 149
mas01cr@280 150 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@280 151 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@280 152
mas01cr@280 153 std::cout << "Summary statistics" << std::endl;
mas01cr@280 154 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@280 155 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@280 156 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@280 157
mas01cr@280 158 /* FIXME: we'll also want some more summary statistics based on
mas01cr@280 159 propTable, for the minimum-of-X estimate */
mas01cr@280 160 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@280 161 std::cout << std::endl;
mas01cr@280 162 std::cout << "Estimated parameters" << std::endl;
mas01cr@280 163 std::cout << "sigma^2: " << sigma2 << "; ";
mas01cr@280 164 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
mas01cr@280 165 std::cout << "d: " << d << std::endl;
mas01cr@280 166
mas01cr@280 167 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@280 168 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@280 169 - (2 / d) * gsl_sf_log(meanN)
mas01cr@280 170 - gsl_sf_log(d/2)
mas01cr@280 171 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@280 172 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@280 173
mas01cr@280 174 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@280 175
mas01cr@280 176 delete[] propTable;
mas01cr@280 177 delete[] v1;
mas01cr@280 178 delete[] v2;
mas01cr@280 179 }