annotate sample.cpp @ 497:9d8aee621afb api-inversion

More libtests fixups. Include audiodb_close() calls everywhere (whoops). Add the facility to run tests under valgrind. Unfortunately the error-exitcode flag doesn't actually cause an error exit if the only thing wrong is memory leaks, but it will if there are actual memory errors, which is a start.
author mas01cr
date Sat, 10 Jan 2009 16:07:43 +0000
parents 0c1c8726a79b
children e6dab5ed471c
rev   line source
mas01cr@280 1 #include "audioDB.h"
mas01cr@280 2
mas01cr@280 3 #include <gsl/gsl_sf.h>
mas01cr@280 4 #include <gsl/gsl_rng.h>
mas01cr@280 5
mas01cr@280 6 static
mas01cr@280 7 double yfun(double d) {
mas01cr@280 8 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@280 9 }
mas01cr@280 10
mas01cr@280 11 static
mas01cr@280 12 double yinv(double y) {
mas01cr@280 13 double a = 1.0e-5;
mas01cr@280 14 double b = 1000.0;
mas01cr@280 15
mas01cr@280 16 double ay = yfun(a);
mas01cr@280 17 double by = yfun(b);
mas01cr@280 18
mas01cr@283 19 double c = 0;
mas01cr@283 20 double cy;
mas01cr@280 21
mas01cr@280 22 /* FIXME: simple binary search; there's probably some clever solver
mas01cr@280 23 in gsl somewhere which is less sucky. */
mas01cr@280 24 while ((b - a) > 1.0e-5) {
mas01cr@280 25 c = (a + b) / 2;
mas01cr@280 26 cy = yfun(c);
mas01cr@280 27 if (cy > y) {
mas01cr@280 28 a = c;
mas01cr@280 29 ay = cy;
mas01cr@280 30 } else {
mas01cr@280 31 b = c;
mas01cr@280 32 by = cy;
mas01cr@280 33 }
mas01cr@280 34 }
mas01cr@280 35
mas01cr@280 36 return c;
mas01cr@280 37 }
mas01cr@280 38
mas01cr@284 39 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@280 40 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@280 41 some other sensible method of sampling from a discrete
mas01cr@280 42 distribution. */
mas01cr@280 43 double thing = gsl_rng_uniform(rng);
mas01cr@280 44 unsigned sofar = 0;
mas01cr@280 45 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 46 sofar += propTable[i];
mas01cr@280 47 if (thing < ((double) sofar / (double) total)) {
mas01cr@280 48 return i;
mas01cr@280 49 }
mas01cr@280 50 }
mas01cr@280 51 error("fell through in random_track()");
mas01cr@280 52
mas01cr@280 53 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@280 54 return 0;
mas01cr@280 55 }
mas01cr@280 56
mas01cr@280 57 void audioDB::sample(const char *dbName) {
mas01cr@280 58 initTables(dbName, 0);
mas01mc@324 59 if(dbH->flags & O2_FLAG_LARGE_ADB){
mas01mc@324 60 error("error: sample not yet supported for LARGE_ADB");
mas01mc@324 61 }
mas01mc@324 62
mas01cr@280 63 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@280 64 unsigned cumTrack=0;
mas01cr@280 65 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@280 66 trackOffsetTable[k] = cumTrack;
mas01cr@280 67 cumTrack += trackTable[k] * dbH->dim;
mas01cr@280 68 }
mas01cr@280 69
mas01cr@280 70 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@280 71 unsigned total = 0;
mas01cr@280 72 unsigned count = 0;
mas01cr@280 73
mas01cr@280 74 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 75 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@280 76 alone nary? */
mas01cr@280 77 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@280 78 prop = prop > 0 ? prop : 0;
mas01cr@280 79 if (prop > 0)
mas01cr@280 80 count++;
mas01cr@280 81 propTable[i] = prop;
mas01cr@280 82 total += prop;
mas01cr@280 83 }
mas01cr@280 84
mas01cr@280 85 if (total == 0) {
mas01cr@280 86 error("no sequences of this sequence length in the database", dbName);
mas01cr@280 87 }
mas01cr@280 88
mas01cr@280 89 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@280 90 double *v1 = new double[vlen];
mas01cr@280 91 double *v2 = new double[vlen];
mas01cr@280 92 double v1norm, v2norm, v1v2;
mas01cr@280 93
mas01cr@280 94 double sumdist = 0;
mas01cr@280 95 double sumlogdist = 0;
mas01cr@280 96
mas01cr@280 97 for (unsigned int i = 0; i < nsamples;) {
mas01cr@284 98 unsigned track1 = random_track(propTable, total);
mas01cr@284 99 unsigned track2 = random_track(propTable, total);
mas01cr@280 100
mas01cr@280 101 if(track1 == track2)
mas01cr@280 102 continue;
mas01cr@280 103
mas01cr@280 104 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
mas01cr@280 105 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
mas01cr@280 106
mas01cr@280 107 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@280 108
mas01cr@280 109 /* FIXME: this seeking, reading and distance calculation should
mas01cr@280 110 share more code with the query loop */
mas01cr@370 111 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@370 112 error("seek failure", "", "lseek");
mas01cr@370 113 }
mas01cr@370 114 CHECKED_READ(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 115
mas01cr@370 116 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@370 117 error("seek failure", "", "lseek");
mas01cr@370 118 }
mas01cr@370 119 CHECKED_READ(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 120
mas01cr@280 121 v1norm = 0;
mas01cr@280 122 v2norm = 0;
mas01cr@280 123 v1v2 = 0;
mas01cr@280 124
mas01cr@280 125 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@280 126 v1norm += v1[j]*v1[j];
mas01cr@280 127 v2norm += v2[j]*v2[j];
mas01cr@280 128 v1v2 += v1[j]*v2[j];
mas01cr@280 129 }
mas01cr@280 130
mas01cr@280 131 /* FIXME: we must deal with infinities better than this; there
mas01cr@280 132 could be all sorts of NaNs from arbitrary features. Best
mas01cr@280 133 include power thresholds or something... */
mas01cr@280 134 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@280 135
mas01cr@280 136 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@280 137 /* assume normalizedDistance == true for now */
mas01cr@280 138 /* FIXME: not convinced that the statistics we calculated in
mas01cr@280 139 TASLP paper are technically valid for normalizedDistance */
mas01cr@280 140
mas01cr@280 141 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@280 142 // double dist = v1norm + v2norm - 2*v1v2;
mas01cr@280 143
mas01cr@280 144 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@280 145 sumdist += dist;
mas01cr@280 146 sumlogdist += log(dist);
mas01cr@280 147 i++;
mas01cr@280 148 } else {
mas01cr@280 149 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@280 150 }
mas01cr@280 151 }
mas01cr@280 152
mas01cr@280 153 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@280 154 unsigned meanN = total / count;
mas01cr@280 155
mas01cr@280 156 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@280 157 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@280 158
mas01cr@280 159 std::cout << "Summary statistics" << std::endl;
mas01cr@280 160 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@280 161 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@280 162 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@280 163
mas01cr@280 164 /* FIXME: we'll also want some more summary statistics based on
mas01cr@280 165 propTable, for the minimum-of-X estimate */
mas01cr@280 166 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@280 167 std::cout << std::endl;
mas01cr@280 168 std::cout << "Estimated parameters" << std::endl;
mas01cr@280 169 std::cout << "sigma^2: " << sigma2 << "; ";
mas01cr@280 170 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
mas01cr@280 171 std::cout << "d: " << d << std::endl;
mas01cr@280 172
mas01cr@280 173 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@280 174 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@280 175 - (2 / d) * gsl_sf_log(meanN)
mas01cr@280 176 - gsl_sf_log(d/2)
mas01cr@280 177 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@280 178 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@280 179
mas01cr@280 180 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@280 181
mas01cr@280 182 delete[] propTable;
mas01cr@280 183 delete[] v1;
mas01cr@280 184 delete[] v2;
mas01cr@280 185 }