annotate sample.cpp @ 601:82d23418d867

Fix some fd leaks in the command-line binary Strictly speaking, they're not really leaks, because the only codepath that suffers from these leaks exits immediately afterwards. On the other hand, this fix makes valgrind on e.g. tests/0025 happier, going from 5 errors to none.
author mas01cr
date Fri, 14 Aug 2009 16:39:32 +0000
parents e6dab5ed471c
children 536cfa209e7f
rev   line source
mas01cr@280 1 #include "audioDB.h"
mas01cr@280 2
mas01cr@280 3 #include <gsl/gsl_sf.h>
mas01cr@280 4 #include <gsl/gsl_rng.h>
mas01cr@280 5
mas01cr@280 6 static
mas01cr@280 7 double yfun(double d) {
mas01cr@280 8 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@280 9 }
mas01cr@280 10
mas01cr@280 11 static
mas01cr@280 12 double yinv(double y) {
mas01cr@280 13 double a = 1.0e-5;
mas01cr@280 14 double b = 1000.0;
mas01cr@280 15
mas01cr@280 16 double ay = yfun(a);
mas01cr@280 17 double by = yfun(b);
mas01cr@280 18
mas01cr@283 19 double c = 0;
mas01cr@283 20 double cy;
mas01cr@280 21
mas01cr@280 22 /* FIXME: simple binary search; there's probably some clever solver
mas01cr@280 23 in gsl somewhere which is less sucky. */
mas01cr@280 24 while ((b - a) > 1.0e-5) {
mas01cr@280 25 c = (a + b) / 2;
mas01cr@280 26 cy = yfun(c);
mas01cr@280 27 if (cy > y) {
mas01cr@280 28 a = c;
mas01cr@280 29 ay = cy;
mas01cr@280 30 } else {
mas01cr@280 31 b = c;
mas01cr@280 32 by = cy;
mas01cr@280 33 }
mas01cr@280 34 }
mas01cr@280 35
mas01cr@280 36 return c;
mas01cr@280 37 }
mas01cr@280 38
mas01cr@284 39 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@280 40 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@280 41 some other sensible method of sampling from a discrete
mas01cr@280 42 distribution. */
mas01cr@280 43 double thing = gsl_rng_uniform(rng);
mas01cr@280 44 unsigned sofar = 0;
mas01cr@280 45 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 46 sofar += propTable[i];
mas01cr@280 47 if (thing < ((double) sofar / (double) total)) {
mas01cr@280 48 return i;
mas01cr@280 49 }
mas01cr@280 50 }
mas01cr@280 51 error("fell through in random_track()");
mas01cr@280 52
mas01cr@280 53 }
mas01cr@280 54
mas01cr@280 55 void audioDB::sample(const char *dbName) {
mas01cr@280 56 initTables(dbName, 0);
mas01mc@324 57 if(dbH->flags & O2_FLAG_LARGE_ADB){
mas01mc@324 58 error("error: sample not yet supported for LARGE_ADB");
mas01mc@324 59 }
mas01mc@324 60
mas01cr@280 61 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@280 62 unsigned cumTrack=0;
mas01cr@280 63 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@280 64 trackOffsetTable[k] = cumTrack;
mas01cr@280 65 cumTrack += trackTable[k] * dbH->dim;
mas01cr@280 66 }
mas01cr@280 67
mas01cr@280 68 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@280 69 unsigned total = 0;
mas01cr@280 70 unsigned count = 0;
mas01cr@280 71
mas01cr@280 72 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 73 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@280 74 alone nary? */
mas01cr@280 75 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@280 76 prop = prop > 0 ? prop : 0;
mas01cr@280 77 if (prop > 0)
mas01cr@280 78 count++;
mas01cr@280 79 propTable[i] = prop;
mas01cr@280 80 total += prop;
mas01cr@280 81 }
mas01cr@280 82
mas01cr@280 83 if (total == 0) {
mas01cr@280 84 error("no sequences of this sequence length in the database", dbName);
mas01cr@280 85 }
mas01cr@280 86
mas01cr@280 87 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@280 88 double *v1 = new double[vlen];
mas01cr@280 89 double *v2 = new double[vlen];
mas01cr@280 90 double v1norm, v2norm, v1v2;
mas01cr@280 91
mas01cr@280 92 double sumdist = 0;
mas01cr@280 93 double sumlogdist = 0;
mas01cr@280 94
mas01cr@280 95 for (unsigned int i = 0; i < nsamples;) {
mas01cr@284 96 unsigned track1 = random_track(propTable, total);
mas01cr@284 97 unsigned track2 = random_track(propTable, total);
mas01cr@280 98
mas01cr@280 99 if(track1 == track2)
mas01cr@280 100 continue;
mas01cr@280 101
mas01cr@280 102 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
mas01cr@280 103 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
mas01cr@280 104
mas01cr@280 105 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@280 106
mas01cr@280 107 /* FIXME: this seeking, reading and distance calculation should
mas01cr@280 108 share more code with the query loop */
mas01cr@370 109 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@370 110 error("seek failure", "", "lseek");
mas01cr@370 111 }
mas01cr@370 112 CHECKED_READ(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 113
mas01cr@370 114 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@370 115 error("seek failure", "", "lseek");
mas01cr@370 116 }
mas01cr@370 117 CHECKED_READ(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 118
mas01cr@280 119 v1norm = 0;
mas01cr@280 120 v2norm = 0;
mas01cr@280 121 v1v2 = 0;
mas01cr@280 122
mas01cr@280 123 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@280 124 v1norm += v1[j]*v1[j];
mas01cr@280 125 v2norm += v2[j]*v2[j];
mas01cr@280 126 v1v2 += v1[j]*v2[j];
mas01cr@280 127 }
mas01cr@280 128
mas01cr@280 129 /* FIXME: we must deal with infinities better than this; there
mas01cr@280 130 could be all sorts of NaNs from arbitrary features. Best
mas01cr@280 131 include power thresholds or something... */
mas01cr@280 132 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@280 133
mas01cr@280 134 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@280 135 /* assume normalizedDistance == true for now */
mas01cr@280 136 /* FIXME: not convinced that the statistics we calculated in
mas01cr@280 137 TASLP paper are technically valid for normalizedDistance */
mas01cr@280 138
mas01cr@280 139 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@280 140 // double dist = v1norm + v2norm - 2*v1v2;
mas01cr@280 141
mas01cr@280 142 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@280 143 sumdist += dist;
mas01cr@280 144 sumlogdist += log(dist);
mas01cr@280 145 i++;
mas01cr@280 146 } else {
mas01cr@280 147 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@280 148 }
mas01cr@280 149 }
mas01cr@280 150
mas01cr@280 151 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@280 152 unsigned meanN = total / count;
mas01cr@280 153
mas01cr@280 154 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@280 155 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@280 156
mas01cr@280 157 std::cout << "Summary statistics" << std::endl;
mas01cr@280 158 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@280 159 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@280 160 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@280 161
mas01cr@280 162 /* FIXME: we'll also want some more summary statistics based on
mas01cr@280 163 propTable, for the minimum-of-X estimate */
mas01cr@280 164 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@280 165 std::cout << std::endl;
mas01cr@280 166 std::cout << "Estimated parameters" << std::endl;
mas01cr@280 167 std::cout << "sigma^2: " << sigma2 << "; ";
mas01cr@280 168 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
mas01cr@280 169 std::cout << "d: " << d << std::endl;
mas01cr@280 170
mas01cr@280 171 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@280 172 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@280 173 - (2 / d) * gsl_sf_log(meanN)
mas01cr@280 174 - gsl_sf_log(d/2)
mas01cr@280 175 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@280 176 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@280 177
mas01cr@280 178 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@280 179
mas01cr@280 180 delete[] propTable;
mas01cr@280 181 delete[] v1;
mas01cr@280 182 delete[] v2;
mas01cr@280 183 }