annotate sample.cpp @ 524:469b50a3dd84 multiprobeLSH

Fixed a bug in LSH hashtable writing to disk that doesn't always sort the t2 entries into strict weak ordering. Now it does. Lots of debugging informational code inserted.
author mas01mc
date Wed, 28 Jan 2009 16:02:17 +0000
parents 342822c2d49a
children e6dab5ed471c
rev   line source
mas01cr@280 1 #include "audioDB.h"
mas01cr@280 2
mas01cr@280 3 #include <gsl/gsl_sf.h>
mas01cr@280 4 #include <gsl/gsl_rng.h>
mas01cr@280 5
mas01cr@280 6 static
mas01cr@280 7 double yfun(double d) {
mas01cr@280 8 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@280 9 }
mas01cr@280 10
mas01cr@280 11 static
mas01cr@280 12 double yinv(double y) {
mas01cr@280 13 double a = 1.0e-5;
mas01cr@280 14 double b = 1000.0;
mas01cr@280 15
mas01cr@280 16 double ay = yfun(a);
mas01cr@280 17 double by = yfun(b);
mas01cr@280 18
mas01cr@283 19 double c = 0;
mas01cr@283 20 double cy;
mas01cr@280 21
mas01cr@280 22 /* FIXME: simple binary search; there's probably some clever solver
mas01cr@280 23 in gsl somewhere which is less sucky. */
mas01cr@280 24 while ((b - a) > 1.0e-5) {
mas01cr@280 25 c = (a + b) / 2;
mas01cr@280 26 cy = yfun(c);
mas01cr@280 27 if (cy > y) {
mas01cr@280 28 a = c;
mas01cr@280 29 ay = cy;
mas01cr@280 30 } else {
mas01cr@280 31 b = c;
mas01cr@280 32 by = cy;
mas01cr@280 33 }
mas01cr@280 34 }
mas01cr@280 35
mas01cr@280 36 return c;
mas01cr@280 37 }
mas01cr@280 38
mas01cr@284 39 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@280 40 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@280 41 some other sensible method of sampling from a discrete
mas01cr@280 42 distribution. */
mas01cr@280 43 double thing = gsl_rng_uniform(rng);
mas01cr@280 44 unsigned sofar = 0;
mas01cr@280 45 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 46 sofar += propTable[i];
mas01cr@280 47 if (thing < ((double) sofar / (double) total)) {
mas01cr@280 48 return i;
mas01cr@280 49 }
mas01cr@280 50 }
mas01cr@280 51 error("fell through in random_track()");
mas01cr@280 52
mas01cr@280 53 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@280 54 return 0;
mas01cr@280 55 }
mas01cr@280 56
mas01cr@280 57 void audioDB::sample(const char *dbName) {
mas01cr@280 58 initTables(dbName, 0);
mas01mc@324 59 if(dbH->flags & O2_FLAG_LARGE_ADB){
mas01mc@324 60 error("error: sample not yet supported for LARGE_ADB");
mas01mc@324 61 }
mas01mc@324 62
mas01cr@280 63 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@280 64 unsigned cumTrack=0;
mas01cr@280 65 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@280 66 trackOffsetTable[k] = cumTrack;
mas01cr@280 67 cumTrack += trackTable[k] * dbH->dim;
mas01cr@280 68 }
mas01cr@280 69
mas01cr@280 70 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@280 71 unsigned total = 0;
mas01cr@280 72 unsigned count = 0;
mas01cr@280 73
mas01cr@280 74 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 75 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@280 76 alone nary? */
mas01cr@280 77 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@280 78 prop = prop > 0 ? prop : 0;
mas01cr@280 79 if (prop > 0)
mas01cr@280 80 count++;
mas01cr@280 81 propTable[i] = prop;
mas01cr@280 82 total += prop;
mas01cr@280 83 }
mas01cr@280 84
mas01cr@280 85 if (total == 0) {
mas01cr@280 86 error("no sequences of this sequence length in the database", dbName);
mas01cr@280 87 }
mas01cr@280 88
mas01cr@280 89 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@280 90 double *v1 = new double[vlen];
mas01cr@280 91 double *v2 = new double[vlen];
mas01cr@280 92 double v1norm, v2norm, v1v2;
mas01cr@280 93
mas01cr@280 94 double sumdist = 0;
mas01cr@280 95 double sumlogdist = 0;
mas01cr@280 96
mas01cr@280 97 for (unsigned int i = 0; i < nsamples;) {
mas01cr@284 98 unsigned track1 = random_track(propTable, total);
mas01cr@284 99 unsigned track2 = random_track(propTable, total);
mas01cr@280 100
mas01cr@280 101 if(track1 == track2)
mas01cr@280 102 continue;
mas01cr@280 103
mas01cr@280 104 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
mas01cr@280 105 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
mas01cr@280 106
mas01cr@280 107 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@280 108
mas01cr@280 109 /* FIXME: this seeking, reading and distance calculation should
mas01cr@280 110 share more code with the query loop */
mas01cr@370 111 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@370 112 error("seek failure", "", "lseek");
mas01cr@370 113 }
mas01cr@370 114 CHECKED_READ(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 115
mas01cr@370 116 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@370 117 error("seek failure", "", "lseek");
mas01cr@370 118 }
mas01cr@370 119 CHECKED_READ(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 120
mas01cr@280 121 v1norm = 0;
mas01cr@280 122 v2norm = 0;
mas01cr@280 123 v1v2 = 0;
mas01cr@280 124
mas01cr@280 125 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@280 126 v1norm += v1[j]*v1[j];
mas01cr@280 127 v2norm += v2[j]*v2[j];
mas01cr@280 128 v1v2 += v1[j]*v2[j];
mas01cr@280 129 }
mas01cr@280 130
mas01cr@280 131 /* FIXME: we must deal with infinities better than this; there
mas01cr@280 132 could be all sorts of NaNs from arbitrary features. Best
mas01cr@280 133 include power thresholds or something... */
mas01cr@280 134 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@280 135
mas01cr@280 136 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@280 137 /* assume normalizedDistance == true for now */
mas01cr@280 138 /* FIXME: not convinced that the statistics we calculated in
mas01cr@280 139 TASLP paper are technically valid for normalizedDistance */
mas01cr@280 140
mas01cr@280 141 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@280 142 // double dist = v1norm + v2norm - 2*v1v2;
mas01cr@280 143
mas01cr@280 144 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@280 145 sumdist += dist;
mas01cr@280 146 sumlogdist += log(dist);
mas01cr@280 147 i++;
mas01cr@280 148 } else {
mas01cr@280 149 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@280 150 }
mas01cr@280 151 }
mas01cr@280 152
mas01cr@280 153 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@280 154 unsigned meanN = total / count;
mas01cr@280 155
mas01cr@280 156 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@280 157 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@280 158
mas01cr@280 159 std::cout << "Summary statistics" << std::endl;
mas01cr@280 160 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@280 161 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@280 162 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@280 163
mas01cr@280 164 /* FIXME: we'll also want some more summary statistics based on
mas01cr@280 165 propTable, for the minimum-of-X estimate */
mas01cr@280 166 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@280 167 std::cout << std::endl;
mas01cr@280 168 std::cout << "Estimated parameters" << std::endl;
mas01cr@280 169 std::cout << "sigma^2: " << sigma2 << "; ";
mas01cr@280 170 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
mas01cr@280 171 std::cout << "d: " << d << std::endl;
mas01cr@280 172
mas01cr@280 173 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@280 174 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@280 175 - (2 / d) * gsl_sf_log(meanN)
mas01cr@280 176 - gsl_sf_log(d/2)
mas01cr@280 177 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@280 178 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@280 179
mas01cr@280 180 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@280 181
mas01cr@280 182 delete[] propTable;
mas01cr@280 183 delete[] v1;
mas01cr@280 184 delete[] v2;
mas01cr@280 185 }