annotate sample.cpp @ 276:5c34b71c5ffa sampling

Restore static decoration to yfun and yinv.
author mas01cr
date Tue, 01 Jul 2008 09:00:29 +0000
parents 1c76d5f41708
children d9dba57becd4
rev   line source
mas01cr@266 1 #include "audioDB.h"
mas01cr@266 2
mas01cr@268 3 #include <gsl/gsl_sf.h>
mas01cr@268 4
mas01cr@276 5 static
mas01cr@273 6 double yfun(double d) {
mas01cr@268 7 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@268 8 }
mas01cr@276 9
mas01cr@276 10 static
mas01cr@273 11 double yinv(double y) {
mas01cr@268 12 double a = 1.0e-5;
mas01cr@268 13 double b = 1000.0;
mas01cr@268 14
mas01cr@268 15 double ay = yfun(a);
mas01cr@268 16 double by = yfun(b);
mas01cr@268 17
mas01cr@268 18 double c, cy;
mas01cr@268 19
mas01cr@276 20 /* FIXME: simple binary search; there's probably some clever solver
mas01cr@276 21 in gsl somewhere which is less sucky. */
mas01cr@268 22 while ((b - a) > 1.0e-5) {
mas01cr@268 23 c = (a + b) / 2;
mas01cr@268 24 cy = yfun(c);
mas01cr@268 25 if (cy > y) {
mas01cr@268 26 a = c;
mas01cr@268 27 ay = cy;
mas01cr@268 28 } else {
mas01cr@268 29 b = c;
mas01cr@268 30 by = cy;
mas01cr@268 31 }
mas01cr@268 32 }
mas01cr@268 33
mas01cr@268 34 return c;
mas01cr@268 35 }
mas01cr@268 36
mas01cr@266 37 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@266 38 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@266 39 some other sensible method of sampling from a discrete
mas01cr@266 40 distribution. */
mas01cr@266 41 /* FIXME: use a real random number generator, not random() */
mas01cr@266 42 double thing = random() / (double) RAND_MAX;
mas01cr@266 43 unsigned sofar = 0;
mas01cr@266 44 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@266 45 sofar += propTable[i];
mas01cr@266 46 if (thing < ((double) sofar / (double) total)) {
mas01cr@266 47 return i;
mas01cr@266 48 }
mas01cr@266 49 }
mas01cr@266 50 error("fell through in random_track()");
mas01cr@266 51
mas01cr@266 52 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@266 53 return 0;
mas01cr@266 54 }
mas01cr@266 55
mas01cr@266 56 void audioDB::sample(const char *dbName) {
mas01cr@266 57 initTables(dbName, 0);
mas01cr@266 58
mas01cr@271 59 /* FIXME: in Real Life we'll want to initialize the RNG using
mas01cr@271 60 /dev/random or the current time or something, like this:
mas01cr@271 61
mas01cr@271 62 unsigned int seed;
mas01cr@271 63 int fd = open("/dev/urandom", O_RDONLY);
mas01cr@271 64 read(fd, &seed, 4);
mas01cr@271 65
mas01cr@271 66 srandom(seed);
mas01cr@271 67 */
mas01cr@271 68
mas01cr@266 69 // build track offset table (FIXME: cut'n'pasted from query.cpp)
mas01cr@266 70 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@266 71 unsigned cumTrack=0;
mas01cr@266 72 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@266 73 trackOffsetTable[k] = cumTrack;
mas01cr@266 74 cumTrack += trackTable[k] * dbH->dim;
mas01cr@266 75 }
mas01cr@266 76
mas01cr@266 77 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@266 78 unsigned total = 0;
mas01cr@270 79 unsigned count = 0;
mas01cr@266 80
mas01cr@266 81 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@266 82 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@266 83 alone nary? */
mas01cr@266 84 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@266 85 prop = prop > 0 ? prop : 0;
mas01cr@270 86 if (prop > 0)
mas01cr@270 87 count++;
mas01cr@266 88 propTable[i] = prop;
mas01cr@266 89 total += prop;
mas01cr@266 90 }
mas01cr@266 91
mas01cr@266 92 if (total == 0) {
mas01cr@266 93 error("no sequences of this sequence length in the database", dbName);
mas01cr@266 94 }
mas01cr@266 95
mas01cr@266 96 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@266 97 double *v1 = new double[vlen];
mas01cr@266 98 double *v2 = new double[vlen];
mas01cr@266 99 double v1norm, v2norm, v1v2;
mas01cr@266 100
mas01cr@266 101 double sumdist = 0;
mas01cr@266 102 double sumlogdist = 0;
mas01cr@266 103
mas01cr@270 104 for (unsigned int i = 0; i < nsamples;) {
mas01cr@266 105 unsigned track1 = random_track(propTable, total);
mas01cr@266 106 unsigned track2 = random_track(propTable, total);
mas01cr@266 107
mas01cr@271 108 if(track1 == track2)
mas01cr@271 109 continue;
mas01cr@271 110
mas01cr@266 111 /* FIXME: this uses lower-order bits, which is OK on Linux but not
mas01cr@266 112 necessarily elsewhere. Again, use a real random number
mas01cr@266 113 generator */
mas01cr@266 114 unsigned i1 = random() % propTable[track1];
mas01cr@266 115 unsigned i2 = random() % propTable[track2];
mas01cr@266 116
mas01cr@266 117 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@266 118
mas01cr@266 119 /* FIXME: this seeking, reading and distance calculation should
mas01cr@266 120 share more code with the query loop */
mas01cr@266 121 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@266 122 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@266 123
mas01cr@266 124 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
mas01cr@266 125 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@266 126
mas01cr@266 127 v1norm = 0;
mas01cr@266 128 v2norm = 0;
mas01cr@266 129 v1v2 = 0;
mas01cr@266 130
mas01cr@266 131 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@266 132 v1norm += v1[j]*v1[j];
mas01cr@266 133 v2norm += v2[j]*v2[j];
mas01cr@266 134 v1v2 += v1[j]*v2[j];
mas01cr@266 135 }
mas01cr@266 136
mas01cr@266 137 /* FIXME: we must deal with infinities better than this; there
mas01cr@266 138 could be all sorts of NaNs from arbitrary features. Best
mas01cr@266 139 include power thresholds or something... */
mas01cr@266 140 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@266 141
mas01cr@266 142 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@266 143 /* assume normalizedDistance == true for now */
mas01cr@266 144 /* FIXME: not convinced that the statistics we calculated in
mas01cr@271 145 TASLP paper are technically valid for normalizedDistance */
mas01cr@271 146
mas01cr@269 147 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@271 148 // double dist = v1norm + v2norm - 2*v1v2;
mas01cr@271 149
mas01cr@266 150 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@266 151 sumdist += dist;
mas01cr@266 152 sumlogdist += log(dist);
mas01cr@266 153 i++;
mas01cr@266 154 } else {
mas01cr@273 155 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@266 156 }
mas01cr@266 157 }
mas01cr@266 158
mas01cr@270 159 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@270 160 unsigned meanN = total / count;
mas01cr@270 161
mas01cr@270 162 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@270 163 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@268 164
mas01cr@266 165 std::cout << "Summary statistics" << std::endl;
mas01cr@270 166 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@266 167 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@266 168 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@271 169
mas01cr@271 170 /* FIXME: we'll also want some more summary statistics based on
mas01cr@271 171 propTable, for the minimum-of-X estimate */
mas01cr@270 172 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@268 173 std::cout << std::endl;
mas01cr@268 174 std::cout << "Estimated parameters" << std::endl;
mas01cr@271 175 std::cout << "sigma^2: " << sigma2 << "; ";
mas01cr@271 176 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
mas01cr@268 177 std::cout << "d: " << d << std::endl;
mas01cr@270 178
mas01cr@270 179 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@270 180 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@270 181 - (2 / d) * gsl_sf_log(meanN)
mas01cr@270 182 - gsl_sf_log(d/2)
mas01cr@270 183 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@270 184 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@270 185
mas01cr@270 186 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@266 187
mas01cr@266 188 delete[] propTable;
mas01cr@266 189 delete[] v1;
mas01cr@266 190 delete[] v2;
mas01cr@266 191 }