annotate sample.cpp @ 369:6564be3109c5 gcc-4.3-cleanups

gcc-4.3 warning cleanups for lshlib.cpp (I do not believe that any of these changes contain significant copyrightable "intellectual property". However, to the extent that they do, the changes are hereby released into the Public Domain, and may be therefore be used by anyone for any purpose without need for consideration of any kind.)
author mas01cr
date Wed, 12 Nov 2008 15:23:32 +0000
parents 521812d63516
children 0c1c8726a79b
rev   line source
mas01cr@280 1 #include "audioDB.h"
mas01cr@280 2
mas01cr@280 3 #include <gsl/gsl_sf.h>
mas01cr@280 4 #include <gsl/gsl_rng.h>
mas01cr@280 5
mas01cr@280 6 static
mas01cr@280 7 double yfun(double d) {
mas01cr@280 8 return gsl_sf_log(d) - gsl_sf_psi(d);
mas01cr@280 9 }
mas01cr@280 10
mas01cr@280 11 static
mas01cr@280 12 double yinv(double y) {
mas01cr@280 13 double a = 1.0e-5;
mas01cr@280 14 double b = 1000.0;
mas01cr@280 15
mas01cr@280 16 double ay = yfun(a);
mas01cr@280 17 double by = yfun(b);
mas01cr@280 18
mas01cr@283 19 double c = 0;
mas01cr@283 20 double cy;
mas01cr@280 21
mas01cr@280 22 /* FIXME: simple binary search; there's probably some clever solver
mas01cr@280 23 in gsl somewhere which is less sucky. */
mas01cr@280 24 while ((b - a) > 1.0e-5) {
mas01cr@280 25 c = (a + b) / 2;
mas01cr@280 26 cy = yfun(c);
mas01cr@280 27 if (cy > y) {
mas01cr@280 28 a = c;
mas01cr@280 29 ay = cy;
mas01cr@280 30 } else {
mas01cr@280 31 b = c;
mas01cr@280 32 by = cy;
mas01cr@280 33 }
mas01cr@280 34 }
mas01cr@280 35
mas01cr@280 36 return c;
mas01cr@280 37 }
mas01cr@280 38
mas01cr@284 39 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
mas01cr@280 40 /* FIXME: make this O(1) by using the alias-rejection method, or
mas01cr@280 41 some other sensible method of sampling from a discrete
mas01cr@280 42 distribution. */
mas01cr@280 43 double thing = gsl_rng_uniform(rng);
mas01cr@280 44 unsigned sofar = 0;
mas01cr@280 45 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 46 sofar += propTable[i];
mas01cr@280 47 if (thing < ((double) sofar / (double) total)) {
mas01cr@280 48 return i;
mas01cr@280 49 }
mas01cr@280 50 }
mas01cr@280 51 error("fell through in random_track()");
mas01cr@280 52
mas01cr@280 53 /* FIXME: decorate error's declaration so that this isn't necessary */
mas01cr@280 54 return 0;
mas01cr@280 55 }
mas01cr@280 56
mas01cr@280 57 void audioDB::sample(const char *dbName) {
mas01cr@280 58 initTables(dbName, 0);
mas01mc@324 59 if(dbH->flags & O2_FLAG_LARGE_ADB){
mas01mc@324 60 error("error: sample not yet supported for LARGE_ADB");
mas01mc@324 61 }
mas01mc@324 62
mas01cr@280 63 // build track offset table (FIXME: cut'n'pasted from query.cpp)
mas01cr@280 64 off_t *trackOffsetTable = new off_t[dbH->numFiles];
mas01cr@280 65 unsigned cumTrack=0;
mas01cr@280 66 for(unsigned int k = 0; k < dbH->numFiles; k++){
mas01cr@280 67 trackOffsetTable[k] = cumTrack;
mas01cr@280 68 cumTrack += trackTable[k] * dbH->dim;
mas01cr@280 69 }
mas01cr@280 70
mas01cr@280 71 unsigned *propTable = new unsigned[dbH->numFiles];
mas01cr@280 72 unsigned total = 0;
mas01cr@280 73 unsigned count = 0;
mas01cr@280 74
mas01cr@280 75 for (unsigned int i = 0; i < dbH->numFiles; i++) {
mas01cr@280 76 /* what kind of a stupid language doesn't have binary max(), let
mas01cr@280 77 alone nary? */
mas01cr@280 78 unsigned int prop = trackTable[i] - sequenceLength + 1;
mas01cr@280 79 prop = prop > 0 ? prop : 0;
mas01cr@280 80 if (prop > 0)
mas01cr@280 81 count++;
mas01cr@280 82 propTable[i] = prop;
mas01cr@280 83 total += prop;
mas01cr@280 84 }
mas01cr@280 85
mas01cr@280 86 if (total == 0) {
mas01cr@280 87 error("no sequences of this sequence length in the database", dbName);
mas01cr@280 88 }
mas01cr@280 89
mas01cr@280 90 unsigned int vlen = dbH->dim * sequenceLength;
mas01cr@280 91 double *v1 = new double[vlen];
mas01cr@280 92 double *v2 = new double[vlen];
mas01cr@280 93 double v1norm, v2norm, v1v2;
mas01cr@280 94
mas01cr@280 95 double sumdist = 0;
mas01cr@280 96 double sumlogdist = 0;
mas01cr@280 97
mas01cr@280 98 for (unsigned int i = 0; i < nsamples;) {
mas01cr@284 99 unsigned track1 = random_track(propTable, total);
mas01cr@284 100 unsigned track2 = random_track(propTable, total);
mas01cr@280 101
mas01cr@280 102 if(track1 == track2)
mas01cr@280 103 continue;
mas01cr@280 104
mas01cr@280 105 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
mas01cr@280 106 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
mas01cr@280 107
mas01cr@280 108 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
mas01cr@280 109
mas01cr@280 110 /* FIXME: this seeking, reading and distance calculation should
mas01cr@280 111 share more code with the query loop */
mas01cr@366 112 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@366 113 error("seek failure", "", "lseek");
mas01cr@366 114 }
mas01cr@366 115 CHECKED_READ(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 116
mas01cr@366 117 if(lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET) == (off_t) -1) {
mas01cr@366 118 error("seek failure", "", "lseek");
mas01cr@366 119 }
mas01cr@366 120 CHECKED_READ(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
mas01cr@280 121
mas01cr@280 122 v1norm = 0;
mas01cr@280 123 v2norm = 0;
mas01cr@280 124 v1v2 = 0;
mas01cr@280 125
mas01cr@280 126 for (unsigned int j = 0; j < vlen; j++) {
mas01cr@280 127 v1norm += v1[j]*v1[j];
mas01cr@280 128 v2norm += v2[j]*v2[j];
mas01cr@280 129 v1v2 += v1[j]*v2[j];
mas01cr@280 130 }
mas01cr@280 131
mas01cr@280 132 /* FIXME: we must deal with infinities better than this; there
mas01cr@280 133 could be all sorts of NaNs from arbitrary features. Best
mas01cr@280 134 include power thresholds or something... */
mas01cr@280 135 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
mas01cr@280 136
mas01cr@280 137 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
mas01cr@280 138 /* assume normalizedDistance == true for now */
mas01cr@280 139 /* FIXME: not convinced that the statistics we calculated in
mas01cr@280 140 TASLP paper are technically valid for normalizedDistance */
mas01cr@280 141
mas01cr@280 142 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
mas01cr@280 143 // double dist = v1norm + v2norm - 2*v1v2;
mas01cr@280 144
mas01cr@280 145 VERB_LOG(1, "%f %f\n", dist, log(dist));
mas01cr@280 146 sumdist += dist;
mas01cr@280 147 sumlogdist += log(dist);
mas01cr@280 148 i++;
mas01cr@280 149 } else {
mas01cr@280 150 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
mas01cr@280 151 }
mas01cr@280 152 }
mas01cr@280 153
mas01cr@280 154 /* FIXME: the mean isn't really what we should be reporting here */
mas01cr@280 155 unsigned meanN = total / count;
mas01cr@280 156
mas01cr@280 157 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
mas01cr@280 158 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
mas01cr@280 159
mas01cr@280 160 std::cout << "Summary statistics" << std::endl;
mas01cr@280 161 std::cout << "number of samples: " << nsamples << std::endl;
mas01cr@280 162 std::cout << "sum of distances (S): " << sumdist << std::endl;
mas01cr@280 163 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
mas01cr@280 164
mas01cr@280 165 /* FIXME: we'll also want some more summary statistics based on
mas01cr@280 166 propTable, for the minimum-of-X estimate */
mas01cr@280 167 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
mas01cr@280 168 std::cout << std::endl;
mas01cr@280 169 std::cout << "Estimated parameters" << std::endl;
mas01cr@280 170 std::cout << "sigma^2: " << sigma2 << "; ";
mas01cr@280 171 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
mas01cr@280 172 std::cout << "d: " << d << std::endl;
mas01cr@280 173
mas01cr@280 174 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
mas01cr@280 175 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
mas01cr@280 176 - (2 / d) * gsl_sf_log(meanN)
mas01cr@280 177 - gsl_sf_log(d/2)
mas01cr@280 178 - (2 / d) * gsl_sf_log(2 / d)
mas01cr@280 179 + (2 / d) * gsl_sf_lngamma(d / 2);
mas01cr@280 180
mas01cr@280 181 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
mas01cr@280 182
mas01cr@280 183 delete[] propTable;
mas01cr@280 184 delete[] v1;
mas01cr@280 185 delete[] v2;
mas01cr@280 186 }