mas01cr@280: #include "audioDB.h" mas01cr@280: mas01cr@280: #include mas01cr@280: #include mas01cr@280: mas01cr@280: static mas01cr@280: double yfun(double d) { mas01cr@280: return gsl_sf_log(d) - gsl_sf_psi(d); mas01cr@280: } mas01cr@280: mas01cr@280: static mas01cr@280: double yinv(double y) { mas01cr@280: double a = 1.0e-5; mas01cr@280: double b = 1000.0; mas01cr@280: mas01cr@280: double ay = yfun(a); mas01cr@280: double by = yfun(b); mas01cr@280: mas01cr@283: double c = 0; mas01cr@283: double cy; mas01cr@280: mas01cr@280: /* FIXME: simple binary search; there's probably some clever solver mas01cr@280: in gsl somewhere which is less sucky. */ mas01cr@280: while ((b - a) > 1.0e-5) { mas01cr@280: c = (a + b) / 2; mas01cr@280: cy = yfun(c); mas01cr@280: if (cy > y) { mas01cr@280: a = c; mas01cr@280: ay = cy; mas01cr@280: } else { mas01cr@280: b = c; mas01cr@280: by = cy; mas01cr@280: } mas01cr@280: } mas01cr@280: mas01cr@280: return c; mas01cr@280: } mas01cr@280: mas01cr@284: unsigned audioDB::random_track(unsigned *propTable, unsigned total) { mas01cr@280: /* FIXME: make this O(1) by using the alias-rejection method, or mas01cr@280: some other sensible method of sampling from a discrete mas01cr@280: distribution. */ mas01cr@280: double thing = gsl_rng_uniform(rng); mas01cr@280: unsigned sofar = 0; mas01cr@280: for (unsigned int i = 0; i < dbH->numFiles; i++) { mas01cr@280: sofar += propTable[i]; mas01cr@280: if (thing < ((double) sofar / (double) total)) { mas01cr@280: return i; mas01cr@280: } mas01cr@280: } mas01cr@280: error("fell through in random_track()"); mas01cr@280: mas01cr@280: /* FIXME: decorate error's declaration so that this isn't necessary */ mas01cr@280: return 0; mas01cr@280: } mas01cr@280: mas01cr@280: void audioDB::sample(const char *dbName) { mas01cr@280: initTables(dbName, 0); mas01mc@319: if(dbH->flags & O2_FLAG_LARGE_ADB){ mas01mc@319: error("error: sample not yet supported for LARGE_ADB"); mas01mc@319: } mas01mc@319: mas01cr@280: // build track offset table (FIXME: cut'n'pasted from query.cpp) mas01cr@280: off_t *trackOffsetTable = new off_t[dbH->numFiles]; mas01cr@280: unsigned cumTrack=0; mas01cr@280: for(unsigned int k = 0; k < dbH->numFiles; k++){ mas01cr@280: trackOffsetTable[k] = cumTrack; mas01cr@280: cumTrack += trackTable[k] * dbH->dim; mas01cr@280: } mas01cr@280: mas01cr@280: unsigned *propTable = new unsigned[dbH->numFiles]; mas01cr@280: unsigned total = 0; mas01cr@280: unsigned count = 0; mas01cr@280: mas01cr@280: for (unsigned int i = 0; i < dbH->numFiles; i++) { mas01cr@280: /* what kind of a stupid language doesn't have binary max(), let mas01cr@280: alone nary? */ mas01cr@280: unsigned int prop = trackTable[i] - sequenceLength + 1; mas01cr@280: prop = prop > 0 ? prop : 0; mas01cr@280: if (prop > 0) mas01cr@280: count++; mas01cr@280: propTable[i] = prop; mas01cr@280: total += prop; mas01cr@280: } mas01cr@280: mas01cr@280: if (total == 0) { mas01cr@280: error("no sequences of this sequence length in the database", dbName); mas01cr@280: } mas01cr@280: mas01cr@280: unsigned int vlen = dbH->dim * sequenceLength; mas01cr@280: double *v1 = new double[vlen]; mas01cr@280: double *v2 = new double[vlen]; mas01cr@280: double v1norm, v2norm, v1v2; mas01cr@280: mas01cr@280: double sumdist = 0; mas01cr@280: double sumlogdist = 0; mas01cr@280: mas01cr@280: for (unsigned int i = 0; i < nsamples;) { mas01cr@284: unsigned track1 = random_track(propTable, total); mas01cr@284: unsigned track2 = random_track(propTable, total); mas01cr@280: mas01cr@280: if(track1 == track2) mas01cr@280: continue; mas01cr@280: mas01cr@280: unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]); mas01cr@280: unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]); mas01cr@280: mas01cr@280: VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2); mas01cr@280: mas01cr@280: /* FIXME: this seeking, reading and distance calculation should mas01cr@280: share more code with the query loop */ mas01cr@280: lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET); mas01cr@280: read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double)); mas01cr@280: mas01cr@280: lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET); mas01cr@280: read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double)); mas01cr@280: mas01cr@280: v1norm = 0; mas01cr@280: v2norm = 0; mas01cr@280: v1v2 = 0; mas01cr@280: mas01cr@280: for (unsigned int j = 0; j < vlen; j++) { mas01cr@280: v1norm += v1[j]*v1[j]; mas01cr@280: v2norm += v2[j]*v2[j]; mas01cr@280: v1v2 += v1[j]*v2[j]; mas01cr@280: } mas01cr@280: mas01cr@280: /* FIXME: we must deal with infinities better than this; there mas01cr@280: could be all sorts of NaNs from arbitrary features. Best mas01cr@280: include power thresholds or something... */ mas01cr@280: if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) { mas01cr@280: mas01cr@280: VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2); mas01cr@280: /* assume normalizedDistance == true for now */ mas01cr@280: /* FIXME: not convinced that the statistics we calculated in mas01cr@280: TASLP paper are technically valid for normalizedDistance */ mas01cr@280: mas01cr@280: double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm); mas01cr@280: // double dist = v1norm + v2norm - 2*v1v2; mas01cr@280: mas01cr@280: VERB_LOG(1, "%f %f\n", dist, log(dist)); mas01cr@280: sumdist += dist; mas01cr@280: sumlogdist += log(dist); mas01cr@280: i++; mas01cr@280: } else { mas01cr@280: VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2); mas01cr@280: } mas01cr@280: } mas01cr@280: mas01cr@280: /* FIXME: the mean isn't really what we should be reporting here */ mas01cr@280: unsigned meanN = total / count; mas01cr@280: mas01cr@280: double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples); mas01cr@280: double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); mas01cr@280: mas01cr@280: std::cout << "Summary statistics" << std::endl; mas01cr@280: std::cout << "number of samples: " << nsamples << std::endl; mas01cr@280: std::cout << "sum of distances (S): " << sumdist << std::endl; mas01cr@280: std::cout << "sum of log distances (L): " << sumlogdist << std::endl; mas01cr@280: mas01cr@280: /* FIXME: we'll also want some more summary statistics based on mas01cr@280: propTable, for the minimum-of-X estimate */ mas01cr@280: std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; mas01cr@280: std::cout << std::endl; mas01cr@280: std::cout << "Estimated parameters" << std::endl; mas01cr@280: std::cout << "sigma^2: " << sigma2 << "; "; mas01cr@280: std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; mas01cr@280: std::cout << "d: " << d << std::endl; mas01cr@280: mas01cr@280: double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); mas01cr@280: double logxthresh = gsl_sf_log(sumdist / nsamples) + logw mas01cr@280: - (2 / d) * gsl_sf_log(meanN) mas01cr@280: - gsl_sf_log(d/2) mas01cr@280: - (2 / d) * gsl_sf_log(2 / d) mas01cr@280: + (2 / d) * gsl_sf_lngamma(d / 2); mas01cr@280: mas01cr@280: std::cout << "track xthresh: " << exp(logxthresh) << std::endl; mas01cr@280: mas01cr@280: delete[] propTable; mas01cr@280: delete[] v1; mas01cr@280: delete[] v2; mas01cr@280: }