mas01cr@280
|
1 #include "audioDB.h"
|
mas01cr@280
|
2
|
mas01cr@280
|
3 #include <gsl/gsl_sf.h>
|
mas01cr@280
|
4 #include <gsl/gsl_rng.h>
|
mas01cr@280
|
5
|
mas01cr@280
|
6 static
|
mas01cr@280
|
7 double yfun(double d) {
|
mas01cr@280
|
8 return gsl_sf_log(d) - gsl_sf_psi(d);
|
mas01cr@280
|
9 }
|
mas01cr@280
|
10
|
mas01cr@280
|
11 static
|
mas01cr@280
|
12 double yinv(double y) {
|
mas01cr@280
|
13 double a = 1.0e-5;
|
mas01cr@280
|
14 double b = 1000.0;
|
mas01cr@280
|
15
|
mas01cr@280
|
16 double ay = yfun(a);
|
mas01cr@280
|
17 double by = yfun(b);
|
mas01cr@280
|
18
|
mas01cr@283
|
19 double c = 0;
|
mas01cr@283
|
20 double cy;
|
mas01cr@280
|
21
|
mas01cr@280
|
22 /* FIXME: simple binary search; there's probably some clever solver
|
mas01cr@280
|
23 in gsl somewhere which is less sucky. */
|
mas01cr@280
|
24 while ((b - a) > 1.0e-5) {
|
mas01cr@280
|
25 c = (a + b) / 2;
|
mas01cr@280
|
26 cy = yfun(c);
|
mas01cr@280
|
27 if (cy > y) {
|
mas01cr@280
|
28 a = c;
|
mas01cr@280
|
29 ay = cy;
|
mas01cr@280
|
30 } else {
|
mas01cr@280
|
31 b = c;
|
mas01cr@280
|
32 by = cy;
|
mas01cr@280
|
33 }
|
mas01cr@280
|
34 }
|
mas01cr@280
|
35
|
mas01cr@280
|
36 return c;
|
mas01cr@280
|
37 }
|
mas01cr@280
|
38
|
mas01cr@284
|
39 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
|
mas01cr@280
|
40 /* FIXME: make this O(1) by using the alias-rejection method, or
|
mas01cr@280
|
41 some other sensible method of sampling from a discrete
|
mas01cr@280
|
42 distribution. */
|
mas01cr@280
|
43 double thing = gsl_rng_uniform(rng);
|
mas01cr@280
|
44 unsigned sofar = 0;
|
mas01cr@280
|
45 for (unsigned int i = 0; i < dbH->numFiles; i++) {
|
mas01cr@280
|
46 sofar += propTable[i];
|
mas01cr@280
|
47 if (thing < ((double) sofar / (double) total)) {
|
mas01cr@280
|
48 return i;
|
mas01cr@280
|
49 }
|
mas01cr@280
|
50 }
|
mas01cr@280
|
51 error("fell through in random_track()");
|
mas01cr@280
|
52
|
mas01cr@280
|
53 /* FIXME: decorate error's declaration so that this isn't necessary */
|
mas01cr@280
|
54 return 0;
|
mas01cr@280
|
55 }
|
mas01cr@280
|
56
|
mas01cr@280
|
57 void audioDB::sample(const char *dbName) {
|
mas01cr@280
|
58 initTables(dbName, 0);
|
mas01mc@319
|
59 if(dbH->flags & O2_FLAG_LARGE_ADB){
|
mas01mc@319
|
60 error("error: sample not yet supported for LARGE_ADB");
|
mas01mc@319
|
61 }
|
mas01mc@319
|
62
|
mas01cr@280
|
63 // build track offset table (FIXME: cut'n'pasted from query.cpp)
|
mas01cr@280
|
64 off_t *trackOffsetTable = new off_t[dbH->numFiles];
|
mas01cr@280
|
65 unsigned cumTrack=0;
|
mas01cr@280
|
66 for(unsigned int k = 0; k < dbH->numFiles; k++){
|
mas01cr@280
|
67 trackOffsetTable[k] = cumTrack;
|
mas01cr@280
|
68 cumTrack += trackTable[k] * dbH->dim;
|
mas01cr@280
|
69 }
|
mas01cr@280
|
70
|
mas01cr@280
|
71 unsigned *propTable = new unsigned[dbH->numFiles];
|
mas01cr@280
|
72 unsigned total = 0;
|
mas01cr@280
|
73 unsigned count = 0;
|
mas01cr@280
|
74
|
mas01cr@280
|
75 for (unsigned int i = 0; i < dbH->numFiles; i++) {
|
mas01cr@280
|
76 /* what kind of a stupid language doesn't have binary max(), let
|
mas01cr@280
|
77 alone nary? */
|
mas01cr@280
|
78 unsigned int prop = trackTable[i] - sequenceLength + 1;
|
mas01cr@280
|
79 prop = prop > 0 ? prop : 0;
|
mas01cr@280
|
80 if (prop > 0)
|
mas01cr@280
|
81 count++;
|
mas01cr@280
|
82 propTable[i] = prop;
|
mas01cr@280
|
83 total += prop;
|
mas01cr@280
|
84 }
|
mas01cr@280
|
85
|
mas01cr@280
|
86 if (total == 0) {
|
mas01cr@280
|
87 error("no sequences of this sequence length in the database", dbName);
|
mas01cr@280
|
88 }
|
mas01cr@280
|
89
|
mas01cr@280
|
90 unsigned int vlen = dbH->dim * sequenceLength;
|
mas01cr@280
|
91 double *v1 = new double[vlen];
|
mas01cr@280
|
92 double *v2 = new double[vlen];
|
mas01cr@280
|
93 double v1norm, v2norm, v1v2;
|
mas01cr@280
|
94
|
mas01cr@280
|
95 double sumdist = 0;
|
mas01cr@280
|
96 double sumlogdist = 0;
|
mas01cr@280
|
97
|
mas01cr@280
|
98 for (unsigned int i = 0; i < nsamples;) {
|
mas01cr@284
|
99 unsigned track1 = random_track(propTable, total);
|
mas01cr@284
|
100 unsigned track2 = random_track(propTable, total);
|
mas01cr@280
|
101
|
mas01cr@280
|
102 if(track1 == track2)
|
mas01cr@280
|
103 continue;
|
mas01cr@280
|
104
|
mas01cr@280
|
105 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]);
|
mas01cr@280
|
106 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]);
|
mas01cr@280
|
107
|
mas01cr@280
|
108 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
|
mas01cr@280
|
109
|
mas01cr@280
|
110 /* FIXME: this seeking, reading and distance calculation should
|
mas01cr@280
|
111 share more code with the query loop */
|
mas01cr@280
|
112 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
|
mas01cr@280
|
113 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
|
mas01cr@280
|
114
|
mas01cr@280
|
115 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
|
mas01cr@280
|
116 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
|
mas01cr@280
|
117
|
mas01cr@280
|
118 v1norm = 0;
|
mas01cr@280
|
119 v2norm = 0;
|
mas01cr@280
|
120 v1v2 = 0;
|
mas01cr@280
|
121
|
mas01cr@280
|
122 for (unsigned int j = 0; j < vlen; j++) {
|
mas01cr@280
|
123 v1norm += v1[j]*v1[j];
|
mas01cr@280
|
124 v2norm += v2[j]*v2[j];
|
mas01cr@280
|
125 v1v2 += v1[j]*v2[j];
|
mas01cr@280
|
126 }
|
mas01cr@280
|
127
|
mas01cr@280
|
128 /* FIXME: we must deal with infinities better than this; there
|
mas01cr@280
|
129 could be all sorts of NaNs from arbitrary features. Best
|
mas01cr@280
|
130 include power thresholds or something... */
|
mas01cr@280
|
131 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
|
mas01cr@280
|
132
|
mas01cr@280
|
133 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
|
mas01cr@280
|
134 /* assume normalizedDistance == true for now */
|
mas01cr@280
|
135 /* FIXME: not convinced that the statistics we calculated in
|
mas01cr@280
|
136 TASLP paper are technically valid for normalizedDistance */
|
mas01cr@280
|
137
|
mas01cr@280
|
138 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm);
|
mas01cr@280
|
139 // double dist = v1norm + v2norm - 2*v1v2;
|
mas01cr@280
|
140
|
mas01cr@280
|
141 VERB_LOG(1, "%f %f\n", dist, log(dist));
|
mas01cr@280
|
142 sumdist += dist;
|
mas01cr@280
|
143 sumlogdist += log(dist);
|
mas01cr@280
|
144 i++;
|
mas01cr@280
|
145 } else {
|
mas01cr@280
|
146 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2);
|
mas01cr@280
|
147 }
|
mas01cr@280
|
148 }
|
mas01cr@280
|
149
|
mas01cr@280
|
150 /* FIXME: the mean isn't really what we should be reporting here */
|
mas01cr@280
|
151 unsigned meanN = total / count;
|
mas01cr@280
|
152
|
mas01cr@280
|
153 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
|
mas01cr@280
|
154 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
|
mas01cr@280
|
155
|
mas01cr@280
|
156 std::cout << "Summary statistics" << std::endl;
|
mas01cr@280
|
157 std::cout << "number of samples: " << nsamples << std::endl;
|
mas01cr@280
|
158 std::cout << "sum of distances (S): " << sumdist << std::endl;
|
mas01cr@280
|
159 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
|
mas01cr@280
|
160
|
mas01cr@280
|
161 /* FIXME: we'll also want some more summary statistics based on
|
mas01cr@280
|
162 propTable, for the minimum-of-X estimate */
|
mas01cr@280
|
163 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
|
mas01cr@280
|
164 std::cout << std::endl;
|
mas01cr@280
|
165 std::cout << "Estimated parameters" << std::endl;
|
mas01cr@280
|
166 std::cout << "sigma^2: " << sigma2 << "; ";
|
mas01cr@280
|
167 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
|
mas01cr@280
|
168 std::cout << "d: " << d << std::endl;
|
mas01cr@280
|
169
|
mas01cr@280
|
170 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
|
mas01cr@280
|
171 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
|
mas01cr@280
|
172 - (2 / d) * gsl_sf_log(meanN)
|
mas01cr@280
|
173 - gsl_sf_log(d/2)
|
mas01cr@280
|
174 - (2 / d) * gsl_sf_log(2 / d)
|
mas01cr@280
|
175 + (2 / d) * gsl_sf_lngamma(d / 2);
|
mas01cr@280
|
176
|
mas01cr@280
|
177 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
|
mas01cr@280
|
178
|
mas01cr@280
|
179 delete[] propTable;
|
mas01cr@280
|
180 delete[] v1;
|
mas01cr@280
|
181 delete[] v2;
|
mas01cr@280
|
182 }
|