Mercurial > hg > audiodb
comparison sample.cpp @ 280:3be15407e814
Merge sampling branch (-r361:405, though I hope that the branch is now
finished) onto trunk. API developers take note.
Things still to clear up:
* whether the threshold distance it currently reports bears any relation
to reality;
* if not, how to bring it a bit more into alignment;
* minor code cleanup issues in sample.cpp;
* incorporating --absolute-threshold handling into sampling;
* writing suitable test cases.
author | mas01cr |
---|---|
date | Wed, 02 Jul 2008 14:07:10 +0000 |
parents | |
children | 0e44de38d675 |
comparison
equal
deleted
inserted
replaced
275:d209e8470a60 | 280:3be15407e814 |
---|---|
1 #include "audioDB.h" | |
2 | |
3 #include <gsl/gsl_sf.h> | |
4 #include <gsl/gsl_rng.h> | |
5 | |
6 static | |
7 double yfun(double d) { | |
8 return gsl_sf_log(d) - gsl_sf_psi(d); | |
9 } | |
10 | |
11 static | |
12 double yinv(double y) { | |
13 double a = 1.0e-5; | |
14 double b = 1000.0; | |
15 | |
16 double ay = yfun(a); | |
17 double by = yfun(b); | |
18 | |
19 double c, cy; | |
20 | |
21 /* FIXME: simple binary search; there's probably some clever solver | |
22 in gsl somewhere which is less sucky. */ | |
23 while ((b - a) > 1.0e-5) { | |
24 c = (a + b) / 2; | |
25 cy = yfun(c); | |
26 if (cy > y) { | |
27 a = c; | |
28 ay = cy; | |
29 } else { | |
30 b = c; | |
31 by = cy; | |
32 } | |
33 } | |
34 | |
35 return c; | |
36 } | |
37 | |
38 unsigned audioDB::random_track(unsigned *propTable, unsigned total, gsl_rng *rng) { | |
39 /* FIXME: make this O(1) by using the alias-rejection method, or | |
40 some other sensible method of sampling from a discrete | |
41 distribution. */ | |
42 double thing = gsl_rng_uniform(rng); | |
43 unsigned sofar = 0; | |
44 for (unsigned int i = 0; i < dbH->numFiles; i++) { | |
45 sofar += propTable[i]; | |
46 if (thing < ((double) sofar / (double) total)) { | |
47 return i; | |
48 } | |
49 } | |
50 error("fell through in random_track()"); | |
51 | |
52 /* FIXME: decorate error's declaration so that this isn't necessary */ | |
53 return 0; | |
54 } | |
55 | |
56 void audioDB::sample(const char *dbName) { | |
57 initTables(dbName, 0); | |
58 | |
59 gsl_rng *rng = gsl_rng_alloc(gsl_rng_mt19937); | |
60 | |
61 /* FIXME: in Real Life we'll want to initialize the RNG using | |
62 /dev/random or the current time or something, like this: | |
63 | |
64 unsigned int seed; | |
65 int fd = open("/dev/urandom", O_RDONLY); | |
66 read(fd, &seed, 4); | |
67 | |
68 gsl_rng_set(rng, seed); | |
69 */ | |
70 | |
71 // build track offset table (FIXME: cut'n'pasted from query.cpp) | |
72 off_t *trackOffsetTable = new off_t[dbH->numFiles]; | |
73 unsigned cumTrack=0; | |
74 for(unsigned int k = 0; k < dbH->numFiles; k++){ | |
75 trackOffsetTable[k] = cumTrack; | |
76 cumTrack += trackTable[k] * dbH->dim; | |
77 } | |
78 | |
79 unsigned *propTable = new unsigned[dbH->numFiles]; | |
80 unsigned total = 0; | |
81 unsigned count = 0; | |
82 | |
83 for (unsigned int i = 0; i < dbH->numFiles; i++) { | |
84 /* what kind of a stupid language doesn't have binary max(), let | |
85 alone nary? */ | |
86 unsigned int prop = trackTable[i] - sequenceLength + 1; | |
87 prop = prop > 0 ? prop : 0; | |
88 if (prop > 0) | |
89 count++; | |
90 propTable[i] = prop; | |
91 total += prop; | |
92 } | |
93 | |
94 if (total == 0) { | |
95 error("no sequences of this sequence length in the database", dbName); | |
96 } | |
97 | |
98 unsigned int vlen = dbH->dim * sequenceLength; | |
99 double *v1 = new double[vlen]; | |
100 double *v2 = new double[vlen]; | |
101 double v1norm, v2norm, v1v2; | |
102 | |
103 double sumdist = 0; | |
104 double sumlogdist = 0; | |
105 | |
106 for (unsigned int i = 0; i < nsamples;) { | |
107 unsigned track1 = random_track(propTable, total, rng); | |
108 unsigned track2 = random_track(propTable, total, rng); | |
109 | |
110 if(track1 == track2) | |
111 continue; | |
112 | |
113 unsigned i1 = gsl_rng_uniform_int(rng, propTable[track1]); | |
114 unsigned i2 = gsl_rng_uniform_int(rng, propTable[track2]); | |
115 | |
116 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2); | |
117 | |
118 /* FIXME: this seeking, reading and distance calculation should | |
119 share more code with the query loop */ | |
120 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET); | |
121 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double)); | |
122 | |
123 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET); | |
124 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double)); | |
125 | |
126 v1norm = 0; | |
127 v2norm = 0; | |
128 v1v2 = 0; | |
129 | |
130 for (unsigned int j = 0; j < vlen; j++) { | |
131 v1norm += v1[j]*v1[j]; | |
132 v2norm += v2[j]*v2[j]; | |
133 v1v2 += v1[j]*v2[j]; | |
134 } | |
135 | |
136 /* FIXME: we must deal with infinities better than this; there | |
137 could be all sorts of NaNs from arbitrary features. Best | |
138 include power thresholds or something... */ | |
139 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) { | |
140 | |
141 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2); | |
142 /* assume normalizedDistance == true for now */ | |
143 /* FIXME: not convinced that the statistics we calculated in | |
144 TASLP paper are technically valid for normalizedDistance */ | |
145 | |
146 double dist = 2 - 2 * v1v2 / sqrt(v1norm * v2norm); | |
147 // double dist = v1norm + v2norm - 2*v1v2; | |
148 | |
149 VERB_LOG(1, "%f %f\n", dist, log(dist)); | |
150 sumdist += dist; | |
151 sumlogdist += log(dist); | |
152 i++; | |
153 } else { | |
154 VERB_LOG(1, "infinity/NaN found: %f %f %f\n", v1norm, v2norm, v1v2); | |
155 } | |
156 } | |
157 | |
158 /* FIXME: the mean isn't really what we should be reporting here */ | |
159 unsigned meanN = total / count; | |
160 | |
161 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples); | |
162 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); | |
163 | |
164 std::cout << "Summary statistics" << std::endl; | |
165 std::cout << "number of samples: " << nsamples << std::endl; | |
166 std::cout << "sum of distances (S): " << sumdist << std::endl; | |
167 std::cout << "sum of log distances (L): " << sumlogdist << std::endl; | |
168 | |
169 /* FIXME: we'll also want some more summary statistics based on | |
170 propTable, for the minimum-of-X estimate */ | |
171 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; | |
172 std::cout << std::endl; | |
173 std::cout << "Estimated parameters" << std::endl; | |
174 std::cout << "sigma^2: " << sigma2 << "; "; | |
175 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; | |
176 std::cout << "d: " << d << std::endl; | |
177 | |
178 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); | |
179 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw | |
180 - (2 / d) * gsl_sf_log(meanN) | |
181 - gsl_sf_log(d/2) | |
182 - (2 / d) * gsl_sf_log(2 / d) | |
183 + (2 / d) * gsl_sf_lngamma(d / 2); | |
184 | |
185 std::cout << "track xthresh: " << exp(logxthresh) << std::endl; | |
186 | |
187 delete[] propTable; | |
188 delete[] v1; | |
189 delete[] v2; | |
190 } |