mas01cr@266
|
1 #include "audioDB.h"
|
mas01cr@266
|
2
|
mas01cr@266
|
3 unsigned audioDB::random_track(unsigned *propTable, unsigned total) {
|
mas01cr@266
|
4 /* FIXME: make this O(1) by using the alias-rejection method, or
|
mas01cr@266
|
5 some other sensible method of sampling from a discrete
|
mas01cr@266
|
6 distribution. */
|
mas01cr@266
|
7 /* FIXME: use a real random number generator, not random() */
|
mas01cr@266
|
8 double thing = random() / (double) RAND_MAX;
|
mas01cr@266
|
9 unsigned sofar = 0;
|
mas01cr@266
|
10 for (unsigned int i = 0; i < dbH->numFiles; i++) {
|
mas01cr@266
|
11 sofar += propTable[i];
|
mas01cr@266
|
12 if (thing < ((double) sofar / (double) total)) {
|
mas01cr@266
|
13 return i;
|
mas01cr@266
|
14 }
|
mas01cr@266
|
15 }
|
mas01cr@266
|
16 error("fell through in random_track()");
|
mas01cr@266
|
17
|
mas01cr@266
|
18 /* FIXME: decorate error's declaration so that this isn't necessary */
|
mas01cr@266
|
19 return 0;
|
mas01cr@266
|
20 }
|
mas01cr@266
|
21
|
mas01cr@266
|
22 void audioDB::sample(const char *dbName) {
|
mas01cr@266
|
23 initTables(dbName, 0);
|
mas01cr@266
|
24
|
mas01cr@266
|
25 // build track offset table (FIXME: cut'n'pasted from query.cpp)
|
mas01cr@266
|
26 off_t *trackOffsetTable = new off_t[dbH->numFiles];
|
mas01cr@266
|
27 unsigned cumTrack=0;
|
mas01cr@266
|
28 for(unsigned int k = 0; k < dbH->numFiles; k++){
|
mas01cr@266
|
29 trackOffsetTable[k] = cumTrack;
|
mas01cr@266
|
30 cumTrack += trackTable[k] * dbH->dim;
|
mas01cr@266
|
31 }
|
mas01cr@266
|
32
|
mas01cr@266
|
33 unsigned *propTable = new unsigned[dbH->numFiles];
|
mas01cr@266
|
34 unsigned total = 0;
|
mas01cr@266
|
35
|
mas01cr@266
|
36 for (unsigned int i = 0; i < dbH->numFiles; i++) {
|
mas01cr@266
|
37 /* what kind of a stupid language doesn't have binary max(), let
|
mas01cr@266
|
38 alone nary? */
|
mas01cr@266
|
39 unsigned int prop = trackTable[i] - sequenceLength + 1;
|
mas01cr@266
|
40 prop = prop > 0 ? prop : 0;
|
mas01cr@266
|
41 propTable[i] = prop;
|
mas01cr@266
|
42 total += prop;
|
mas01cr@266
|
43 }
|
mas01cr@266
|
44
|
mas01cr@266
|
45 if (total == 0) {
|
mas01cr@266
|
46 error("no sequences of this sequence length in the database", dbName);
|
mas01cr@266
|
47 }
|
mas01cr@266
|
48
|
mas01cr@266
|
49 unsigned int vlen = dbH->dim * sequenceLength;
|
mas01cr@266
|
50 double *v1 = new double[vlen];
|
mas01cr@266
|
51 double *v2 = new double[vlen];
|
mas01cr@266
|
52 double v1norm, v2norm, v1v2;
|
mas01cr@266
|
53
|
mas01cr@266
|
54 double sumdist = 0;
|
mas01cr@266
|
55 double sumlogdist = 0;
|
mas01cr@266
|
56
|
mas01cr@266
|
57 /* 1037 samples for now */
|
mas01cr@266
|
58 for (unsigned int i = 0; i < 1037;) {
|
mas01cr@266
|
59 /* FIXME: in Real Life we'll want to initialize the RNG using
|
mas01cr@266
|
60 /dev/random or the current time or something. */
|
mas01cr@266
|
61 unsigned track1 = random_track(propTable, total);
|
mas01cr@266
|
62 unsigned track2 = random_track(propTable, total);
|
mas01cr@266
|
63
|
mas01cr@266
|
64 /* FIXME: this uses lower-order bits, which is OK on Linux but not
|
mas01cr@266
|
65 necessarily elsewhere. Again, use a real random number
|
mas01cr@266
|
66 generator */
|
mas01cr@266
|
67 unsigned i1 = random() % propTable[track1];
|
mas01cr@266
|
68 unsigned i2 = random() % propTable[track2];
|
mas01cr@266
|
69
|
mas01cr@266
|
70 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2);
|
mas01cr@266
|
71
|
mas01cr@266
|
72 /* FIXME: this seeking, reading and distance calculation should
|
mas01cr@266
|
73 share more code with the query loop */
|
mas01cr@266
|
74 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET);
|
mas01cr@266
|
75 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double));
|
mas01cr@266
|
76
|
mas01cr@266
|
77 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET);
|
mas01cr@266
|
78 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double));
|
mas01cr@266
|
79
|
mas01cr@266
|
80 v1norm = 0;
|
mas01cr@266
|
81 v2norm = 0;
|
mas01cr@266
|
82 v1v2 = 0;
|
mas01cr@266
|
83
|
mas01cr@266
|
84 for (unsigned int j = 0; j < vlen; j++) {
|
mas01cr@266
|
85 v1norm += v1[j]*v1[j];
|
mas01cr@266
|
86 v2norm += v2[j]*v2[j];
|
mas01cr@266
|
87 v1v2 += v1[j]*v2[j];
|
mas01cr@266
|
88 }
|
mas01cr@266
|
89
|
mas01cr@266
|
90 /* FIXME: we must deal with infinities better than this; there
|
mas01cr@266
|
91 could be all sorts of NaNs from arbitrary features. Best
|
mas01cr@266
|
92 include power thresholds or something... */
|
mas01cr@266
|
93 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) {
|
mas01cr@266
|
94
|
mas01cr@266
|
95 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2);
|
mas01cr@266
|
96 /* assume normalizedDistance == true for now */
|
mas01cr@266
|
97 /* FIXME: not convinced that the statistics we calculated in
|
mas01cr@266
|
98 TASLP paper are valid for normalizedDistance */
|
mas01cr@266
|
99 double dist = 2 - v1v2 / sqrt(v1norm * v2norm);
|
mas01cr@266
|
100 VERB_LOG(1, "%f %f\n", dist, log(dist));
|
mas01cr@266
|
101 sumdist += dist;
|
mas01cr@266
|
102 sumlogdist += log(dist);
|
mas01cr@266
|
103 i++;
|
mas01cr@266
|
104 } else {
|
mas01cr@266
|
105 VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2);
|
mas01cr@266
|
106 }
|
mas01cr@266
|
107 }
|
mas01cr@266
|
108
|
mas01cr@266
|
109 std::cout << "Summary statistics" << std::endl;
|
mas01cr@266
|
110 std::cout << "number of samples: " << 1037 << std::endl;
|
mas01cr@266
|
111 std::cout << "sum of distances (S): " << sumdist << std::endl;
|
mas01cr@266
|
112 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
|
mas01cr@266
|
113
|
mas01cr@266
|
114 /* FIXME: we'll also want some summary statistics based on
|
mas01cr@266
|
115 propTable, for the minimum-of-X estimate */
|
mas01cr@266
|
116
|
mas01cr@266
|
117 delete[] propTable;
|
mas01cr@266
|
118 delete[] v1;
|
mas01cr@266
|
119 delete[] v2;
|
mas01cr@266
|
120 }
|