Mercurial > hg > audiodb
comparison sample.cpp @ 266:4ffa05f25a00 sampling
Add initial sampling of database distances. Zillions of FIXME comments
everywhere.
author | mas01cr |
---|---|
date | Sat, 14 Jun 2008 17:13:26 +0000 |
parents | |
children | 30a2a45f2b70 |
comparison
equal
deleted
inserted
replaced
265:235762857a70 | 266:4ffa05f25a00 |
---|---|
1 #include "audioDB.h" | |
2 | |
3 unsigned audioDB::random_track(unsigned *propTable, unsigned total) { | |
4 /* FIXME: make this O(1) by using the alias-rejection method, or | |
5 some other sensible method of sampling from a discrete | |
6 distribution. */ | |
7 /* FIXME: use a real random number generator, not random() */ | |
8 double thing = random() / (double) RAND_MAX; | |
9 unsigned sofar = 0; | |
10 for (unsigned int i = 0; i < dbH->numFiles; i++) { | |
11 sofar += propTable[i]; | |
12 if (thing < ((double) sofar / (double) total)) { | |
13 return i; | |
14 } | |
15 } | |
16 error("fell through in random_track()"); | |
17 | |
18 /* FIXME: decorate error's declaration so that this isn't necessary */ | |
19 return 0; | |
20 } | |
21 | |
22 void audioDB::sample(const char *dbName) { | |
23 initTables(dbName, 0); | |
24 | |
25 // build track offset table (FIXME: cut'n'pasted from query.cpp) | |
26 off_t *trackOffsetTable = new off_t[dbH->numFiles]; | |
27 unsigned cumTrack=0; | |
28 for(unsigned int k = 0; k < dbH->numFiles; k++){ | |
29 trackOffsetTable[k] = cumTrack; | |
30 cumTrack += trackTable[k] * dbH->dim; | |
31 } | |
32 | |
33 unsigned *propTable = new unsigned[dbH->numFiles]; | |
34 unsigned total = 0; | |
35 | |
36 for (unsigned int i = 0; i < dbH->numFiles; i++) { | |
37 /* what kind of a stupid language doesn't have binary max(), let | |
38 alone nary? */ | |
39 unsigned int prop = trackTable[i] - sequenceLength + 1; | |
40 prop = prop > 0 ? prop : 0; | |
41 propTable[i] = prop; | |
42 total += prop; | |
43 } | |
44 | |
45 if (total == 0) { | |
46 error("no sequences of this sequence length in the database", dbName); | |
47 } | |
48 | |
49 unsigned int vlen = dbH->dim * sequenceLength; | |
50 double *v1 = new double[vlen]; | |
51 double *v2 = new double[vlen]; | |
52 double v1norm, v2norm, v1v2; | |
53 | |
54 double sumdist = 0; | |
55 double sumlogdist = 0; | |
56 | |
57 /* 1037 samples for now */ | |
58 for (unsigned int i = 0; i < 1037;) { | |
59 /* FIXME: in Real Life we'll want to initialize the RNG using | |
60 /dev/random or the current time or something. */ | |
61 unsigned track1 = random_track(propTable, total); | |
62 unsigned track2 = random_track(propTable, total); | |
63 | |
64 /* FIXME: this uses lower-order bits, which is OK on Linux but not | |
65 necessarily elsewhere. Again, use a real random number | |
66 generator */ | |
67 unsigned i1 = random() % propTable[track1]; | |
68 unsigned i2 = random() % propTable[track2]; | |
69 | |
70 VERB_LOG(1, "%d %d, %d %d | ", track1, i1, track2, i2); | |
71 | |
72 /* FIXME: this seeking, reading and distance calculation should | |
73 share more code with the query loop */ | |
74 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track1] * sizeof(double) + i1 * dbH->dim * sizeof(double), SEEK_SET); | |
75 read(dbfid, v1, dbH->dim * sequenceLength * sizeof(double)); | |
76 | |
77 lseek(dbfid, dbH->dataOffset + trackOffsetTable[track2] * sizeof(double) + i2 * dbH->dim * sizeof(double), SEEK_SET); | |
78 read(dbfid, v2, dbH->dim * sequenceLength * sizeof(double)); | |
79 | |
80 v1norm = 0; | |
81 v2norm = 0; | |
82 v1v2 = 0; | |
83 | |
84 for (unsigned int j = 0; j < vlen; j++) { | |
85 v1norm += v1[j]*v1[j]; | |
86 v2norm += v2[j]*v2[j]; | |
87 v1v2 += v1[j]*v2[j]; | |
88 } | |
89 | |
90 /* FIXME: we must deal with infinities better than this; there | |
91 could be all sorts of NaNs from arbitrary features. Best | |
92 include power thresholds or something... */ | |
93 if(isfinite(v1norm) && isfinite(v2norm) && isfinite(v1v2)) { | |
94 | |
95 VERB_LOG(1, "%f %f %f | ", v1norm, v2norm, v1v2); | |
96 /* assume normalizedDistance == true for now */ | |
97 /* FIXME: not convinced that the statistics we calculated in | |
98 TASLP paper are valid for normalizedDistance */ | |
99 double dist = 2 - v1v2 / sqrt(v1norm * v2norm); | |
100 VERB_LOG(1, "%f %f\n", dist, log(dist)); | |
101 sumdist += dist; | |
102 sumlogdist += log(dist); | |
103 i++; | |
104 } else { | |
105 VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2); | |
106 } | |
107 } | |
108 | |
109 std::cout << "Summary statistics" << std::endl; | |
110 std::cout << "number of samples: " << 1037 << std::endl; | |
111 std::cout << "sum of distances (S): " << sumdist << std::endl; | |
112 std::cout << "sum of log distances (L): " << sumlogdist << std::endl; | |
113 | |
114 /* FIXME: we'll also want some summary statistics based on | |
115 propTable, for the minimum-of-X estimate */ | |
116 | |
117 delete[] propTable; | |
118 delete[] v1; | |
119 delete[] v2; | |
120 } |