annotate audioDB-internals.h @ 509:cc2b97d020b1

Code rearrangements to tease apart library code from C++ audioDB code. There should be precisely no functional changes in this commit. Instead, the only thing that has happened is that all the abstraction violation and other horribleness is concentrated in one place: the include of "audioDB-internals.h" in audioDB.h -- the separation will be complete once that include can be removed. This include is necessary because the command-line binary / SOAP server still does some things directly rather than through an API: not least of which the operations that have not yet been integrated into the API yet, but also some messing around with constants, flags and nominally internal functions. The intent is to remove as many of these as possible and think quite hard about the rest. In the meantime, the library is now much more self-contained: the only things it uses are in the audioDB_API.h and audioDB-internals.h headers; thus there are fewer nasty surprises lurking for readers of the code. The Makefile has been adjusted to take advantage of this rearrangement in the dependencies.
author mas01cr
date Thu, 15 Jan 2009 13:57:33 +0000
parents 342822c2d49a
children cbd5841e6b70 57e459f62788
rev   line source
mas01cr@509 1 #include <set>
mas01cr@509 2 #include <queue>
mas01cr@509 3 #include <map>
mas01cr@509 4 #include <string>
mas01cr@509 5
mas01cr@509 6 #include "pointpair.h"
mas01cr@498 7 #include "accumulator.h"
mas01cr@509 8 #include "lshlib.h"
mas01cr@498 9
mas01cr@498 10 /* this struct is for writing polymorphic routines as puns. When
mas01cr@498 11 * inserting, we might have a "datum" (with actual numerical data) or
mas01cr@498 12 * a "reference" (with strings denoting pathnames containing numerical
mas01cr@498 13 * data), but most of the operations are the same. This struct, used
mas01cr@498 14 * only internally, allows us to write the main body of the insert
mas01cr@498 15 * code only once.
mas01cr@498 16 */
mas01cr@498 17 typedef struct adb_datum_internal {
mas01cr@498 18 uint32_t nvectors;
mas01cr@498 19 uint32_t dim;
mas01cr@498 20 const char *key;
mas01cr@498 21 void *data;
mas01cr@498 22 void *times;
mas01cr@498 23 void *power;
mas01cr@498 24 } adb_datum_internal_t;
mas01cr@498 25
mas01cr@498 26 /* this struct is to collect together a bunch of information about a
mas01cr@498 27 * query (or, in fact, a single database entry, or even a whole
mas01cr@498 28 * database). The _data pointers are immutable (hey, FIXME: should
mas01cr@498 29 * they be constified in some way?) so that free() can work on them
mas01cr@498 30 * later, while the ones without the suffix are mutable to maintain
mas01cr@498 31 * the "current" position in some way. mean_duration points to a
mas01cr@498 32 * (possibly single-element) array of mean durations for each track.
mas01cr@498 33 */
mas01cr@498 34 typedef struct adb_qpointers_internal {
mas01cr@498 35 uint32_t nvectors;
mas01cr@498 36 double *l2norm_data;
mas01cr@498 37 double *l2norm;
mas01cr@498 38 double *power_data;
mas01cr@498 39 double *power;
mas01cr@498 40 double *mean_duration;
mas01cr@498 41 } adb_qpointers_internal_t;
mas01cr@498 42
mas01cr@498 43 /* this struct is for maintaining per-query state. We don't want to
mas01cr@498 44 * store this stuff in the adb struct itself, because (a) it doesn't
mas01cr@498 45 * belong there and (b) in principle people might do two queries in
mas01cr@498 46 * parallel using the same adb handle. (b) is in practice a little
mas01cr@498 47 * bit academic because at the moment we're seeking all over the disk
mas01cr@498 48 * using adb->fd, but changing to use pread() might win us
mas01cr@498 49 * threadsafety eventually.
mas01cr@498 50 */
mas01cr@498 51 typedef struct adb_qstate_internal {
mas01cr@498 52 Accumulator *accumulator;
mas01cr@498 53 std::set<std::string> *allowed_keys;
mas01cr@498 54 std::priority_queue<PointPair> *exact_evaluation_queue;
mas01cr@498 55 LSH *lsh;
mas01cr@498 56 } adb_qstate_internal_t;
mas01cr@498 57
mas01cr@509 58 /* this struct is the in-memory representation of the binary
mas01cr@509 59 * information stored at the head of each adb file */
mas01cr@509 60 typedef struct adbheader {
mas01cr@509 61 uint32_t magic;
mas01cr@509 62 uint32_t version;
mas01cr@509 63 uint32_t numFiles;
mas01cr@509 64 uint32_t dim;
mas01cr@509 65 uint32_t flags;
mas01cr@509 66 uint32_t headerSize;
mas01cr@509 67 off_t length;
mas01cr@509 68 off_t fileTableOffset;
mas01cr@509 69 off_t trackTableOffset;
mas01cr@509 70 off_t dataOffset;
mas01cr@509 71 off_t l2normTableOffset;
mas01cr@509 72 off_t timesTableOffset;
mas01cr@509 73 off_t powerTableOffset;
mas01cr@509 74 off_t dbSize;
mas01cr@509 75 } adb_header_t;
mas01cr@509 76
mas01cr@509 77 #define ADB_HEADER_SIZE (sizeof(struct adbheader))
mas01cr@509 78
mas01cr@509 79 #define ADB_HEADER_FLAG_L2NORM (0x1U)
mas01cr@509 80 #define ADB_HEADER_FLAG_POWER (0x4U)
mas01cr@509 81 #define ADB_HEADER_FLAG_TIMES (0x20U)
mas01cr@509 82 #define ADB_HEADER_FLAG_REFERENCES (0x40U)
mas01cr@509 83
mas01cr@498 84 /* the transparent version of the opaque (forward-declared) adb_t. */
mas01cr@498 85 struct adb {
mas01cr@498 86 char *path;
mas01cr@498 87 int fd;
mas01cr@498 88 int flags;
mas01cr@498 89 adb_header_t *header;
mas01cr@498 90 std::vector<std::string> *keys;
mas01cr@498 91 std::map<std::string,uint32_t> *keymap;
mas01cr@498 92 std::vector<uint32_t> *track_lengths;
mas01cr@498 93 std::vector<off_t> *track_offsets;
mas01cr@498 94 LSH *cached_lsh;
mas01cr@498 95 };
mas01cr@498 96
mas01cr@498 97 typedef struct {
mas01cr@498 98 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 99 return strcmp(r1.key, r2.key) < 0;
mas01cr@498 100 }
mas01cr@498 101 } adb_result_key_lt;
mas01cr@498 102
mas01cr@498 103 typedef struct {
mas01cr@498 104 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 105 return r1.qpos < r2.qpos;
mas01cr@498 106 }
mas01cr@498 107 } adb_result_qpos_lt;
mas01cr@498 108
mas01cr@498 109 typedef struct {
mas01cr@498 110 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 111 return r1.dist < r2.dist;
mas01cr@498 112 }
mas01cr@498 113 } adb_result_dist_lt;
mas01cr@498 114
mas01cr@498 115 typedef struct {
mas01cr@498 116 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 117 return r1.dist > r2.dist;
mas01cr@498 118 }
mas01cr@498 119 } adb_result_dist_gt;
mas01cr@498 120
mas01cr@498 121 typedef struct {
mas01cr@498 122 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 123 return ((r1.ipos < r2.ipos) ||
mas01cr@498 124 ((r1.ipos == r2.ipos) &&
mas01cr@498 125 ((r1.qpos < r2.qpos) ||
mas01cr@498 126 ((r1.qpos == r2.qpos) && (strcmp(r1.key, r2.key) < 0)))));
mas01cr@498 127 }
mas01cr@498 128 } adb_result_triple_lt;
mas01cr@498 129
mas01cr@498 130 /* We could go gcc-specific here and use typeof() instead of passing
mas01cr@498 131 * in an explicit type. Answers on a postcard as to whether that's a
mas01cr@498 132 * good plan or not. */
mas01cr@498 133 #define mmap_or_goto_error(type, var, start, length) \
mas01cr@498 134 { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \
mas01cr@498 135 if(tmp == (void *) -1) { \
mas01cr@498 136 goto error; \
mas01cr@498 137 } \
mas01cr@498 138 var = (type) tmp; \
mas01cr@498 139 }
mas01cr@498 140
mas01cr@498 141 #define maybe_munmap(table, length) \
mas01cr@498 142 { if(table) { \
mas01cr@498 143 munmap(table, length); \
mas01cr@498 144 } \
mas01cr@498 145 }
mas01cr@498 146
mas01cr@509 147 #define maybe_delete_array(pointer) \
mas01cr@509 148 { if(pointer) { \
mas01cr@509 149 delete [] pointer; \
mas01cr@509 150 pointer = NULL; \
mas01cr@509 151 } \
mas01cr@509 152 }
mas01cr@509 153
mas01cr@498 154 #define write_or_goto_error(fd, buffer, size) \
mas01cr@498 155 { ssize_t tmp = size; \
mas01cr@498 156 if(write(fd, buffer, size) != tmp) { \
mas01cr@498 157 goto error; \
mas01cr@498 158 } \
mas01cr@498 159 }
mas01cr@498 160
mas01cr@498 161 #define read_or_goto_error(fd, buffer, size) \
mas01cr@498 162 { ssize_t tmp = size; \
mas01cr@498 163 if(read(fd, buffer, size) != tmp) { \
mas01cr@498 164 goto error; \
mas01cr@498 165 } \
mas01cr@498 166 }
mas01cr@498 167
mas01cr@498 168 static inline int audiodb_sync_header(adb_t *adb) {
mas01cr@498 169 off_t pos;
mas01cr@498 170 pos = lseek(adb->fd, (off_t) 0, SEEK_CUR);
mas01cr@498 171 if(pos == (off_t) -1) {
mas01cr@498 172 goto error;
mas01cr@498 173 }
mas01cr@498 174 if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) {
mas01cr@498 175 goto error;
mas01cr@498 176 }
mas01cr@509 177 if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) {
mas01cr@498 178 goto error;
mas01cr@498 179 }
mas01cr@498 180
mas01cr@498 181 /* can be fsync() if fdatasync() is racily exciting and new */
mas01cr@498 182 fdatasync(adb->fd);
mas01cr@498 183 if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) {
mas01cr@498 184 goto error;
mas01cr@498 185 }
mas01cr@498 186 return 0;
mas01cr@498 187
mas01cr@498 188 error:
mas01cr@498 189 return 1;
mas01cr@498 190 }
mas01cr@498 191
mas01cr@498 192 static inline double audiodb_dot_product(double *p, double *q, size_t count) {
mas01cr@498 193 double result = 0;
mas01cr@498 194 while(count--) {
mas01cr@498 195 result += *p++ * *q++;
mas01cr@498 196 }
mas01cr@498 197 return result;
mas01cr@498 198 }
mas01cr@498 199
mas01cr@498 200 static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) {
mas01cr@498 201 while(nvectors--) {
mas01cr@498 202 double *d1 = d;
mas01cr@498 203 double *d2 = d;
mas01cr@498 204 *l++ = audiodb_dot_product(d1, d2, dim);
mas01cr@498 205 d += dim;
mas01cr@498 206 }
mas01cr@498 207 }
mas01cr@498 208
mas01cr@498 209 // This is a common pattern in sequence queries: what we are doing is
mas01cr@498 210 // taking a window of length seqlen over a buffer of length length,
mas01cr@498 211 // and placing the sum of the elements in that window in the first
mas01cr@498 212 // element of the window: thus replacing all but the last seqlen
mas01cr@498 213 // elements in the buffer with the corresponding windowed sum.
mas01cr@498 214 static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) {
mas01cr@498 215 double tmp1, tmp2, *ps;
mas01cr@498 216 int j, w;
mas01cr@498 217
mas01cr@498 218 tmp1 = *buffer;
mas01cr@498 219 j = 1;
mas01cr@498 220 w = seqlen - 1;
mas01cr@498 221 while(w--) {
mas01cr@498 222 *buffer += buffer[j++];
mas01cr@498 223 }
mas01cr@498 224 ps = buffer + 1;
mas01cr@498 225 w = length - seqlen; // +1 - 1
mas01cr@498 226 while(w--) {
mas01cr@498 227 tmp2 = *ps;
mas01cr@498 228 if(isfinite(tmp1)) {
mas01cr@498 229 *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1);
mas01cr@498 230 } else {
mas01cr@498 231 for(int i = 1; i < seqlen; i++) {
mas01cr@498 232 *ps += *(ps + i);
mas01cr@498 233 }
mas01cr@498 234 }
mas01cr@498 235 tmp1 = tmp2;
mas01cr@498 236 ps++;
mas01cr@498 237 }
mas01cr@498 238 }
mas01cr@498 239
mas01cr@498 240 // In contrast to audiodb_sequence_sum() above,
mas01cr@498 241 // audiodb_sequence_sqrt() and audiodb_sequence_average() below are
mas01cr@498 242 // simple mappers across the sequence.
mas01cr@498 243 static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) {
mas01cr@498 244 int w = length - seqlen + 1;
mas01cr@498 245 while(w--) {
mas01cr@498 246 *buffer = sqrt(*buffer);
mas01cr@498 247 buffer++;
mas01cr@498 248 }
mas01cr@498 249 }
mas01cr@498 250
mas01cr@498 251 static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) {
mas01cr@498 252 int w = length - seqlen + 1;
mas01cr@498 253 while(w--) {
mas01cr@498 254 *buffer /= seqlen;
mas01cr@498 255 buffer++;
mas01cr@498 256 }
mas01cr@498 257 }
mas01cr@498 258
mas01cr@498 259 static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) {
mas01cr@498 260 std::map<std::string,uint32_t>::iterator it;
mas01cr@498 261 it = adb->keymap->find(key);
mas01cr@498 262 if(it == adb->keymap->end()) {
mas01cr@498 263 return (uint32_t) -1;
mas01cr@498 264 } else {
mas01cr@498 265 return (*it).second;
mas01cr@498 266 }
mas01cr@498 267 }
mas01cr@498 268
mas01cr@498 269 static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) {
mas01cr@498 270 return (*adb->keys)[index].c_str();
mas01cr@498 271 }
mas01cr@498 272
mas01cr@498 273 static inline uint32_t audiodb_index_to_track_id(uint32_t lshid, uint32_t n_point_bits) {
mas01cr@498 274 return (lshid >> n_point_bits);
mas01cr@498 275 }
mas01cr@498 276
mas01cr@498 277 static inline uint32_t audiodb_index_to_track_pos(uint32_t lshid, uint32_t n_point_bits) {
mas01cr@498 278 return (lshid & ((1 << n_point_bits) - 1));
mas01cr@498 279 }
mas01cr@498 280
mas01cr@498 281 static inline uint32_t audiodb_index_from_trackinfo(uint32_t track_id, uint32_t track_pos, uint32_t n_point_bits) {
mas01cr@498 282 return ((track_id << n_point_bits) | track_pos);
mas01cr@498 283 }
mas01cr@498 284
mas01cr@509 285 #define ADB_FIXME_DEFAULT_LSH_N_POINT_BITS 14
mas01cr@509 286 #ifndef ADB_FIXME_LSH_N_POINT_BITS
mas01cr@509 287 #define ADB_FIXME_LSH_N_POINT_BITS ADB_FIXME_DEFAULT_LSH_N_POINT_BITS
mas01cr@509 288 #endif
mas01cr@509 289
mas01cr@498 290 static inline uint32_t audiodb_lsh_n_point_bits(adb_t *adb) {
mas01cr@498 291 uint32_t nbits = adb->header->flags >> 28;
mas01cr@509 292 return (nbits ? nbits : ADB_FIXME_LSH_N_POINT_BITS);
mas01cr@498 293 }
mas01cr@498 294
mas01cr@498 295 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
mas01cr@498 296 int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *);
mas01cr@498 297 int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *);
mas01cr@498 298 int audiodb_free_datum(adb_datum_t *);
mas01cr@498 299 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 300 int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 301 int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *);
mas01cr@498 302 int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@498 303 char *audiodb_index_get_name(const char *, double, uint32_t);
mas01cr@498 304 bool audiodb_index_exists(const char *, double, uint32_t);
mas01cr@498 305 int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@509 306 LSH *audiodb_index_allocate(adb_t *, char *, bool);
mas01cr@509 307 vector<vector<float> > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t);
mas01cr@509 308 void audiodb_index_delete_shingles(vector<vector<float> > *);
mas01cr@509 309 void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t);
mas01cr@509 310 int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float);
mas01cr@509 311
mas01cr@509 312 #define ADB_MAXSTR (512U)
mas01cr@509 313 #define ADB_FILETABLE_ENTRY_SIZE (256U)
mas01cr@509 314 #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t))
mas01cr@509 315 #define ADB_DISTANCE_TOLERANCE (1e-6)
mas01cr@509 316
mas01cr@509 317 #define ADB_DEFAULT_DATASIZE (1355U) /* in MB */
mas01cr@509 318 #define ADB_DEFAULT_NTRACKS (20000U)
mas01cr@509 319 #define ADB_DEFAULT_DATADIM (9U)
mas01cr@509 320
mas01cr@509 321 #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1)
mas01cr@509 322 #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1)
mas01cr@509 323
mas01cr@509 324 #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24)
mas01cr@509 325 #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24)
mas01cr@509 326 #define ADB_FORMAT_VERSION (4U)
mas01cr@509 327
mas01cr@509 328 #define ADB_LSH_MAXTRACKLEN (1 << ADB_FIXME_LSH_N_POINT_BITS)
mas01cr@509 329
mas01cr@509 330 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1))
mas01cr@509 331 #define align_down(x,w) ((x) & ~((1<<w)-1))
mas01cr@509 332
mas01cr@509 333 #define align_page_up(x) (((x) + (getpagesize()-1)) & ~(getpagesize()-1))
mas01cr@509 334 #define align_page_down(x) ((x) & ~(getpagesize()-1))
mas01cr@509 335