annotate audioDB-internals.h @ 770:c54bc2ffbf92 tip

update tags
author convert-repo
date Fri, 16 Dec 2011 11:34:01 +0000
parents b9dbe4611dde
children
rev   line source
mas01cr@590 1 #if defined(WIN32)
mas01cr@590 2 #include <sys/locking.h>
mas01cr@590 3 #endif
mas01cr@590 4 #if !defined(WIN32)
mas01cr@589 5 #include <sys/mman.h>
mas01cr@590 6 #endif
mas01cr@591 7 #include <sys/stat.h>
mas01cr@563 8 #include <sys/types.h>
mas01cr@693 9 #include <sys/time.h>
mas01cr@589 10
mas01cr@656 11 #include <stdio.h>
mas01cr@589 12 #include <errno.h>
mas01cr@589 13 #include <fcntl.h>
mas01cr@590 14 #if defined(WIN32)
mas01cr@590 15 #include <io.h>
mas01cr@590 16 #endif
mas01cr@589 17 #include <limits.h>
mas01cr@589 18 #include <math.h>
mas01cr@589 19 #include <string.h>
mas01cr@563 20 #include <unistd.h>
mas01mc@762 21 #include <stdio.h>
mas01cr@590 22 #if defined(WIN32)
mas01cr@590 23 #include <windows.h>
mas01cr@590 24 #endif
mas01cr@563 25
mas01cr@693 26 #include <gsl/gsl_rng.h>
mas01cr@693 27
mas01cr@589 28 #include <algorithm>
mas01cr@589 29 #include <iostream>
mas01cr@589 30 #include <map>
mas01cr@589 31 #include <queue>
mas01cr@509 32 #include <set>
mas01cr@509 33 #include <string>
mas01cr@589 34 #include <vector>
mas01cr@509 35
mas01cr@589 36 #include "accumulator.h"
mas01cr@509 37 #include "pointpair.h"
mas01cr@509 38 #include "lshlib.h"
mas01cr@498 39
mas01cr@589 40 using namespace std;
mas01cr@589 41
mas01cr@498 42 /* this struct is for writing polymorphic routines as puns. When
mas01cr@498 43 * inserting, we might have a "datum" (with actual numerical data) or
mas01cr@498 44 * a "reference" (with strings denoting pathnames containing numerical
mas01cr@498 45 * data), but most of the operations are the same. This struct, used
mas01cr@498 46 * only internally, allows us to write the main body of the insert
mas01cr@498 47 * code only once.
mas01cr@498 48 */
mas01cr@498 49 typedef struct adb_datum_internal {
mas01cr@498 50 uint32_t nvectors;
mas01cr@498 51 uint32_t dim;
mas01cr@498 52 const char *key;
mas01cr@498 53 void *data;
mas01cr@498 54 void *times;
mas01cr@498 55 void *power;
mas01cr@498 56 } adb_datum_internal_t;
mas01cr@498 57
mas01cr@498 58 /* this struct is to collect together a bunch of information about a
mas01cr@498 59 * query (or, in fact, a single database entry, or even a whole
mas01cr@498 60 * database). The _data pointers are immutable (hey, FIXME: should
mas01cr@498 61 * they be constified in some way?) so that free() can work on them
mas01cr@498 62 * later, while the ones without the suffix are mutable to maintain
mas01cr@498 63 * the "current" position in some way. mean_duration points to a
mas01cr@498 64 * (possibly single-element) array of mean durations for each track.
mas01cr@498 65 */
mas01cr@498 66 typedef struct adb_qpointers_internal {
mas01cr@498 67 uint32_t nvectors;
mas01cr@498 68 double *l2norm_data;
mas01cr@498 69 double *l2norm;
mas01cr@498 70 double *power_data;
mas01cr@498 71 double *power;
mas01cr@498 72 double *mean_duration;
mas01cr@498 73 } adb_qpointers_internal_t;
mas01cr@498 74
mas01cr@509 75 /* this struct is the in-memory representation of the binary
mas01cr@509 76 * information stored at the head of each adb file */
mas01cr@673 77 typedef struct adb_header {
mas01cr@509 78 uint32_t magic;
mas01cr@509 79 uint32_t version;
mas01cr@509 80 uint32_t numFiles;
mas01cr@509 81 uint32_t dim;
mas01cr@509 82 uint32_t flags;
mas01cr@509 83 uint32_t headerSize;
mas01cr@509 84 off_t length;
mas01cr@509 85 off_t fileTableOffset;
mas01cr@509 86 off_t trackTableOffset;
mas01cr@509 87 off_t dataOffset;
mas01cr@509 88 off_t l2normTableOffset;
mas01cr@509 89 off_t timesTableOffset;
mas01cr@509 90 off_t powerTableOffset;
mas01cr@509 91 off_t dbSize;
mas01cr@509 92 } adb_header_t;
mas01cr@509 93
mas01cr@673 94 #define ADB_HEADER_SIZE (sizeof(struct adb_header))
mas01cr@509 95
mas01cr@509 96 #define ADB_HEADER_FLAG_L2NORM (0x1U)
mas01cr@509 97 #define ADB_HEADER_FLAG_POWER (0x4U)
mas01cr@509 98 #define ADB_HEADER_FLAG_TIMES (0x20U)
mas01cr@509 99 #define ADB_HEADER_FLAG_REFERENCES (0x40U)
mas01cr@509 100
mas01cr@595 101 /* macros to make file/directory creation non-painful */
mas01cr@595 102 #if defined(WIN32)
mas01cr@595 103 #define mkdir_or_goto_error(dirname) \
mas01cr@595 104 if(_mkdir(dirname) < 0) { \
mas01cr@595 105 goto error; \
mas01cr@595 106 }
mas01cr@595 107 #define ADB_CREAT_PERMISSIONS (_S_IREAD|_S_IWRITE)
mas01cr@595 108 #else
mas01cr@595 109 #define mkdir_or_goto_error(dirname) \
mas01cr@595 110 if(mkdir(dirname, S_IRWXU|S_IRWXG|S_IRWXO) < 0) { \
mas01cr@595 111 goto error; \
mas01cr@595 112 }
mas01cr@595 113 #define ADB_CREAT_PERMISSIONS (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
mas01cr@595 114 #endif
mas01cr@498 115 /* the transparent version of the opaque (forward-declared) adb_t. */
mas01cr@498 116 struct adb {
mas01cr@498 117 char *path;
mas01cr@498 118 int fd;
mas01cr@498 119 int flags;
mas01cr@498 120 adb_header_t *header;
mas01cr@498 121 std::vector<std::string> *keys;
mas01cr@498 122 std::map<std::string,uint32_t> *keymap;
mas01cr@498 123 std::vector<uint32_t> *track_lengths;
mas01cr@498 124 std::vector<off_t> *track_offsets;
mas01cr@498 125 LSH *cached_lsh;
mas01cr@693 126 gsl_rng *rng;
mas01cr@498 127 };
mas01cr@498 128
mas01cr@498 129 typedef struct {
mas01cr@498 130 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@672 131 return strcmp(r1.ikey, r2.ikey) < 0;
mas01cr@498 132 }
mas01cr@672 133 } adb_result_ikey_lt;
mas01cr@498 134
mas01cr@498 135 typedef struct {
mas01cr@498 136 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 137 return r1.qpos < r2.qpos;
mas01cr@498 138 }
mas01cr@498 139 } adb_result_qpos_lt;
mas01cr@498 140
mas01cr@498 141 typedef struct {
mas01cr@498 142 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 143 return r1.dist < r2.dist;
mas01cr@498 144 }
mas01cr@498 145 } adb_result_dist_lt;
mas01cr@498 146
mas01cr@498 147 typedef struct {
mas01cr@498 148 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 149 return r1.dist > r2.dist;
mas01cr@498 150 }
mas01cr@498 151 } adb_result_dist_gt;
mas01cr@498 152
mas01cr@498 153 typedef struct {
mas01cr@498 154 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 155 return ((r1.ipos < r2.ipos) ||
mas01cr@498 156 ((r1.ipos == r2.ipos) &&
mas01cr@498 157 ((r1.qpos < r2.qpos) ||
mas01cr@672 158 ((r1.qpos == r2.qpos) && (strcmp(r1.ikey, r2.ikey) < 0)))));
mas01cr@498 159 }
mas01cr@498 160 } adb_result_triple_lt;
mas01cr@498 161
mas01cr@610 162 /* this struct is for maintaining per-query state. We don't want to
mas01cr@610 163 * store this stuff in the adb struct itself, because (a) it doesn't
mas01cr@610 164 * belong there and (b) in principle people might do two queries in
mas01cr@610 165 * parallel using the same adb handle. (b) is in practice a little
mas01cr@610 166 * bit academic because at the moment we're seeking all over the disk
mas01cr@610 167 * using adb->fd, but changing to use pread() might win us
mas01cr@610 168 * threadsafety eventually.
mas01cr@610 169 */
mas01cr@610 170 typedef struct adb_qstate_internal {
mas01cr@610 171 Accumulator *accumulator;
mas01cr@610 172 std::set<std::string> *allowed_keys;
mas01cr@610 173 std::priority_queue<PointPair> *exact_evaluation_queue;
mas01cr@610 174 std::set< adb_result_t, adb_result_triple_lt > *set;
mas01cr@610 175 LSH *lsh;
mas01cr@672 176 const char *qkey;
mas01cr@610 177 } adb_qstate_internal_t;
mas01cr@610 178
mas01cr@498 179 /* We could go gcc-specific here and use typeof() instead of passing
mas01cr@498 180 * in an explicit type. Answers on a postcard as to whether that's a
mas01cr@498 181 * good plan or not. */
mas01cr@498 182 #define mmap_or_goto_error(type, var, start, length) \
mas01cr@498 183 { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \
mas01cr@498 184 if(tmp == (void *) -1) { \
mas01cr@498 185 goto error; \
mas01cr@498 186 } \
mas01cr@498 187 var = (type) tmp; \
mas01cr@498 188 }
mas01cr@498 189
mas01cr@498 190 #define maybe_munmap(table, length) \
mas01cr@498 191 { if(table) { \
mas01cr@498 192 munmap(table, length); \
mas01cr@498 193 } \
mas01cr@498 194 }
mas01cr@498 195
mas01cr@595 196 #define malloc_and_fill_or_goto_error(type, var, start, length) \
mas01cr@595 197 { void *tmp = malloc(length); \
mas01cr@595 198 if(tmp == NULL) { \
mas01cr@595 199 goto error; \
mas01cr@595 200 } \
mas01cr@595 201 var = (type) tmp; \
mas01cr@595 202 lseek_set_or_goto_error(adb->fd, (start)); \
mas01cr@595 203 read_or_goto_error(adb->fd, var, length); \
mas01cr@595 204 }
mas01cr@595 205
mas01cr@595 206 #define maybe_free(var) \
mas01cr@595 207 { if(var) { \
mas01cr@595 208 free(var); \
mas01cr@595 209 } \
mas01cr@595 210 }
mas01cr@595 211
mas01cr@509 212 #define maybe_delete_array(pointer) \
mas01cr@509 213 { if(pointer) { \
mas01cr@509 214 delete [] pointer; \
mas01cr@509 215 pointer = NULL; \
mas01cr@509 216 } \
mas01cr@509 217 }
mas01cr@509 218
mas01cr@498 219 #define write_or_goto_error(fd, buffer, size) \
mas01cr@498 220 { ssize_t tmp = size; \
mas01cr@498 221 if(write(fd, buffer, size) != tmp) { \
mas01cr@498 222 goto error; \
mas01cr@498 223 } \
mas01cr@498 224 }
mas01cr@498 225
mas01cr@498 226 #define read_or_goto_error(fd, buffer, size) \
mas01cr@498 227 { ssize_t tmp = size; \
mas01cr@498 228 if(read(fd, buffer, size) != tmp) { \
mas01cr@498 229 goto error; \
mas01cr@498 230 } \
mas01cr@498 231 }
mas01cr@498 232
mas01cr@592 233 #define lseek_set_or_goto_error(fd, offset) \
mas01cr@592 234 { if(lseek(fd, offset, SEEK_SET) == (off_t) -1) \
mas01cr@592 235 goto error; \
mas01cr@592 236 } \
mas01cr@592 237
mas01cr@498 238 static inline int audiodb_sync_header(adb_t *adb) {
mas01cr@498 239 off_t pos;
mas01cr@498 240 pos = lseek(adb->fd, (off_t) 0, SEEK_CUR);
mas01cr@498 241 if(pos == (off_t) -1) {
mas01cr@498 242 goto error;
mas01cr@498 243 }
mas01cr@498 244 if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) {
mas01cr@498 245 goto error;
mas01cr@498 246 }
mas01cr@509 247 if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) {
mas01cr@498 248 goto error;
mas01cr@498 249 }
mas01cr@498 250
mas01cr@588 251 #if defined(WIN32)
mas01cr@588 252 _commit(adb->fd);
mas01cr@588 253 #elif defined(_POSIX_SYNCHRONIZED_IO) && (_POSIX_SYNCHRONIZED_IO > 0)
mas01cr@498 254 fdatasync(adb->fd);
mas01cr@563 255 #else
mas01cr@563 256 fsync(adb->fd);
mas01cr@563 257 #endif
mas01cr@498 258 if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) {
mas01cr@498 259 goto error;
mas01cr@498 260 }
mas01cr@498 261 return 0;
mas01cr@498 262
mas01cr@498 263 error:
mas01cr@498 264 return 1;
mas01cr@498 265 }
mas01cr@498 266
mas01cr@498 267 static inline double audiodb_dot_product(double *p, double *q, size_t count) {
mas01cr@498 268 double result = 0;
mas01cr@498 269 while(count--) {
mas01cr@498 270 result += *p++ * *q++;
mas01cr@498 271 }
mas01cr@498 272 return result;
mas01cr@498 273 }
mas01cr@498 274
mas01mc@768 275 static inline double audiodb_kullback_leibler(double *p, double *q, size_t count) {
mas01mc@768 276 double a,b, tmp1, tmp2, result = 0;
mas01mc@768 277 while(count--){
mas01mc@768 278 a = *p++;
mas01mc@768 279 b = *q++;
mas01mc@768 280 tmp1 = a * log( a / b );
mas01mc@768 281 if(isnan(tmp1))
mas01mc@768 282 tmp1=0.0;
mas01mc@768 283 tmp2 = b * log( b / a );
mas01mc@768 284 if(isnan(tmp2))
mas01mc@768 285 tmp2=0.0;
mas01mc@768 286 result += ( tmp1 + tmp2 ) / 2.0;
mas01mc@768 287 }
mas01mc@768 288 return result;
mas01mc@768 289 }
mas01mc@768 290
mas01mc@768 291
mas01cr@498 292 static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) {
mas01cr@498 293 while(nvectors--) {
mas01cr@498 294 double *d1 = d;
mas01cr@498 295 double *d2 = d;
mas01cr@498 296 *l++ = audiodb_dot_product(d1, d2, dim);
mas01cr@498 297 d += dim;
mas01cr@498 298 }
mas01cr@498 299 }
mas01cr@498 300
mas01cr@498 301 // This is a common pattern in sequence queries: what we are doing is
mas01cr@498 302 // taking a window of length seqlen over a buffer of length length,
mas01cr@498 303 // and placing the sum of the elements in that window in the first
mas01cr@498 304 // element of the window: thus replacing all but the last seqlen
mas01cr@498 305 // elements in the buffer with the corresponding windowed sum.
mas01cr@498 306 static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) {
mas01cr@498 307 double tmp1, tmp2, *ps;
mas01cr@498 308 int j, w;
mas01cr@498 309
mas01cr@498 310 tmp1 = *buffer;
mas01cr@498 311 j = 1;
mas01cr@498 312 w = seqlen - 1;
mas01cr@498 313 while(w--) {
mas01cr@498 314 *buffer += buffer[j++];
mas01cr@498 315 }
mas01cr@498 316 ps = buffer + 1;
mas01cr@498 317 w = length - seqlen; // +1 - 1
mas01cr@498 318 while(w--) {
mas01cr@498 319 tmp2 = *ps;
mas01cr@498 320 if(isfinite(tmp1)) {
mas01cr@498 321 *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1);
mas01cr@498 322 } else {
mas01cr@498 323 for(int i = 1; i < seqlen; i++) {
mas01cr@498 324 *ps += *(ps + i);
mas01cr@498 325 }
mas01cr@498 326 }
mas01cr@498 327 tmp1 = tmp2;
mas01cr@498 328 ps++;
mas01cr@498 329 }
mas01cr@498 330 }
mas01cr@498 331
mas01cr@498 332 // In contrast to audiodb_sequence_sum() above,
mas01cr@498 333 // audiodb_sequence_sqrt() and audiodb_sequence_average() below are
mas01cr@498 334 // simple mappers across the sequence.
mas01cr@498 335 static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) {
mas01cr@498 336 int w = length - seqlen + 1;
mas01cr@498 337 while(w--) {
mas01cr@498 338 *buffer = sqrt(*buffer);
mas01cr@498 339 buffer++;
mas01cr@498 340 }
mas01cr@498 341 }
mas01cr@498 342
mas01cr@498 343 static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) {
mas01cr@498 344 int w = length - seqlen + 1;
mas01cr@498 345 while(w--) {
mas01cr@498 346 *buffer /= seqlen;
mas01cr@498 347 buffer++;
mas01cr@498 348 }
mas01cr@498 349 }
mas01cr@498 350
mas01cr@498 351 static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) {
mas01cr@498 352 std::map<std::string,uint32_t>::iterator it;
mas01cr@498 353 it = adb->keymap->find(key);
mas01cr@498 354 if(it == adb->keymap->end()) {
mas01cr@498 355 return (uint32_t) -1;
mas01cr@498 356 } else {
mas01cr@498 357 return (*it).second;
mas01cr@498 358 }
mas01cr@498 359 }
mas01cr@498 360
mas01cr@498 361 static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) {
mas01cr@498 362 return (*adb->keys)[index].c_str();
mas01cr@498 363 }
mas01cr@498 364
mas01mc@557 365 static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){
mas01mc@557 366 off_t offset = (off_t)lshid*adb->header->dim*sizeof(double);
mas01cr@536 367 std::vector<off_t>::iterator b = (*adb->track_offsets).begin();
mas01cr@536 368 std::vector<off_t>::iterator e = (*adb->track_offsets).end();
mas01cr@536 369 std::vector<off_t>::iterator p = std::upper_bound(b, e, offset);
mas01cr@536 370 return p - b - 1;
mas01cr@498 371 }
mas01cr@498 372
mas01mc@534 373 static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) {
mas01mc@534 374 uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
mas01mc@534 375 return lshid - trackIndexOffset;
mas01cr@498 376 }
mas01cr@498 377
mas01mc@534 378 static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) {
mas01mc@534 379 uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
mas01mc@534 380 return trackIndexOffset + track_pos;
mas01cr@498 381 }
mas01cr@498 382
mas01cr@498 383 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
mas01cr@498 384 int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *);
mas01cr@498 385 int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *);
mas01cr@580 386 int audiodb_really_free_datum(adb_datum_t *);
mas01cr@498 387 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 388 int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 389 int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *);
mas01cr@498 390 int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@498 391 char *audiodb_index_get_name(const char *, double, uint32_t);
mas01cr@498 392 bool audiodb_index_exists(const char *, double, uint32_t);
mas01cr@498 393 int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@509 394 LSH *audiodb_index_allocate(adb_t *, char *, bool);
mas01cr@509 395 vector<vector<float> > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t);
mas01cr@509 396 void audiodb_index_delete_shingles(vector<vector<float> > *);
mas01cr@509 397 void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t);
mas01cr@509 398 int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float);
mas01cr@693 399 int audiodb_sample_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@509 400
mas01cr@509 401 #define ADB_MAXSTR (512U)
mas01cr@509 402 #define ADB_FILETABLE_ENTRY_SIZE (256U)
mas01cr@509 403 #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t))
mas01cr@509 404 #define ADB_DISTANCE_TOLERANCE (1e-6)
mas01cr@509 405
mas01mc@768 406 #define ADB_DEFAULT_DATASIZE (2000U) /* in MB */
mas01mc@768 407 #define ADB_DEFAULT_NTRACKS (200000U)
mas01mc@768 408 #define ADB_DEFAULT_DATADIM (12U)
mas01cr@509 409
mas01cr@509 410 #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1)
mas01cr@509 411 #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1)
mas01cr@509 412
mas01cr@509 413 #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24)
mas01cr@509 414 #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24)
mas01cr@509 415 #define ADB_FORMAT_VERSION (4U)
mas01cr@509 416
mas01cr@509 417 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1))
mas01cr@509 418 #define align_down(x,w) ((x) & ~((1<<w)-1))
mas01cr@509 419
mas01cr@591 420 #if defined(WIN32)
mas01cr@591 421 #define getpagesize() (64*1024)
mas01cr@591 422 #endif
mas01cr@591 423
mas01cr@509 424 #define align_page_up(x) (((x) + (getpagesize()-1)) & ~(getpagesize()-1))
mas01cr@509 425 #define align_page_down(x) ((x) & ~(getpagesize()-1))
mas01cr@509 426