annotate audioDB-internals.h @ 601:82d23418d867

Fix some fd leaks in the command-line binary Strictly speaking, they're not really leaks, because the only codepath that suffers from these leaks exits immediately afterwards. On the other hand, this fix makes valgrind on e.g. tests/0025 happier, going from 5 errors to none.
author mas01cr
date Fri, 14 Aug 2009 16:39:32 +0000
parents 31a1556fc2d6
children e21a3db643af
rev   line source
mas01cr@590 1 #if defined(WIN32)
mas01cr@590 2 #include <sys/locking.h>
mas01cr@590 3 #endif
mas01cr@590 4 #if !defined(WIN32)
mas01cr@589 5 #include <sys/mman.h>
mas01cr@590 6 #endif
mas01cr@591 7 #include <sys/stat.h>
mas01cr@563 8 #include <sys/types.h>
mas01cr@589 9
mas01cr@589 10 #include <errno.h>
mas01cr@589 11 #include <fcntl.h>
mas01cr@590 12 #if defined(WIN32)
mas01cr@590 13 #include <io.h>
mas01cr@590 14 #endif
mas01cr@589 15 #include <limits.h>
mas01cr@589 16 #include <math.h>
mas01cr@589 17 #include <string.h>
mas01cr@563 18 #include <unistd.h>
mas01cr@590 19 #if defined(WIN32)
mas01cr@590 20 #include <windows.h>
mas01cr@590 21 #endif
mas01cr@563 22
mas01cr@589 23 #include <algorithm>
mas01cr@589 24 #include <iostream>
mas01cr@589 25 #include <map>
mas01cr@589 26 #include <queue>
mas01cr@509 27 #include <set>
mas01cr@509 28 #include <string>
mas01cr@589 29 #include <vector>
mas01cr@509 30
mas01cr@589 31 #include "accumulator.h"
mas01cr@509 32 #include "pointpair.h"
mas01cr@509 33 #include "lshlib.h"
mas01cr@498 34
mas01cr@589 35 using namespace std;
mas01cr@589 36
mas01cr@498 37 /* this struct is for writing polymorphic routines as puns. When
mas01cr@498 38 * inserting, we might have a "datum" (with actual numerical data) or
mas01cr@498 39 * a "reference" (with strings denoting pathnames containing numerical
mas01cr@498 40 * data), but most of the operations are the same. This struct, used
mas01cr@498 41 * only internally, allows us to write the main body of the insert
mas01cr@498 42 * code only once.
mas01cr@498 43 */
mas01cr@498 44 typedef struct adb_datum_internal {
mas01cr@498 45 uint32_t nvectors;
mas01cr@498 46 uint32_t dim;
mas01cr@498 47 const char *key;
mas01cr@498 48 void *data;
mas01cr@498 49 void *times;
mas01cr@498 50 void *power;
mas01cr@498 51 } adb_datum_internal_t;
mas01cr@498 52
mas01cr@498 53 /* this struct is to collect together a bunch of information about a
mas01cr@498 54 * query (or, in fact, a single database entry, or even a whole
mas01cr@498 55 * database). The _data pointers are immutable (hey, FIXME: should
mas01cr@498 56 * they be constified in some way?) so that free() can work on them
mas01cr@498 57 * later, while the ones without the suffix are mutable to maintain
mas01cr@498 58 * the "current" position in some way. mean_duration points to a
mas01cr@498 59 * (possibly single-element) array of mean durations for each track.
mas01cr@498 60 */
mas01cr@498 61 typedef struct adb_qpointers_internal {
mas01cr@498 62 uint32_t nvectors;
mas01cr@498 63 double *l2norm_data;
mas01cr@498 64 double *l2norm;
mas01cr@498 65 double *power_data;
mas01cr@498 66 double *power;
mas01cr@498 67 double *mean_duration;
mas01cr@498 68 } adb_qpointers_internal_t;
mas01cr@498 69
mas01cr@498 70 /* this struct is for maintaining per-query state. We don't want to
mas01cr@498 71 * store this stuff in the adb struct itself, because (a) it doesn't
mas01cr@498 72 * belong there and (b) in principle people might do two queries in
mas01cr@498 73 * parallel using the same adb handle. (b) is in practice a little
mas01cr@498 74 * bit academic because at the moment we're seeking all over the disk
mas01cr@498 75 * using adb->fd, but changing to use pread() might win us
mas01cr@498 76 * threadsafety eventually.
mas01cr@498 77 */
mas01cr@498 78 typedef struct adb_qstate_internal {
mas01cr@498 79 Accumulator *accumulator;
mas01cr@498 80 std::set<std::string> *allowed_keys;
mas01cr@498 81 std::priority_queue<PointPair> *exact_evaluation_queue;
mas01cr@498 82 LSH *lsh;
mas01cr@498 83 } adb_qstate_internal_t;
mas01cr@498 84
mas01cr@509 85 /* this struct is the in-memory representation of the binary
mas01cr@509 86 * information stored at the head of each adb file */
mas01cr@509 87 typedef struct adbheader {
mas01cr@509 88 uint32_t magic;
mas01cr@509 89 uint32_t version;
mas01cr@509 90 uint32_t numFiles;
mas01cr@509 91 uint32_t dim;
mas01cr@509 92 uint32_t flags;
mas01cr@509 93 uint32_t headerSize;
mas01cr@509 94 off_t length;
mas01cr@509 95 off_t fileTableOffset;
mas01cr@509 96 off_t trackTableOffset;
mas01cr@509 97 off_t dataOffset;
mas01cr@509 98 off_t l2normTableOffset;
mas01cr@509 99 off_t timesTableOffset;
mas01cr@509 100 off_t powerTableOffset;
mas01cr@509 101 off_t dbSize;
mas01cr@509 102 } adb_header_t;
mas01cr@509 103
mas01cr@509 104 #define ADB_HEADER_SIZE (sizeof(struct adbheader))
mas01cr@509 105
mas01cr@509 106 #define ADB_HEADER_FLAG_L2NORM (0x1U)
mas01cr@509 107 #define ADB_HEADER_FLAG_POWER (0x4U)
mas01cr@509 108 #define ADB_HEADER_FLAG_TIMES (0x20U)
mas01cr@509 109 #define ADB_HEADER_FLAG_REFERENCES (0x40U)
mas01cr@509 110
mas01cr@595 111 /* macros to make file/directory creation non-painful */
mas01cr@595 112 #if defined(WIN32)
mas01cr@595 113 #define mkdir_or_goto_error(dirname) \
mas01cr@595 114 if(_mkdir(dirname) < 0) { \
mas01cr@595 115 goto error; \
mas01cr@595 116 }
mas01cr@595 117 #define ADB_CREAT_PERMISSIONS (_S_IREAD|_S_IWRITE)
mas01cr@595 118 #else
mas01cr@595 119 #define mkdir_or_goto_error(dirname) \
mas01cr@595 120 if(mkdir(dirname, S_IRWXU|S_IRWXG|S_IRWXO) < 0) { \
mas01cr@595 121 goto error; \
mas01cr@595 122 }
mas01cr@595 123 #define ADB_CREAT_PERMISSIONS (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)
mas01cr@595 124 #endif
mas01cr@498 125 /* the transparent version of the opaque (forward-declared) adb_t. */
mas01cr@498 126 struct adb {
mas01cr@498 127 char *path;
mas01cr@498 128 int fd;
mas01cr@498 129 int flags;
mas01cr@498 130 adb_header_t *header;
mas01cr@498 131 std::vector<std::string> *keys;
mas01cr@498 132 std::map<std::string,uint32_t> *keymap;
mas01cr@498 133 std::vector<uint32_t> *track_lengths;
mas01cr@498 134 std::vector<off_t> *track_offsets;
mas01cr@498 135 LSH *cached_lsh;
mas01cr@498 136 };
mas01cr@498 137
mas01cr@498 138 typedef struct {
mas01cr@498 139 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 140 return strcmp(r1.key, r2.key) < 0;
mas01cr@498 141 }
mas01cr@498 142 } adb_result_key_lt;
mas01cr@498 143
mas01cr@498 144 typedef struct {
mas01cr@498 145 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 146 return r1.qpos < r2.qpos;
mas01cr@498 147 }
mas01cr@498 148 } adb_result_qpos_lt;
mas01cr@498 149
mas01cr@498 150 typedef struct {
mas01cr@498 151 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 152 return r1.dist < r2.dist;
mas01cr@498 153 }
mas01cr@498 154 } adb_result_dist_lt;
mas01cr@498 155
mas01cr@498 156 typedef struct {
mas01cr@498 157 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 158 return r1.dist > r2.dist;
mas01cr@498 159 }
mas01cr@498 160 } adb_result_dist_gt;
mas01cr@498 161
mas01cr@498 162 typedef struct {
mas01cr@498 163 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 164 return ((r1.ipos < r2.ipos) ||
mas01cr@498 165 ((r1.ipos == r2.ipos) &&
mas01cr@498 166 ((r1.qpos < r2.qpos) ||
mas01cr@498 167 ((r1.qpos == r2.qpos) && (strcmp(r1.key, r2.key) < 0)))));
mas01cr@498 168 }
mas01cr@498 169 } adb_result_triple_lt;
mas01cr@498 170
mas01cr@498 171 /* We could go gcc-specific here and use typeof() instead of passing
mas01cr@498 172 * in an explicit type. Answers on a postcard as to whether that's a
mas01cr@498 173 * good plan or not. */
mas01cr@498 174 #define mmap_or_goto_error(type, var, start, length) \
mas01cr@498 175 { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \
mas01cr@498 176 if(tmp == (void *) -1) { \
mas01cr@498 177 goto error; \
mas01cr@498 178 } \
mas01cr@498 179 var = (type) tmp; \
mas01cr@498 180 }
mas01cr@498 181
mas01cr@498 182 #define maybe_munmap(table, length) \
mas01cr@498 183 { if(table) { \
mas01cr@498 184 munmap(table, length); \
mas01cr@498 185 } \
mas01cr@498 186 }
mas01cr@498 187
mas01cr@595 188 #define malloc_and_fill_or_goto_error(type, var, start, length) \
mas01cr@595 189 { void *tmp = malloc(length); \
mas01cr@595 190 if(tmp == NULL) { \
mas01cr@595 191 goto error; \
mas01cr@595 192 } \
mas01cr@595 193 var = (type) tmp; \
mas01cr@595 194 lseek_set_or_goto_error(adb->fd, (start)); \
mas01cr@595 195 read_or_goto_error(adb->fd, var, length); \
mas01cr@595 196 }
mas01cr@595 197
mas01cr@595 198 #define maybe_free(var) \
mas01cr@595 199 { if(var) { \
mas01cr@595 200 free(var); \
mas01cr@595 201 } \
mas01cr@595 202 }
mas01cr@595 203
mas01cr@509 204 #define maybe_delete_array(pointer) \
mas01cr@509 205 { if(pointer) { \
mas01cr@509 206 delete [] pointer; \
mas01cr@509 207 pointer = NULL; \
mas01cr@509 208 } \
mas01cr@509 209 }
mas01cr@509 210
mas01cr@498 211 #define write_or_goto_error(fd, buffer, size) \
mas01cr@498 212 { ssize_t tmp = size; \
mas01cr@498 213 if(write(fd, buffer, size) != tmp) { \
mas01cr@498 214 goto error; \
mas01cr@498 215 } \
mas01cr@498 216 }
mas01cr@498 217
mas01cr@498 218 #define read_or_goto_error(fd, buffer, size) \
mas01cr@498 219 { ssize_t tmp = size; \
mas01cr@498 220 if(read(fd, buffer, size) != tmp) { \
mas01cr@498 221 goto error; \
mas01cr@498 222 } \
mas01cr@498 223 }
mas01cr@498 224
mas01cr@592 225 #define lseek_set_or_goto_error(fd, offset) \
mas01cr@592 226 { if(lseek(fd, offset, SEEK_SET) == (off_t) -1) \
mas01cr@592 227 goto error; \
mas01cr@592 228 } \
mas01cr@592 229
mas01cr@498 230 static inline int audiodb_sync_header(adb_t *adb) {
mas01cr@498 231 off_t pos;
mas01cr@498 232 pos = lseek(adb->fd, (off_t) 0, SEEK_CUR);
mas01cr@498 233 if(pos == (off_t) -1) {
mas01cr@498 234 goto error;
mas01cr@498 235 }
mas01cr@498 236 if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) {
mas01cr@498 237 goto error;
mas01cr@498 238 }
mas01cr@509 239 if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) {
mas01cr@498 240 goto error;
mas01cr@498 241 }
mas01cr@498 242
mas01cr@588 243 #if defined(WIN32)
mas01cr@588 244 _commit(adb->fd);
mas01cr@588 245 #elif defined(_POSIX_SYNCHRONIZED_IO) && (_POSIX_SYNCHRONIZED_IO > 0)
mas01cr@498 246 fdatasync(adb->fd);
mas01cr@563 247 #else
mas01cr@563 248 fsync(adb->fd);
mas01cr@563 249 #endif
mas01cr@498 250 if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) {
mas01cr@498 251 goto error;
mas01cr@498 252 }
mas01cr@498 253 return 0;
mas01cr@498 254
mas01cr@498 255 error:
mas01cr@498 256 return 1;
mas01cr@498 257 }
mas01cr@498 258
mas01cr@498 259 static inline double audiodb_dot_product(double *p, double *q, size_t count) {
mas01cr@498 260 double result = 0;
mas01cr@498 261 while(count--) {
mas01cr@498 262 result += *p++ * *q++;
mas01cr@498 263 }
mas01cr@498 264 return result;
mas01cr@498 265 }
mas01cr@498 266
mas01cr@498 267 static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) {
mas01cr@498 268 while(nvectors--) {
mas01cr@498 269 double *d1 = d;
mas01cr@498 270 double *d2 = d;
mas01cr@498 271 *l++ = audiodb_dot_product(d1, d2, dim);
mas01cr@498 272 d += dim;
mas01cr@498 273 }
mas01cr@498 274 }
mas01cr@498 275
mas01cr@498 276 // This is a common pattern in sequence queries: what we are doing is
mas01cr@498 277 // taking a window of length seqlen over a buffer of length length,
mas01cr@498 278 // and placing the sum of the elements in that window in the first
mas01cr@498 279 // element of the window: thus replacing all but the last seqlen
mas01cr@498 280 // elements in the buffer with the corresponding windowed sum.
mas01cr@498 281 static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) {
mas01cr@498 282 double tmp1, tmp2, *ps;
mas01cr@498 283 int j, w;
mas01cr@498 284
mas01cr@498 285 tmp1 = *buffer;
mas01cr@498 286 j = 1;
mas01cr@498 287 w = seqlen - 1;
mas01cr@498 288 while(w--) {
mas01cr@498 289 *buffer += buffer[j++];
mas01cr@498 290 }
mas01cr@498 291 ps = buffer + 1;
mas01cr@498 292 w = length - seqlen; // +1 - 1
mas01cr@498 293 while(w--) {
mas01cr@498 294 tmp2 = *ps;
mas01cr@498 295 if(isfinite(tmp1)) {
mas01cr@498 296 *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1);
mas01cr@498 297 } else {
mas01cr@498 298 for(int i = 1; i < seqlen; i++) {
mas01cr@498 299 *ps += *(ps + i);
mas01cr@498 300 }
mas01cr@498 301 }
mas01cr@498 302 tmp1 = tmp2;
mas01cr@498 303 ps++;
mas01cr@498 304 }
mas01cr@498 305 }
mas01cr@498 306
mas01cr@498 307 // In contrast to audiodb_sequence_sum() above,
mas01cr@498 308 // audiodb_sequence_sqrt() and audiodb_sequence_average() below are
mas01cr@498 309 // simple mappers across the sequence.
mas01cr@498 310 static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) {
mas01cr@498 311 int w = length - seqlen + 1;
mas01cr@498 312 while(w--) {
mas01cr@498 313 *buffer = sqrt(*buffer);
mas01cr@498 314 buffer++;
mas01cr@498 315 }
mas01cr@498 316 }
mas01cr@498 317
mas01cr@498 318 static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) {
mas01cr@498 319 int w = length - seqlen + 1;
mas01cr@498 320 while(w--) {
mas01cr@498 321 *buffer /= seqlen;
mas01cr@498 322 buffer++;
mas01cr@498 323 }
mas01cr@498 324 }
mas01cr@498 325
mas01cr@498 326 static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) {
mas01cr@498 327 std::map<std::string,uint32_t>::iterator it;
mas01cr@498 328 it = adb->keymap->find(key);
mas01cr@498 329 if(it == adb->keymap->end()) {
mas01cr@498 330 return (uint32_t) -1;
mas01cr@498 331 } else {
mas01cr@498 332 return (*it).second;
mas01cr@498 333 }
mas01cr@498 334 }
mas01cr@498 335
mas01cr@498 336 static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) {
mas01cr@498 337 return (*adb->keys)[index].c_str();
mas01cr@498 338 }
mas01cr@498 339
mas01mc@557 340 static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){
mas01mc@557 341 off_t offset = (off_t)lshid*adb->header->dim*sizeof(double);
mas01cr@536 342 std::vector<off_t>::iterator b = (*adb->track_offsets).begin();
mas01cr@536 343 std::vector<off_t>::iterator e = (*adb->track_offsets).end();
mas01cr@536 344 std::vector<off_t>::iterator p = std::upper_bound(b, e, offset);
mas01cr@536 345 return p - b - 1;
mas01cr@498 346 }
mas01cr@498 347
mas01mc@534 348 static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) {
mas01mc@534 349 uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
mas01mc@534 350 return lshid - trackIndexOffset;
mas01cr@498 351 }
mas01cr@498 352
mas01mc@534 353 static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) {
mas01mc@534 354 uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
mas01mc@534 355 return trackIndexOffset + track_pos;
mas01cr@498 356 }
mas01cr@498 357
mas01cr@498 358 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
mas01cr@498 359 int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *);
mas01cr@498 360 int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *);
mas01cr@580 361 int audiodb_really_free_datum(adb_datum_t *);
mas01cr@498 362 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 363 int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 364 int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *);
mas01cr@498 365 int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@498 366 char *audiodb_index_get_name(const char *, double, uint32_t);
mas01cr@498 367 bool audiodb_index_exists(const char *, double, uint32_t);
mas01cr@498 368 int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@509 369 LSH *audiodb_index_allocate(adb_t *, char *, bool);
mas01cr@509 370 vector<vector<float> > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t);
mas01cr@509 371 void audiodb_index_delete_shingles(vector<vector<float> > *);
mas01cr@509 372 void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t);
mas01cr@509 373 int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float);
mas01cr@509 374
mas01cr@509 375 #define ADB_MAXSTR (512U)
mas01cr@509 376 #define ADB_FILETABLE_ENTRY_SIZE (256U)
mas01cr@509 377 #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t))
mas01cr@509 378 #define ADB_DISTANCE_TOLERANCE (1e-6)
mas01cr@509 379
mas01cr@509 380 #define ADB_DEFAULT_DATASIZE (1355U) /* in MB */
mas01cr@509 381 #define ADB_DEFAULT_NTRACKS (20000U)
mas01cr@509 382 #define ADB_DEFAULT_DATADIM (9U)
mas01cr@509 383
mas01cr@509 384 #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1)
mas01cr@509 385 #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1)
mas01cr@509 386
mas01cr@509 387 #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24)
mas01cr@509 388 #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24)
mas01cr@509 389 #define ADB_FORMAT_VERSION (4U)
mas01cr@509 390
mas01cr@509 391 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1))
mas01cr@509 392 #define align_down(x,w) ((x) & ~((1<<w)-1))
mas01cr@509 393
mas01cr@591 394 #if defined(WIN32)
mas01cr@591 395 #define getpagesize() (64*1024)
mas01cr@591 396 #endif
mas01cr@591 397
mas01cr@509 398 #define align_page_up(x) (((x) + (getpagesize()-1)) & ~(getpagesize()-1))
mas01cr@509 399 #define align_page_down(x) ((x) & ~(getpagesize()-1))
mas01cr@509 400