annotate audioDB-internals.h @ 534:57e459f62788

Removed LSH_N_POINT_BITS coding for LSH index. Now uses binay search via STL lower_bound to locate tracks and positions from global pointID searching over cumulative track lengths. MERGE of branches/multiprobeLSH -r 819:821 onto trunk. This is a non backward-compatible change; WARNING generated on attempt to use INDEXING with older audioDB databases. Only INDEXES are broken, not ADB instances.
author mas01mc
date Wed, 04 Feb 2009 10:45:57 +0000
parents cc2b97d020b1
children 77e63d5c6de0
rev   line source
mas01cr@509 1 #include <set>
mas01cr@509 2 #include <queue>
mas01cr@509 3 #include <map>
mas01cr@509 4 #include <string>
mas01cr@509 5
mas01cr@509 6 #include "pointpair.h"
mas01cr@498 7 #include "accumulator.h"
mas01cr@509 8 #include "lshlib.h"
mas01cr@498 9
mas01cr@498 10 /* this struct is for writing polymorphic routines as puns. When
mas01cr@498 11 * inserting, we might have a "datum" (with actual numerical data) or
mas01cr@498 12 * a "reference" (with strings denoting pathnames containing numerical
mas01cr@498 13 * data), but most of the operations are the same. This struct, used
mas01cr@498 14 * only internally, allows us to write the main body of the insert
mas01cr@498 15 * code only once.
mas01cr@498 16 */
mas01cr@498 17 typedef struct adb_datum_internal {
mas01cr@498 18 uint32_t nvectors;
mas01cr@498 19 uint32_t dim;
mas01cr@498 20 const char *key;
mas01cr@498 21 void *data;
mas01cr@498 22 void *times;
mas01cr@498 23 void *power;
mas01cr@498 24 } adb_datum_internal_t;
mas01cr@498 25
mas01cr@498 26 /* this struct is to collect together a bunch of information about a
mas01cr@498 27 * query (or, in fact, a single database entry, or even a whole
mas01cr@498 28 * database). The _data pointers are immutable (hey, FIXME: should
mas01cr@498 29 * they be constified in some way?) so that free() can work on them
mas01cr@498 30 * later, while the ones without the suffix are mutable to maintain
mas01cr@498 31 * the "current" position in some way. mean_duration points to a
mas01cr@498 32 * (possibly single-element) array of mean durations for each track.
mas01cr@498 33 */
mas01cr@498 34 typedef struct adb_qpointers_internal {
mas01cr@498 35 uint32_t nvectors;
mas01cr@498 36 double *l2norm_data;
mas01cr@498 37 double *l2norm;
mas01cr@498 38 double *power_data;
mas01cr@498 39 double *power;
mas01cr@498 40 double *mean_duration;
mas01cr@498 41 } adb_qpointers_internal_t;
mas01cr@498 42
mas01cr@498 43 /* this struct is for maintaining per-query state. We don't want to
mas01cr@498 44 * store this stuff in the adb struct itself, because (a) it doesn't
mas01cr@498 45 * belong there and (b) in principle people might do two queries in
mas01cr@498 46 * parallel using the same adb handle. (b) is in practice a little
mas01cr@498 47 * bit academic because at the moment we're seeking all over the disk
mas01cr@498 48 * using adb->fd, but changing to use pread() might win us
mas01cr@498 49 * threadsafety eventually.
mas01cr@498 50 */
mas01cr@498 51 typedef struct adb_qstate_internal {
mas01cr@498 52 Accumulator *accumulator;
mas01cr@498 53 std::set<std::string> *allowed_keys;
mas01cr@498 54 std::priority_queue<PointPair> *exact_evaluation_queue;
mas01cr@498 55 LSH *lsh;
mas01cr@498 56 } adb_qstate_internal_t;
mas01cr@498 57
mas01cr@509 58 /* this struct is the in-memory representation of the binary
mas01cr@509 59 * information stored at the head of each adb file */
mas01cr@509 60 typedef struct adbheader {
mas01cr@509 61 uint32_t magic;
mas01cr@509 62 uint32_t version;
mas01cr@509 63 uint32_t numFiles;
mas01cr@509 64 uint32_t dim;
mas01cr@509 65 uint32_t flags;
mas01cr@509 66 uint32_t headerSize;
mas01cr@509 67 off_t length;
mas01cr@509 68 off_t fileTableOffset;
mas01cr@509 69 off_t trackTableOffset;
mas01cr@509 70 off_t dataOffset;
mas01cr@509 71 off_t l2normTableOffset;
mas01cr@509 72 off_t timesTableOffset;
mas01cr@509 73 off_t powerTableOffset;
mas01cr@509 74 off_t dbSize;
mas01cr@509 75 } adb_header_t;
mas01cr@509 76
mas01cr@509 77 #define ADB_HEADER_SIZE (sizeof(struct adbheader))
mas01cr@509 78
mas01cr@509 79 #define ADB_HEADER_FLAG_L2NORM (0x1U)
mas01cr@509 80 #define ADB_HEADER_FLAG_POWER (0x4U)
mas01cr@509 81 #define ADB_HEADER_FLAG_TIMES (0x20U)
mas01cr@509 82 #define ADB_HEADER_FLAG_REFERENCES (0x40U)
mas01cr@509 83
mas01cr@498 84 /* the transparent version of the opaque (forward-declared) adb_t. */
mas01cr@498 85 struct adb {
mas01cr@498 86 char *path;
mas01cr@498 87 int fd;
mas01cr@498 88 int flags;
mas01cr@498 89 adb_header_t *header;
mas01cr@498 90 std::vector<std::string> *keys;
mas01cr@498 91 std::map<std::string,uint32_t> *keymap;
mas01cr@498 92 std::vector<uint32_t> *track_lengths;
mas01cr@498 93 std::vector<off_t> *track_offsets;
mas01cr@498 94 LSH *cached_lsh;
mas01cr@498 95 };
mas01cr@498 96
mas01cr@498 97 typedef struct {
mas01cr@498 98 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 99 return strcmp(r1.key, r2.key) < 0;
mas01cr@498 100 }
mas01cr@498 101 } adb_result_key_lt;
mas01cr@498 102
mas01cr@498 103 typedef struct {
mas01cr@498 104 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 105 return r1.qpos < r2.qpos;
mas01cr@498 106 }
mas01cr@498 107 } adb_result_qpos_lt;
mas01cr@498 108
mas01cr@498 109 typedef struct {
mas01cr@498 110 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 111 return r1.dist < r2.dist;
mas01cr@498 112 }
mas01cr@498 113 } adb_result_dist_lt;
mas01cr@498 114
mas01cr@498 115 typedef struct {
mas01cr@498 116 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 117 return r1.dist > r2.dist;
mas01cr@498 118 }
mas01cr@498 119 } adb_result_dist_gt;
mas01cr@498 120
mas01cr@498 121 typedef struct {
mas01cr@498 122 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@498 123 return ((r1.ipos < r2.ipos) ||
mas01cr@498 124 ((r1.ipos == r2.ipos) &&
mas01cr@498 125 ((r1.qpos < r2.qpos) ||
mas01cr@498 126 ((r1.qpos == r2.qpos) && (strcmp(r1.key, r2.key) < 0)))));
mas01cr@498 127 }
mas01cr@498 128 } adb_result_triple_lt;
mas01cr@498 129
mas01cr@498 130 /* We could go gcc-specific here and use typeof() instead of passing
mas01cr@498 131 * in an explicit type. Answers on a postcard as to whether that's a
mas01cr@498 132 * good plan or not. */
mas01cr@498 133 #define mmap_or_goto_error(type, var, start, length) \
mas01cr@498 134 { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \
mas01cr@498 135 if(tmp == (void *) -1) { \
mas01cr@498 136 goto error; \
mas01cr@498 137 } \
mas01cr@498 138 var = (type) tmp; \
mas01cr@498 139 }
mas01cr@498 140
mas01cr@498 141 #define maybe_munmap(table, length) \
mas01cr@498 142 { if(table) { \
mas01cr@498 143 munmap(table, length); \
mas01cr@498 144 } \
mas01cr@498 145 }
mas01cr@498 146
mas01cr@509 147 #define maybe_delete_array(pointer) \
mas01cr@509 148 { if(pointer) { \
mas01cr@509 149 delete [] pointer; \
mas01cr@509 150 pointer = NULL; \
mas01cr@509 151 } \
mas01cr@509 152 }
mas01cr@509 153
mas01cr@498 154 #define write_or_goto_error(fd, buffer, size) \
mas01cr@498 155 { ssize_t tmp = size; \
mas01cr@498 156 if(write(fd, buffer, size) != tmp) { \
mas01cr@498 157 goto error; \
mas01cr@498 158 } \
mas01cr@498 159 }
mas01cr@498 160
mas01cr@498 161 #define read_or_goto_error(fd, buffer, size) \
mas01cr@498 162 { ssize_t tmp = size; \
mas01cr@498 163 if(read(fd, buffer, size) != tmp) { \
mas01cr@498 164 goto error; \
mas01cr@498 165 } \
mas01cr@498 166 }
mas01cr@498 167
mas01cr@498 168 static inline int audiodb_sync_header(adb_t *adb) {
mas01cr@498 169 off_t pos;
mas01cr@498 170 pos = lseek(adb->fd, (off_t) 0, SEEK_CUR);
mas01cr@498 171 if(pos == (off_t) -1) {
mas01cr@498 172 goto error;
mas01cr@498 173 }
mas01cr@498 174 if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) {
mas01cr@498 175 goto error;
mas01cr@498 176 }
mas01cr@509 177 if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) {
mas01cr@498 178 goto error;
mas01cr@498 179 }
mas01cr@498 180
mas01cr@498 181 /* can be fsync() if fdatasync() is racily exciting and new */
mas01cr@498 182 fdatasync(adb->fd);
mas01cr@498 183 if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) {
mas01cr@498 184 goto error;
mas01cr@498 185 }
mas01cr@498 186 return 0;
mas01cr@498 187
mas01cr@498 188 error:
mas01cr@498 189 return 1;
mas01cr@498 190 }
mas01cr@498 191
mas01cr@498 192 static inline double audiodb_dot_product(double *p, double *q, size_t count) {
mas01cr@498 193 double result = 0;
mas01cr@498 194 while(count--) {
mas01cr@498 195 result += *p++ * *q++;
mas01cr@498 196 }
mas01cr@498 197 return result;
mas01cr@498 198 }
mas01cr@498 199
mas01cr@498 200 static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) {
mas01cr@498 201 while(nvectors--) {
mas01cr@498 202 double *d1 = d;
mas01cr@498 203 double *d2 = d;
mas01cr@498 204 *l++ = audiodb_dot_product(d1, d2, dim);
mas01cr@498 205 d += dim;
mas01cr@498 206 }
mas01cr@498 207 }
mas01cr@498 208
mas01cr@498 209 // This is a common pattern in sequence queries: what we are doing is
mas01cr@498 210 // taking a window of length seqlen over a buffer of length length,
mas01cr@498 211 // and placing the sum of the elements in that window in the first
mas01cr@498 212 // element of the window: thus replacing all but the last seqlen
mas01cr@498 213 // elements in the buffer with the corresponding windowed sum.
mas01cr@498 214 static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) {
mas01cr@498 215 double tmp1, tmp2, *ps;
mas01cr@498 216 int j, w;
mas01cr@498 217
mas01cr@498 218 tmp1 = *buffer;
mas01cr@498 219 j = 1;
mas01cr@498 220 w = seqlen - 1;
mas01cr@498 221 while(w--) {
mas01cr@498 222 *buffer += buffer[j++];
mas01cr@498 223 }
mas01cr@498 224 ps = buffer + 1;
mas01cr@498 225 w = length - seqlen; // +1 - 1
mas01cr@498 226 while(w--) {
mas01cr@498 227 tmp2 = *ps;
mas01cr@498 228 if(isfinite(tmp1)) {
mas01cr@498 229 *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1);
mas01cr@498 230 } else {
mas01cr@498 231 for(int i = 1; i < seqlen; i++) {
mas01cr@498 232 *ps += *(ps + i);
mas01cr@498 233 }
mas01cr@498 234 }
mas01cr@498 235 tmp1 = tmp2;
mas01cr@498 236 ps++;
mas01cr@498 237 }
mas01cr@498 238 }
mas01cr@498 239
mas01cr@498 240 // In contrast to audiodb_sequence_sum() above,
mas01cr@498 241 // audiodb_sequence_sqrt() and audiodb_sequence_average() below are
mas01cr@498 242 // simple mappers across the sequence.
mas01cr@498 243 static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) {
mas01cr@498 244 int w = length - seqlen + 1;
mas01cr@498 245 while(w--) {
mas01cr@498 246 *buffer = sqrt(*buffer);
mas01cr@498 247 buffer++;
mas01cr@498 248 }
mas01cr@498 249 }
mas01cr@498 250
mas01cr@498 251 static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) {
mas01cr@498 252 int w = length - seqlen + 1;
mas01cr@498 253 while(w--) {
mas01cr@498 254 *buffer /= seqlen;
mas01cr@498 255 buffer++;
mas01cr@498 256 }
mas01cr@498 257 }
mas01cr@498 258
mas01cr@498 259 static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) {
mas01cr@498 260 std::map<std::string,uint32_t>::iterator it;
mas01cr@498 261 it = adb->keymap->find(key);
mas01cr@498 262 if(it == adb->keymap->end()) {
mas01cr@498 263 return (uint32_t) -1;
mas01cr@498 264 } else {
mas01cr@498 265 return (*it).second;
mas01cr@498 266 }
mas01cr@498 267 }
mas01cr@498 268
mas01cr@498 269 static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) {
mas01cr@498 270 return (*adb->keys)[index].c_str();
mas01cr@498 271 }
mas01cr@498 272
mas01mc@534 273 static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){
mas01mc@534 274 std::vector<off_t>::iterator it_b = (*adb->track_offsets).begin();
mas01mc@534 275 std::vector<off_t>::iterator it_e = (*adb->track_offsets).end();
mas01mc@534 276 off_t test_id = lshid*adb->header->dim*sizeof(double);
mas01mc@534 277 std::vector<off_t>::iterator point_p = std::lower_bound(it_b, it_e, test_id);
mas01mc@534 278 if(*point_p == test_id)
mas01mc@534 279 return point_p - it_b; // lshid is first point in found track
mas01mc@534 280 else
mas01mc@534 281 return point_p - it_b - 1; // lshid is a point in the previous track
mas01cr@498 282 }
mas01cr@498 283
mas01mc@534 284 static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) {
mas01mc@534 285 uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
mas01mc@534 286 return lshid - trackIndexOffset;
mas01cr@498 287 }
mas01cr@498 288
mas01mc@534 289 static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) {
mas01mc@534 290 uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
mas01mc@534 291 return trackIndexOffset + track_pos;
mas01cr@498 292 }
mas01cr@498 293
mas01cr@498 294 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
mas01cr@498 295 int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *);
mas01cr@498 296 int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *);
mas01cr@498 297 int audiodb_free_datum(adb_datum_t *);
mas01cr@498 298 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 299 int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
mas01cr@498 300 int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *);
mas01cr@498 301 int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@498 302 char *audiodb_index_get_name(const char *, double, uint32_t);
mas01cr@498 303 bool audiodb_index_exists(const char *, double, uint32_t);
mas01cr@498 304 int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@509 305 LSH *audiodb_index_allocate(adb_t *, char *, bool);
mas01cr@509 306 vector<vector<float> > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t);
mas01cr@509 307 void audiodb_index_delete_shingles(vector<vector<float> > *);
mas01cr@509 308 void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t);
mas01cr@509 309 int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float);
mas01cr@509 310
mas01cr@509 311 #define ADB_MAXSTR (512U)
mas01cr@509 312 #define ADB_FILETABLE_ENTRY_SIZE (256U)
mas01cr@509 313 #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t))
mas01cr@509 314 #define ADB_DISTANCE_TOLERANCE (1e-6)
mas01cr@509 315
mas01cr@509 316 #define ADB_DEFAULT_DATASIZE (1355U) /* in MB */
mas01cr@509 317 #define ADB_DEFAULT_NTRACKS (20000U)
mas01cr@509 318 #define ADB_DEFAULT_DATADIM (9U)
mas01cr@509 319
mas01cr@509 320 #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1)
mas01cr@509 321 #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1)
mas01cr@509 322
mas01cr@509 323 #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24)
mas01cr@509 324 #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24)
mas01cr@509 325 #define ADB_FORMAT_VERSION (4U)
mas01cr@509 326
mas01cr@509 327 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1))
mas01cr@509 328 #define align_down(x,w) ((x) & ~((1<<w)-1))
mas01cr@509 329
mas01cr@509 330 #define align_page_up(x) (((x) + (getpagesize()-1)) & ~(getpagesize()-1))
mas01cr@509 331 #define align_page_down(x) ((x) & ~(getpagesize()-1))
mas01cr@509 332