annotate audioDB-internals.h @ 465:1030664df98c api-inversion

No more audioDB::index_allocate and audioDB::index_init_query No more SERVER_LSH_INDEX_SINGLETON, either; instead each adb_t contains a single cache of the last used in-core index. At the moment, this cache is unused by the server (and the previous cache code has been replaced by a comment), but I think that this way everyone can be allowed to benefit without anyone having to explicitly manage indexes themselves. I'm not going to say how long I wandered in a maze of valgrind before giving up and keeping the hacky workaround for loading the lsh tables [see the FIXME comment in audiodb_index_init_query()]; let's just say that it was long enough to find the extra bonus crashy close(lshfid) in audioDB::index_index_db. Also, delete the abstraction-inverting LSH stuff from query.cpp where we are making our reporters; the fix for that, which is presumably when creating small indexes for large datasets, is to implement space-efficient reporters. (The accumulator code, which is my second attempt, is more space-efficient than the reporters; inspiration may wish to be drawn...)
author mas01cr
date Tue, 30 Dec 2008 23:56:57 +0000
parents 35bb388d0eac
children 4dbd7917bf9e
rev   line source
mas01cr@457 1 #include "accumulator.h"
mas01cr@457 2
mas01cr@457 3 /* this struct is for writing polymorphic routines as puns. When
mas01cr@457 4 * inserting, we might have a "datum" (with actual numerical data) or
mas01cr@457 5 * a "reference" (with strings denoting pathnames containing numerical
mas01cr@457 6 * data), but most of the operations are the same. This struct, used
mas01cr@457 7 * only internally, allows us to write the main body of the insert
mas01cr@457 8 * code only once.
mas01cr@457 9 */
mas01cr@408 10 typedef struct adb_datum_internal {
mas01cr@408 11 uint32_t nvectors;
mas01cr@408 12 uint32_t dim;
mas01cr@408 13 const char *key;
mas01cr@408 14 void *data;
mas01cr@408 15 void *times;
mas01cr@408 16 void *power;
mas01cr@408 17 } adb_datum_internal_t;
mas01cr@408 18
mas01cr@463 19 /* this struct is to collect together a bunch of information about a
mas01cr@463 20 * query (or, in fact, a single database entry, or even a whole
mas01cr@463 21 * database). The _data pointers are immutable (hey, FIXME: should
mas01cr@463 22 * they be constified in some way?) so that free() can work on them
mas01cr@463 23 * later, while the ones without the suffix are mutable to maintain
mas01cr@463 24 * the "current" position in some way. mean_duration points to a
mas01cr@463 25 * (possibly single-element) array of mean durations for each track.
mas01cr@463 26 */
mas01cr@463 27 typedef struct adb_qpointers_internal {
mas01cr@463 28 uint32_t nvectors;
mas01cr@463 29 double *l2norm_data;
mas01cr@463 30 double *l2norm;
mas01cr@463 31 double *power_data;
mas01cr@463 32 double *power;
mas01cr@463 33 double *mean_duration;
mas01cr@463 34 } adb_qpointers_internal_t;
mas01cr@463 35
mas01cr@457 36 /* this struct is for maintaining per-query state. We don't want to
mas01cr@457 37 * store this stuff in the adb struct itself, because (a) it doesn't
mas01cr@457 38 * belong there and (b) in principle people might do two queries in
mas01cr@457 39 * parallel using the same adb handle. (b) is in practice a little
mas01cr@457 40 * bit academic because at the moment we're seeking all over the disk
mas01cr@457 41 * using adb->fd, but changing to use pread() might win us
mas01cr@457 42 * threadsafety eventually.
mas01cr@457 43 */
mas01cr@458 44 // FIXME moved to audioDB.h for now
mas01cr@457 45
mas01cr@402 46 struct adb {
mas01cr@402 47 char *path;
mas01cr@402 48 int fd;
mas01cr@402 49 int flags;
mas01cr@402 50 adb_header_t *header;
mas01cr@453 51 std::vector<std::string> *keys;
mas01cr@453 52 std::map<std::string,uint32_t> *keymap;
mas01cr@432 53 std::vector<uint32_t> *track_lengths;
mas01cr@442 54 std::vector<off_t> *track_offsets;
mas01cr@465 55 LSH *cached_lsh;
mas01cr@402 56 };
mas01cr@402 57
mas01cr@416 58 typedef struct {
mas01cr@416 59 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@416 60 return strcmp(r1.key, r2.key) < 0;
mas01cr@416 61 }
mas01cr@416 62 } adb_result_key_lt;
mas01cr@416 63
mas01cr@416 64 typedef struct {
mas01cr@416 65 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@416 66 return r1.qpos < r2.qpos;
mas01cr@416 67 }
mas01cr@416 68 } adb_result_qpos_lt;
mas01cr@416 69
mas01cr@416 70 typedef struct {
mas01cr@416 71 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@416 72 return r1.dist < r2.dist;
mas01cr@416 73 }
mas01cr@416 74 } adb_result_dist_lt;
mas01cr@416 75
mas01cr@416 76 typedef struct {
mas01cr@416 77 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@416 78 return r1.dist > r2.dist;
mas01cr@416 79 }
mas01cr@416 80 } adb_result_dist_gt;
mas01cr@416 81
mas01cr@416 82 typedef struct {
mas01cr@416 83 bool operator() (const adb_result_t &r1, const adb_result_t &r2) {
mas01cr@416 84 return ((r1.ipos < r2.ipos) ||
mas01cr@416 85 ((r1.ipos == r2.ipos) &&
mas01cr@416 86 ((r1.qpos < r2.qpos) ||
mas01cr@416 87 ((r1.qpos == r2.qpos) && (strcmp(r1.key, r2.key) < 0)))));
mas01cr@416 88 }
mas01cr@416 89 } adb_result_triple_lt;
mas01cr@416 90
mas01cr@401 91 /* We could go gcc-specific here and use typeof() instead of passing
mas01cr@401 92 * in an explicit type. Answers on a postcard as to whether that's a
mas01cr@401 93 * good plan or not. */
mas01cr@401 94 #define mmap_or_goto_error(type, var, start, length) \
mas01cr@401 95 { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \
mas01cr@401 96 if(tmp == (void *) -1) { \
mas01cr@401 97 goto error; \
mas01cr@401 98 } \
mas01cr@401 99 var = (type) tmp; \
mas01cr@401 100 }
mas01cr@401 101
mas01cr@401 102 #define maybe_munmap(table, length) \
mas01cr@401 103 { if(table) { \
mas01cr@401 104 munmap(table, length); \
mas01cr@401 105 } \
mas01cr@401 106 }
mas01cr@401 107
mas01cr@410 108 #define write_or_goto_error(fd, buffer, size) \
mas01cr@410 109 { ssize_t tmp = size; \
mas01cr@410 110 if(write(fd, buffer, size) != tmp) { \
mas01cr@410 111 goto error; \
mas01cr@410 112 } \
mas01cr@410 113 }
mas01cr@410 114
mas01cr@410 115 #define read_or_goto_error(fd, buffer, size) \
mas01cr@410 116 { ssize_t tmp = size; \
mas01cr@410 117 if(read(fd, buffer, size) != tmp) { \
mas01cr@410 118 goto error; \
mas01cr@410 119 } \
mas01cr@410 120 }
mas01cr@410 121
mas01cr@401 122 static inline int audiodb_sync_header(adb_t *adb) {
mas01cr@401 123 off_t pos;
mas01cr@401 124 pos = lseek(adb->fd, (off_t) 0, SEEK_CUR);
mas01cr@401 125 if(pos == (off_t) -1) {
mas01cr@401 126 goto error;
mas01cr@401 127 }
mas01cr@401 128 if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) {
mas01cr@401 129 goto error;
mas01cr@401 130 }
mas01cr@401 131 if(write(adb->fd, adb->header, O2_HEADERSIZE) != O2_HEADERSIZE) {
mas01cr@401 132 goto error;
mas01cr@401 133 }
mas01cr@401 134
mas01cr@401 135 /* can be fsync() if fdatasync() is racily exciting and new */
mas01cr@401 136 fdatasync(adb->fd);
mas01cr@401 137 if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) {
mas01cr@401 138 goto error;
mas01cr@401 139 }
mas01cr@401 140 return 0;
mas01cr@401 141
mas01cr@401 142 error:
mas01cr@401 143 return 1;
mas01cr@401 144 }
mas01cr@425 145
mas01cr@425 146 static inline double audiodb_dot_product(double *p, double *q, size_t count) {
mas01cr@425 147 double result = 0;
mas01cr@425 148 while(count--) {
mas01cr@425 149 result += *p++ * *q++;
mas01cr@425 150 }
mas01cr@425 151 return result;
mas01cr@425 152 }
mas01cr@426 153
mas01cr@426 154 static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) {
mas01cr@426 155 while(nvectors--) {
mas01cr@426 156 double *d1 = d;
mas01cr@426 157 double *d2 = d;
mas01cr@426 158 *l++ = audiodb_dot_product(d1, d2, dim);
mas01cr@426 159 d += dim;
mas01cr@426 160 }
mas01cr@426 161 }
mas01cr@427 162
mas01cr@427 163 // This is a common pattern in sequence queries: what we are doing is
mas01cr@427 164 // taking a window of length seqlen over a buffer of length length,
mas01cr@427 165 // and placing the sum of the elements in that window in the first
mas01cr@427 166 // element of the window: thus replacing all but the last seqlen
mas01cr@427 167 // elements in the buffer with the corresponding windowed sum.
mas01cr@427 168 static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) {
mas01cr@427 169 double tmp1, tmp2, *ps;
mas01cr@427 170 int j, w;
mas01cr@427 171
mas01cr@427 172 tmp1 = *buffer;
mas01cr@427 173 j = 1;
mas01cr@427 174 w = seqlen - 1;
mas01cr@427 175 while(w--) {
mas01cr@427 176 *buffer += buffer[j++];
mas01cr@427 177 }
mas01cr@427 178 ps = buffer + 1;
mas01cr@427 179 w = length - seqlen; // +1 - 1
mas01cr@427 180 while(w--) {
mas01cr@427 181 tmp2 = *ps;
mas01cr@427 182 if(isfinite(tmp1)) {
mas01cr@427 183 *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1);
mas01cr@427 184 } else {
mas01cr@427 185 for(int i = 1; i < seqlen; i++) {
mas01cr@427 186 *ps += *(ps + i);
mas01cr@427 187 }
mas01cr@427 188 }
mas01cr@427 189 tmp1 = tmp2;
mas01cr@427 190 ps++;
mas01cr@427 191 }
mas01cr@427 192 }
mas01cr@427 193
mas01cr@427 194 // In contrast to audiodb_sequence_sum() above,
mas01cr@427 195 // audiodb_sequence_sqrt() and audiodb_sequence_average() below are
mas01cr@427 196 // simple mappers across the sequence.
mas01cr@427 197 static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) {
mas01cr@427 198 int w = length - seqlen + 1;
mas01cr@427 199 while(w--) {
mas01cr@427 200 *buffer = sqrt(*buffer);
mas01cr@427 201 buffer++;
mas01cr@427 202 }
mas01cr@427 203 }
mas01cr@427 204
mas01cr@427 205 static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) {
mas01cr@427 206 int w = length - seqlen + 1;
mas01cr@427 207 while(w--) {
mas01cr@427 208 *buffer /= seqlen;
mas01cr@427 209 buffer++;
mas01cr@427 210 }
mas01cr@427 211 }
mas01cr@430 212
mas01cr@430 213 static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) {
mas01cr@430 214 std::map<std::string,uint32_t>::iterator it;
mas01cr@453 215 it = adb->keymap->find(key);
mas01cr@453 216 if(it == adb->keymap->end()) {
mas01cr@430 217 return (uint32_t) -1;
mas01cr@430 218 } else {
mas01cr@430 219 return (*it).second;
mas01cr@430 220 }
mas01cr@430 221 }
mas01cr@433 222
mas01cr@458 223 static inline uint32_t audiodb_index_to_track_id(uint32_t lshid, uint32_t n_point_bits) {
mas01cr@458 224 return (lshid >> n_point_bits);
mas01cr@458 225 }
mas01cr@458 226
mas01cr@458 227 static inline uint32_t audiodb_index_to_track_pos(uint32_t lshid, uint32_t n_point_bits) {
mas01cr@458 228 return (lshid & ((1 << n_point_bits) - 1));
mas01cr@458 229 }
mas01cr@458 230
mas01cr@458 231 static inline uint32_t audiodb_index_from_trackinfo(uint32_t track_id, uint32_t track_pos, uint32_t n_point_bits) {
mas01cr@458 232 return ((track_id << n_point_bits) | track_pos);
mas01cr@458 233 }
mas01cr@458 234
mas01cr@458 235 static inline uint32_t audiodb_lsh_n_point_bits(adb_t *adb) {
mas01cr@458 236 uint32_t nbits = adb->header->flags >> 28;
mas01cr@458 237 return (nbits ? nbits : O2_DEFAULT_LSH_N_POINT_BITS);
mas01cr@458 238 }
mas01cr@458 239
mas01cr@433 240 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
mas01cr@443 241 int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *);
mas01cr@461 242 int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *);
mas01cr@443 243 int audiodb_free_datum(adb_datum_t *);
mas01cr@461 244 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
mas01cr@444 245 int audiodb_query_spec_qpointers(adb_t *, adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
mas01cr@463 246 int audiodb_query_queue_loop(adb_t *, adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *);
mas01cr@463 247 int audiodb_query_loop(adb_t *, adb_query_spec_t *, adb_qstate_internal_t *);
mas01cr@460 248 char *audiodb_index_get_name(const char *, double, uint32_t);
mas01cr@460 249 bool audiodb_index_exists(const char *, double, uint32_t);