mas01cr@509: #include mas01cr@509: #include mas01cr@509: #include mas01cr@509: #include mas01cr@509: mas01cr@509: #include "pointpair.h" mas01cr@498: #include "accumulator.h" mas01cr@509: #include "lshlib.h" mas01cr@498: mas01cr@498: /* this struct is for writing polymorphic routines as puns. When mas01cr@498: * inserting, we might have a "datum" (with actual numerical data) or mas01cr@498: * a "reference" (with strings denoting pathnames containing numerical mas01cr@498: * data), but most of the operations are the same. This struct, used mas01cr@498: * only internally, allows us to write the main body of the insert mas01cr@498: * code only once. mas01cr@498: */ mas01cr@498: typedef struct adb_datum_internal { mas01cr@498: uint32_t nvectors; mas01cr@498: uint32_t dim; mas01cr@498: const char *key; mas01cr@498: void *data; mas01cr@498: void *times; mas01cr@498: void *power; mas01cr@498: } adb_datum_internal_t; mas01cr@498: mas01cr@498: /* this struct is to collect together a bunch of information about a mas01cr@498: * query (or, in fact, a single database entry, or even a whole mas01cr@498: * database). The _data pointers are immutable (hey, FIXME: should mas01cr@498: * they be constified in some way?) so that free() can work on them mas01cr@498: * later, while the ones without the suffix are mutable to maintain mas01cr@498: * the "current" position in some way. mean_duration points to a mas01cr@498: * (possibly single-element) array of mean durations for each track. mas01cr@498: */ mas01cr@498: typedef struct adb_qpointers_internal { mas01cr@498: uint32_t nvectors; mas01cr@498: double *l2norm_data; mas01cr@498: double *l2norm; mas01cr@498: double *power_data; mas01cr@498: double *power; mas01cr@498: double *mean_duration; mas01cr@498: } adb_qpointers_internal_t; mas01cr@498: mas01cr@498: /* this struct is for maintaining per-query state. We don't want to mas01cr@498: * store this stuff in the adb struct itself, because (a) it doesn't mas01cr@498: * belong there and (b) in principle people might do two queries in mas01cr@498: * parallel using the same adb handle. (b) is in practice a little mas01cr@498: * bit academic because at the moment we're seeking all over the disk mas01cr@498: * using adb->fd, but changing to use pread() might win us mas01cr@498: * threadsafety eventually. mas01cr@498: */ mas01cr@498: typedef struct adb_qstate_internal { mas01cr@498: Accumulator *accumulator; mas01cr@498: std::set *allowed_keys; mas01mc@530: std::set > * set; mas01mc@530: std::priority_queue, greater > *exact_evaluation_queue; mas01cr@498: LSH *lsh; mas01cr@498: } adb_qstate_internal_t; mas01cr@498: mas01mc@541: /* this struct is for caching file descriptors for multiple reads from a data file mas01mc@541: */ mas01mc@541: mas01mc@541: typedef struct adb_fd_cache { mas01mc@541: uint32_t track_id; mas01mc@541: adb_reference_t* reference; mas01mc@541: char* fname; mas01mc@541: int data_fd; mas01mc@541: int power_fd; mas01mc@541: FILE* times_file; mas01mc@541: } adb_fd_cache_t; mas01mc@541: mas01cr@509: /* this struct is the in-memory representation of the binary mas01cr@509: * information stored at the head of each adb file */ mas01cr@509: typedef struct adbheader { mas01cr@509: uint32_t magic; mas01cr@509: uint32_t version; mas01cr@509: uint32_t numFiles; mas01cr@509: uint32_t dim; mas01cr@509: uint32_t flags; mas01cr@509: uint32_t headerSize; mas01cr@509: off_t length; mas01cr@509: off_t fileTableOffset; mas01cr@509: off_t trackTableOffset; mas01cr@509: off_t dataOffset; mas01cr@509: off_t l2normTableOffset; mas01cr@509: off_t timesTableOffset; mas01cr@509: off_t powerTableOffset; mas01cr@509: off_t dbSize; mas01cr@509: } adb_header_t; mas01cr@509: mas01cr@509: #define ADB_HEADER_SIZE (sizeof(struct adbheader)) mas01cr@509: mas01cr@509: #define ADB_HEADER_FLAG_L2NORM (0x1U) mas01cr@509: #define ADB_HEADER_FLAG_POWER (0x4U) mas01cr@509: #define ADB_HEADER_FLAG_TIMES (0x20U) mas01cr@509: #define ADB_HEADER_FLAG_REFERENCES (0x40U) mas01cr@509: mas01cr@498: /* the transparent version of the opaque (forward-declared) adb_t. */ mas01cr@498: struct adb { mas01cr@498: char *path; mas01cr@498: int fd; mas01cr@498: int flags; mas01cr@498: adb_header_t *header; mas01cr@498: std::vector *keys; mas01cr@498: std::map *keymap; mas01cr@498: std::vector *track_lengths; mas01cr@498: std::vector *track_offsets; mas01cr@498: LSH *cached_lsh; mas01cr@498: }; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return strcmp(r1.key, r2.key) < 0; mas01cr@498: } mas01cr@498: } adb_result_key_lt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return r1.qpos < r2.qpos; mas01cr@498: } mas01cr@498: } adb_result_qpos_lt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return r1.dist < r2.dist; mas01cr@498: } mas01cr@498: } adb_result_dist_lt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return r1.dist > r2.dist; mas01cr@498: } mas01cr@498: } adb_result_dist_gt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return ((r1.ipos < r2.ipos) || mas01cr@498: ((r1.ipos == r2.ipos) && mas01cr@498: ((r1.qpos < r2.qpos) || mas01cr@498: ((r1.qpos == r2.qpos) && (strcmp(r1.key, r2.key) < 0))))); mas01cr@498: } mas01cr@498: } adb_result_triple_lt; mas01cr@498: mas01cr@498: /* We could go gcc-specific here and use typeof() instead of passing mas01cr@498: * in an explicit type. Answers on a postcard as to whether that's a mas01cr@498: * good plan or not. */ mas01cr@498: #define mmap_or_goto_error(type, var, start, length) \ mas01cr@498: { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \ mas01cr@498: if(tmp == (void *) -1) { \ mas01cr@498: goto error; \ mas01cr@498: } \ mas01cr@498: var = (type) tmp; \ mas01cr@498: } mas01cr@498: mas01cr@498: #define maybe_munmap(table, length) \ mas01cr@498: { if(table) { \ mas01cr@498: munmap(table, length); \ mas01cr@498: } \ mas01cr@498: } mas01cr@498: mas01cr@509: #define maybe_delete_array(pointer) \ mas01cr@509: { if(pointer) { \ mas01cr@509: delete [] pointer; \ mas01cr@509: pointer = NULL; \ mas01cr@509: } \ mas01cr@509: } mas01cr@509: mas01cr@498: #define write_or_goto_error(fd, buffer, size) \ mas01cr@498: { ssize_t tmp = size; \ mas01cr@498: if(write(fd, buffer, size) != tmp) { \ mas01cr@498: goto error; \ mas01cr@498: } \ mas01cr@498: } mas01cr@498: mas01cr@498: #define read_or_goto_error(fd, buffer, size) \ mas01cr@498: { ssize_t tmp = size; \ mas01cr@498: if(read(fd, buffer, size) != tmp) { \ mas01cr@498: goto error; \ mas01cr@498: } \ mas01cr@498: } mas01cr@498: mas01cr@498: static inline int audiodb_sync_header(adb_t *adb) { mas01cr@498: off_t pos; mas01cr@498: pos = lseek(adb->fd, (off_t) 0, SEEK_CUR); mas01cr@498: if(pos == (off_t) -1) { mas01cr@498: goto error; mas01cr@498: } mas01cr@498: if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) { mas01cr@498: goto error; mas01cr@498: } mas01cr@509: if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) { mas01cr@498: goto error; mas01cr@498: } mas01cr@498: mas01cr@498: /* can be fsync() if fdatasync() is racily exciting and new */ mas01cr@498: fdatasync(adb->fd); mas01cr@498: if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) { mas01cr@498: goto error; mas01cr@498: } mas01cr@498: return 0; mas01cr@498: mas01cr@498: error: mas01cr@498: return 1; mas01cr@498: } mas01cr@498: mas01cr@498: static inline double audiodb_dot_product(double *p, double *q, size_t count) { mas01cr@498: double result = 0; mas01cr@498: while(count--) { mas01cr@498: result += *p++ * *q++; mas01cr@498: } mas01cr@498: return result; mas01cr@498: } mas01cr@498: mas01cr@498: static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) { mas01cr@498: while(nvectors--) { mas01cr@498: double *d1 = d; mas01cr@498: double *d2 = d; mas01cr@498: *l++ = audiodb_dot_product(d1, d2, dim); mas01cr@498: d += dim; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: // This is a common pattern in sequence queries: what we are doing is mas01cr@498: // taking a window of length seqlen over a buffer of length length, mas01cr@498: // and placing the sum of the elements in that window in the first mas01cr@498: // element of the window: thus replacing all but the last seqlen mas01cr@498: // elements in the buffer with the corresponding windowed sum. mas01cr@498: static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) { mas01cr@498: double tmp1, tmp2, *ps; mas01cr@498: int j, w; mas01cr@498: mas01cr@498: tmp1 = *buffer; mas01cr@498: j = 1; mas01cr@498: w = seqlen - 1; mas01cr@498: while(w--) { mas01cr@498: *buffer += buffer[j++]; mas01cr@498: } mas01cr@498: ps = buffer + 1; mas01cr@498: w = length - seqlen; // +1 - 1 mas01cr@498: while(w--) { mas01cr@498: tmp2 = *ps; mas01cr@498: if(isfinite(tmp1)) { mas01cr@498: *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1); mas01cr@498: } else { mas01cr@498: for(int i = 1; i < seqlen; i++) { mas01cr@498: *ps += *(ps + i); mas01cr@498: } mas01cr@498: } mas01cr@498: tmp1 = tmp2; mas01cr@498: ps++; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: // In contrast to audiodb_sequence_sum() above, mas01cr@498: // audiodb_sequence_sqrt() and audiodb_sequence_average() below are mas01cr@498: // simple mappers across the sequence. mas01cr@498: static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) { mas01cr@498: int w = length - seqlen + 1; mas01cr@498: while(w--) { mas01cr@498: *buffer = sqrt(*buffer); mas01cr@498: buffer++; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) { mas01cr@498: int w = length - seqlen + 1; mas01cr@498: while(w--) { mas01cr@498: *buffer /= seqlen; mas01cr@498: buffer++; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) { mas01cr@498: std::map::iterator it; mas01cr@498: it = adb->keymap->find(key); mas01cr@498: if(it == adb->keymap->end()) { mas01cr@498: return (uint32_t) -1; mas01cr@498: } else { mas01cr@498: return (*it).second; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) { mas01cr@498: return (*adb->keys)[index].c_str(); mas01cr@498: } mas01cr@498: mas01mc@532: static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){ mas01mc@556: off_t offset = (off_t)lshid*adb->header->dim*sizeof(double); mas01mc@538: std::vector::iterator b = (*adb->track_offsets).begin(); mas01mc@538: std::vector::iterator e = (*adb->track_offsets).end(); mas01mc@538: std::vector::iterator p = std::upper_bound(b, e, offset); mas01mc@538: return p - b - 1; mas01cr@498: } mas01cr@498: mas01mc@532: static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) { mas01mc@532: uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double)); mas01mc@532: return lshid - trackIndexOffset; mas01cr@498: } mas01cr@498: mas01mc@532: static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) { mas01mc@532: uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double)); mas01mc@532: return trackIndexOffset + track_pos; mas01cr@498: } mas01cr@498: mas01cr@498: int audiodb_read_data(adb_t *, int, int, double **, size_t *); mas01mc@546: int audiodb_insert_create_datum_offset(adb_insert_t *, adb_datum_t *, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache=NULL); mas01mc@546: int audiodb_track_id_datum_offset(adb_t *, uint32_t , adb_datum_t *, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache=NULL); mas01mc@545: int audiodb_insert_create_datum(adb_insert_t * insert, adb_datum_t *datum); mas01mc@545: int audiodb_track_id_datum(adb_t * adb, uint32_t track_id, adb_datum_t *datum); mas01cr@498: int audiodb_free_datum(adb_datum_t *); mas01mc@541: int audiodb_free_datum_cache(adb_fd_cache_t *); mas01mc@541: int audiodb_free_datum_reference(adb_reference_t * reference); mas01cr@498: int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *); mas01mc@528: int audiodb_datum_qpointers_partial(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *, adb_qstate_internal_t *); mas01cr@498: int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *); mas01cr@498: int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *); mas01cr@498: int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); mas01cr@498: char *audiodb_index_get_name(const char *, double, uint32_t); mas01cr@498: bool audiodb_index_exists(const char *, double, uint32_t); mas01cr@498: int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); mas01cr@509: LSH *audiodb_index_allocate(adb_t *, char *, bool); mas01cr@509: vector > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t); mas01cr@509: void audiodb_index_delete_shingles(vector > *); mas01cr@509: void audiodb_index_make_shingle(vector > *, uint32_t, double *, uint32_t, uint32_t); mas01cr@509: int audiodb_index_norm_shingles(vector > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float); mas01cr@509: mas01cr@509: #define ADB_MAXSTR (512U) mas01cr@509: #define ADB_FILETABLE_ENTRY_SIZE (256U) mas01cr@509: #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t)) mas01cr@509: #define ADB_DISTANCE_TOLERANCE (1e-6) mas01cr@509: mas01cr@509: #define ADB_DEFAULT_DATASIZE (1355U) /* in MB */ mas01cr@509: #define ADB_DEFAULT_NTRACKS (20000U) mas01cr@509: #define ADB_DEFAULT_DATADIM (9U) mas01cr@509: mas01cr@509: #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1) mas01cr@509: #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1) mas01cr@509: mas01cr@509: #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24) mas01cr@509: #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24) mas01cr@509: #define ADB_FORMAT_VERSION (4U) mas01cr@509: mas01cr@509: #define align_up(x,w) (((x) + ((1<