mas01cr@590: #if defined(WIN32) mas01cr@590: #include mas01cr@590: #endif mas01cr@590: #if !defined(WIN32) mas01cr@589: #include mas01cr@590: #endif mas01cr@591: #include mas01cr@563: #include mas01cr@693: #include mas01cr@589: mas01cr@656: #include mas01cr@589: #include mas01cr@589: #include mas01cr@590: #if defined(WIN32) mas01cr@590: #include mas01cr@590: #endif mas01cr@589: #include mas01cr@589: #include mas01cr@589: #include mas01cr@563: #include mas01mc@762: #include mas01cr@590: #if defined(WIN32) mas01cr@590: #include mas01cr@590: #endif mas01cr@563: mas01cr@693: #include mas01cr@693: mas01cr@589: #include mas01cr@589: #include mas01cr@589: #include mas01cr@589: #include mas01cr@509: #include mas01cr@509: #include mas01cr@589: #include mas01cr@509: mas01cr@589: #include "accumulator.h" mas01cr@509: #include "pointpair.h" mas01cr@509: #include "lshlib.h" mas01cr@498: mas01cr@589: using namespace std; mas01cr@589: mas01cr@498: /* this struct is for writing polymorphic routines as puns. When mas01cr@498: * inserting, we might have a "datum" (with actual numerical data) or mas01cr@498: * a "reference" (with strings denoting pathnames containing numerical mas01cr@498: * data), but most of the operations are the same. This struct, used mas01cr@498: * only internally, allows us to write the main body of the insert mas01cr@498: * code only once. mas01cr@498: */ mas01cr@498: typedef struct adb_datum_internal { mas01cr@498: uint32_t nvectors; mas01cr@498: uint32_t dim; mas01cr@498: const char *key; mas01cr@498: void *data; mas01cr@498: void *times; mas01cr@498: void *power; mas01cr@498: } adb_datum_internal_t; mas01cr@498: mas01cr@498: /* this struct is to collect together a bunch of information about a mas01cr@498: * query (or, in fact, a single database entry, or even a whole mas01cr@498: * database). The _data pointers are immutable (hey, FIXME: should mas01cr@498: * they be constified in some way?) so that free() can work on them mas01cr@498: * later, while the ones without the suffix are mutable to maintain mas01cr@498: * the "current" position in some way. mean_duration points to a mas01cr@498: * (possibly single-element) array of mean durations for each track. mas01cr@498: */ mas01cr@498: typedef struct adb_qpointers_internal { mas01cr@498: uint32_t nvectors; mas01cr@498: double *l2norm_data; mas01cr@498: double *l2norm; mas01cr@498: double *power_data; mas01cr@498: double *power; mas01cr@498: double *mean_duration; mas01cr@498: } adb_qpointers_internal_t; mas01cr@498: mas01cr@509: /* this struct is the in-memory representation of the binary mas01cr@509: * information stored at the head of each adb file */ mas01cr@673: typedef struct adb_header { mas01cr@509: uint32_t magic; mas01cr@509: uint32_t version; mas01cr@509: uint32_t numFiles; mas01cr@509: uint32_t dim; mas01cr@509: uint32_t flags; mas01cr@509: uint32_t headerSize; mas01cr@509: off_t length; mas01cr@509: off_t fileTableOffset; mas01cr@509: off_t trackTableOffset; mas01cr@509: off_t dataOffset; mas01cr@509: off_t l2normTableOffset; mas01cr@509: off_t timesTableOffset; mas01cr@509: off_t powerTableOffset; mas01cr@509: off_t dbSize; mas01cr@509: } adb_header_t; mas01cr@509: mas01cr@673: #define ADB_HEADER_SIZE (sizeof(struct adb_header)) mas01cr@509: mas01cr@509: #define ADB_HEADER_FLAG_L2NORM (0x1U) mas01cr@509: #define ADB_HEADER_FLAG_POWER (0x4U) mas01cr@509: #define ADB_HEADER_FLAG_TIMES (0x20U) mas01cr@509: #define ADB_HEADER_FLAG_REFERENCES (0x40U) mas01cr@509: mas01cr@595: /* macros to make file/directory creation non-painful */ mas01cr@595: #if defined(WIN32) mas01cr@595: #define mkdir_or_goto_error(dirname) \ mas01cr@595: if(_mkdir(dirname) < 0) { \ mas01cr@595: goto error; \ mas01cr@595: } mas01cr@595: #define ADB_CREAT_PERMISSIONS (_S_IREAD|_S_IWRITE) mas01cr@595: #else mas01cr@595: #define mkdir_or_goto_error(dirname) \ mas01cr@595: if(mkdir(dirname, S_IRWXU|S_IRWXG|S_IRWXO) < 0) { \ mas01cr@595: goto error; \ mas01cr@595: } mas01cr@595: #define ADB_CREAT_PERMISSIONS (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH) mas01cr@595: #endif mas01cr@498: /* the transparent version of the opaque (forward-declared) adb_t. */ mas01cr@498: struct adb { mas01cr@498: char *path; mas01cr@498: int fd; mas01cr@498: int flags; mas01cr@498: adb_header_t *header; mas01cr@498: std::vector *keys; mas01cr@498: std::map *keymap; mas01cr@498: std::vector *track_lengths; mas01cr@498: std::vector *track_offsets; mas01cr@498: LSH *cached_lsh; mas01cr@693: gsl_rng *rng; mas01cr@498: }; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@672: return strcmp(r1.ikey, r2.ikey) < 0; mas01cr@498: } mas01cr@672: } adb_result_ikey_lt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return r1.qpos < r2.qpos; mas01cr@498: } mas01cr@498: } adb_result_qpos_lt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return r1.dist < r2.dist; mas01cr@498: } mas01cr@498: } adb_result_dist_lt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return r1.dist > r2.dist; mas01cr@498: } mas01cr@498: } adb_result_dist_gt; mas01cr@498: mas01cr@498: typedef struct { mas01cr@498: bool operator() (const adb_result_t &r1, const adb_result_t &r2) { mas01cr@498: return ((r1.ipos < r2.ipos) || mas01cr@498: ((r1.ipos == r2.ipos) && mas01cr@498: ((r1.qpos < r2.qpos) || mas01cr@672: ((r1.qpos == r2.qpos) && (strcmp(r1.ikey, r2.ikey) < 0))))); mas01cr@498: } mas01cr@498: } adb_result_triple_lt; mas01cr@498: mas01cr@610: /* this struct is for maintaining per-query state. We don't want to mas01cr@610: * store this stuff in the adb struct itself, because (a) it doesn't mas01cr@610: * belong there and (b) in principle people might do two queries in mas01cr@610: * parallel using the same adb handle. (b) is in practice a little mas01cr@610: * bit academic because at the moment we're seeking all over the disk mas01cr@610: * using adb->fd, but changing to use pread() might win us mas01cr@610: * threadsafety eventually. mas01cr@610: */ mas01cr@610: typedef struct adb_qstate_internal { mas01cr@610: Accumulator *accumulator; mas01cr@610: std::set *allowed_keys; mas01cr@610: std::priority_queue *exact_evaluation_queue; mas01cr@610: std::set< adb_result_t, adb_result_triple_lt > *set; mas01cr@610: LSH *lsh; mas01cr@672: const char *qkey; mas01cr@610: } adb_qstate_internal_t; mas01cr@610: mas01cr@498: /* We could go gcc-specific here and use typeof() instead of passing mas01cr@498: * in an explicit type. Answers on a postcard as to whether that's a mas01cr@498: * good plan or not. */ mas01cr@498: #define mmap_or_goto_error(type, var, start, length) \ mas01cr@498: { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \ mas01cr@498: if(tmp == (void *) -1) { \ mas01cr@498: goto error; \ mas01cr@498: } \ mas01cr@498: var = (type) tmp; \ mas01cr@498: } mas01cr@498: mas01cr@498: #define maybe_munmap(table, length) \ mas01cr@498: { if(table) { \ mas01cr@498: munmap(table, length); \ mas01cr@498: } \ mas01cr@498: } mas01cr@498: mas01cr@595: #define malloc_and_fill_or_goto_error(type, var, start, length) \ mas01cr@595: { void *tmp = malloc(length); \ mas01cr@595: if(tmp == NULL) { \ mas01cr@595: goto error; \ mas01cr@595: } \ mas01cr@595: var = (type) tmp; \ mas01cr@595: lseek_set_or_goto_error(adb->fd, (start)); \ mas01cr@595: read_or_goto_error(adb->fd, var, length); \ mas01cr@595: } mas01cr@595: mas01cr@595: #define maybe_free(var) \ mas01cr@595: { if(var) { \ mas01cr@595: free(var); \ mas01cr@595: } \ mas01cr@595: } mas01cr@595: mas01cr@509: #define maybe_delete_array(pointer) \ mas01cr@509: { if(pointer) { \ mas01cr@509: delete [] pointer; \ mas01cr@509: pointer = NULL; \ mas01cr@509: } \ mas01cr@509: } mas01cr@509: mas01cr@498: #define write_or_goto_error(fd, buffer, size) \ mas01cr@498: { ssize_t tmp = size; \ mas01cr@498: if(write(fd, buffer, size) != tmp) { \ mas01cr@498: goto error; \ mas01cr@498: } \ mas01cr@498: } mas01cr@498: mas01cr@498: #define read_or_goto_error(fd, buffer, size) \ mas01cr@498: { ssize_t tmp = size; \ mas01cr@498: if(read(fd, buffer, size) != tmp) { \ mas01cr@498: goto error; \ mas01cr@498: } \ mas01cr@498: } mas01cr@498: mas01cr@592: #define lseek_set_or_goto_error(fd, offset) \ mas01cr@592: { if(lseek(fd, offset, SEEK_SET) == (off_t) -1) \ mas01cr@592: goto error; \ mas01cr@592: } \ mas01cr@592: mas01cr@498: static inline int audiodb_sync_header(adb_t *adb) { mas01cr@498: off_t pos; mas01cr@498: pos = lseek(adb->fd, (off_t) 0, SEEK_CUR); mas01cr@498: if(pos == (off_t) -1) { mas01cr@498: goto error; mas01cr@498: } mas01cr@498: if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) { mas01cr@498: goto error; mas01cr@498: } mas01cr@509: if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) { mas01cr@498: goto error; mas01cr@498: } mas01cr@498: mas01cr@588: #if defined(WIN32) mas01cr@588: _commit(adb->fd); mas01cr@588: #elif defined(_POSIX_SYNCHRONIZED_IO) && (_POSIX_SYNCHRONIZED_IO > 0) mas01cr@498: fdatasync(adb->fd); mas01cr@563: #else mas01cr@563: fsync(adb->fd); mas01cr@563: #endif mas01cr@498: if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) { mas01cr@498: goto error; mas01cr@498: } mas01cr@498: return 0; mas01cr@498: mas01cr@498: error: mas01cr@498: return 1; mas01cr@498: } mas01cr@498: mas01cr@498: static inline double audiodb_dot_product(double *p, double *q, size_t count) { mas01cr@498: double result = 0; mas01cr@498: while(count--) { mas01cr@498: result += *p++ * *q++; mas01cr@498: } mas01cr@498: return result; mas01cr@498: } mas01cr@498: mas01mc@768: static inline double audiodb_kullback_leibler(double *p, double *q, size_t count) { mas01mc@768: double a,b, tmp1, tmp2, result = 0; mas01mc@768: while(count--){ mas01mc@768: a = *p++; mas01mc@768: b = *q++; mas01mc@768: tmp1 = a * log( a / b ); mas01mc@768: if(isnan(tmp1)) mas01mc@768: tmp1=0.0; mas01mc@768: tmp2 = b * log( b / a ); mas01mc@768: if(isnan(tmp2)) mas01mc@768: tmp2=0.0; mas01mc@768: result += ( tmp1 + tmp2 ) / 2.0; mas01mc@768: } mas01mc@768: return result; mas01mc@768: } mas01mc@768: mas01mc@768: mas01cr@498: static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) { mas01cr@498: while(nvectors--) { mas01cr@498: double *d1 = d; mas01cr@498: double *d2 = d; mas01cr@498: *l++ = audiodb_dot_product(d1, d2, dim); mas01cr@498: d += dim; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: // This is a common pattern in sequence queries: what we are doing is mas01cr@498: // taking a window of length seqlen over a buffer of length length, mas01cr@498: // and placing the sum of the elements in that window in the first mas01cr@498: // element of the window: thus replacing all but the last seqlen mas01cr@498: // elements in the buffer with the corresponding windowed sum. mas01cr@498: static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) { mas01cr@498: double tmp1, tmp2, *ps; mas01cr@498: int j, w; mas01cr@498: mas01cr@498: tmp1 = *buffer; mas01cr@498: j = 1; mas01cr@498: w = seqlen - 1; mas01cr@498: while(w--) { mas01cr@498: *buffer += buffer[j++]; mas01cr@498: } mas01cr@498: ps = buffer + 1; mas01cr@498: w = length - seqlen; // +1 - 1 mas01cr@498: while(w--) { mas01cr@498: tmp2 = *ps; mas01cr@498: if(isfinite(tmp1)) { mas01cr@498: *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1); mas01cr@498: } else { mas01cr@498: for(int i = 1; i < seqlen; i++) { mas01cr@498: *ps += *(ps + i); mas01cr@498: } mas01cr@498: } mas01cr@498: tmp1 = tmp2; mas01cr@498: ps++; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: // In contrast to audiodb_sequence_sum() above, mas01cr@498: // audiodb_sequence_sqrt() and audiodb_sequence_average() below are mas01cr@498: // simple mappers across the sequence. mas01cr@498: static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) { mas01cr@498: int w = length - seqlen + 1; mas01cr@498: while(w--) { mas01cr@498: *buffer = sqrt(*buffer); mas01cr@498: buffer++; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) { mas01cr@498: int w = length - seqlen + 1; mas01cr@498: while(w--) { mas01cr@498: *buffer /= seqlen; mas01cr@498: buffer++; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) { mas01cr@498: std::map::iterator it; mas01cr@498: it = adb->keymap->find(key); mas01cr@498: if(it == adb->keymap->end()) { mas01cr@498: return (uint32_t) -1; mas01cr@498: } else { mas01cr@498: return (*it).second; mas01cr@498: } mas01cr@498: } mas01cr@498: mas01cr@498: static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) { mas01cr@498: return (*adb->keys)[index].c_str(); mas01cr@498: } mas01cr@498: mas01mc@557: static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){ mas01mc@557: off_t offset = (off_t)lshid*adb->header->dim*sizeof(double); mas01cr@536: std::vector::iterator b = (*adb->track_offsets).begin(); mas01cr@536: std::vector::iterator e = (*adb->track_offsets).end(); mas01cr@536: std::vector::iterator p = std::upper_bound(b, e, offset); mas01cr@536: return p - b - 1; mas01cr@498: } mas01cr@498: mas01mc@534: static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) { mas01mc@534: uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double)); mas01mc@534: return lshid - trackIndexOffset; mas01cr@498: } mas01cr@498: mas01mc@534: static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) { mas01mc@534: uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double)); mas01mc@534: return trackIndexOffset + track_pos; mas01cr@498: } mas01cr@498: mas01cr@498: int audiodb_read_data(adb_t *, int, int, double **, size_t *); mas01cr@498: int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *); mas01cr@498: int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *); mas01cr@580: int audiodb_really_free_datum(adb_datum_t *); mas01cr@498: int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *); mas01cr@498: int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *); mas01cr@498: int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *); mas01cr@498: int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); mas01cr@498: char *audiodb_index_get_name(const char *, double, uint32_t); mas01cr@498: bool audiodb_index_exists(const char *, double, uint32_t); mas01cr@498: int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); mas01cr@509: LSH *audiodb_index_allocate(adb_t *, char *, bool); mas01cr@509: vector > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t); mas01cr@509: void audiodb_index_delete_shingles(vector > *); mas01cr@509: void audiodb_index_make_shingle(vector > *, uint32_t, double *, uint32_t, uint32_t); mas01cr@509: int audiodb_index_norm_shingles(vector > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float); mas01cr@693: int audiodb_sample_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); mas01cr@509: mas01cr@509: #define ADB_MAXSTR (512U) mas01cr@509: #define ADB_FILETABLE_ENTRY_SIZE (256U) mas01cr@509: #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t)) mas01cr@509: #define ADB_DISTANCE_TOLERANCE (1e-6) mas01cr@509: mas01mc@768: #define ADB_DEFAULT_DATASIZE (2000U) /* in MB */ mas01mc@768: #define ADB_DEFAULT_NTRACKS (200000U) mas01mc@768: #define ADB_DEFAULT_DATADIM (12U) mas01cr@509: mas01cr@509: #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1) mas01cr@509: #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1) mas01cr@509: mas01cr@509: #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24) mas01cr@509: #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24) mas01cr@509: #define ADB_FORMAT_VERSION (4U) mas01cr@509: mas01cr@509: #define align_up(x,w) (((x) + ((1<