Mercurial > hg > audiodb
view audioDB-internals.h @ 634:37fc7411e1ef
Actually test for retrieve behaviour in sb-alien bindings
Motivated by Jamie's bug report that it doesn't in fact work at all on
32-bit platforms. (Ticket #32 in audioDB trac)
author | mas01cr |
---|---|
date | Tue, 29 Sep 2009 16:23:39 +0000 |
parents | e21a3db643af |
children | b82b90c6445e |
line wrap: on
line source
#if defined(WIN32) #include <sys/locking.h> #endif #if !defined(WIN32) #include <sys/mman.h> #endif #include <sys/stat.h> #include <sys/types.h> #include <errno.h> #include <fcntl.h> #if defined(WIN32) #include <io.h> #endif #include <limits.h> #include <math.h> #include <string.h> #include <unistd.h> #if defined(WIN32) #include <windows.h> #endif #include <algorithm> #include <iostream> #include <map> #include <queue> #include <set> #include <string> #include <vector> #include "accumulator.h" #include "pointpair.h" #include "lshlib.h" using namespace std; /* this struct is for writing polymorphic routines as puns. When * inserting, we might have a "datum" (with actual numerical data) or * a "reference" (with strings denoting pathnames containing numerical * data), but most of the operations are the same. This struct, used * only internally, allows us to write the main body of the insert * code only once. */ typedef struct adb_datum_internal { uint32_t nvectors; uint32_t dim; const char *key; void *data; void *times; void *power; } adb_datum_internal_t; /* this struct is to collect together a bunch of information about a * query (or, in fact, a single database entry, or even a whole * database). The _data pointers are immutable (hey, FIXME: should * they be constified in some way?) so that free() can work on them * later, while the ones without the suffix are mutable to maintain * the "current" position in some way. mean_duration points to a * (possibly single-element) array of mean durations for each track. */ typedef struct adb_qpointers_internal { uint32_t nvectors; double *l2norm_data; double *l2norm; double *power_data; double *power; double *mean_duration; } adb_qpointers_internal_t; /* this struct is the in-memory representation of the binary * information stored at the head of each adb file */ typedef struct adbheader { uint32_t magic; uint32_t version; uint32_t numFiles; uint32_t dim; uint32_t flags; uint32_t headerSize; off_t length; off_t fileTableOffset; off_t trackTableOffset; off_t dataOffset; off_t l2normTableOffset; off_t timesTableOffset; off_t powerTableOffset; off_t dbSize; } adb_header_t; #define ADB_HEADER_SIZE (sizeof(struct adbheader)) #define ADB_HEADER_FLAG_L2NORM (0x1U) #define ADB_HEADER_FLAG_POWER (0x4U) #define ADB_HEADER_FLAG_TIMES (0x20U) #define ADB_HEADER_FLAG_REFERENCES (0x40U) /* macros to make file/directory creation non-painful */ #if defined(WIN32) #define mkdir_or_goto_error(dirname) \ if(_mkdir(dirname) < 0) { \ goto error; \ } #define ADB_CREAT_PERMISSIONS (_S_IREAD|_S_IWRITE) #else #define mkdir_or_goto_error(dirname) \ if(mkdir(dirname, S_IRWXU|S_IRWXG|S_IRWXO) < 0) { \ goto error; \ } #define ADB_CREAT_PERMISSIONS (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH) #endif /* the transparent version of the opaque (forward-declared) adb_t. */ struct adb { char *path; int fd; int flags; adb_header_t *header; std::vector<std::string> *keys; std::map<std::string,uint32_t> *keymap; std::vector<uint32_t> *track_lengths; std::vector<off_t> *track_offsets; LSH *cached_lsh; }; typedef struct { bool operator() (const adb_result_t &r1, const adb_result_t &r2) { return strcmp(r1.key, r2.key) < 0; } } adb_result_key_lt; typedef struct { bool operator() (const adb_result_t &r1, const adb_result_t &r2) { return r1.qpos < r2.qpos; } } adb_result_qpos_lt; typedef struct { bool operator() (const adb_result_t &r1, const adb_result_t &r2) { return r1.dist < r2.dist; } } adb_result_dist_lt; typedef struct { bool operator() (const adb_result_t &r1, const adb_result_t &r2) { return r1.dist > r2.dist; } } adb_result_dist_gt; typedef struct { bool operator() (const adb_result_t &r1, const adb_result_t &r2) { return ((r1.ipos < r2.ipos) || ((r1.ipos == r2.ipos) && ((r1.qpos < r2.qpos) || ((r1.qpos == r2.qpos) && (strcmp(r1.key, r2.key) < 0))))); } } adb_result_triple_lt; /* this struct is for maintaining per-query state. We don't want to * store this stuff in the adb struct itself, because (a) it doesn't * belong there and (b) in principle people might do two queries in * parallel using the same adb handle. (b) is in practice a little * bit academic because at the moment we're seeking all over the disk * using adb->fd, but changing to use pread() might win us * threadsafety eventually. */ typedef struct adb_qstate_internal { Accumulator *accumulator; std::set<std::string> *allowed_keys; std::priority_queue<PointPair> *exact_evaluation_queue; std::set< adb_result_t, adb_result_triple_lt > *set; LSH *lsh; } adb_qstate_internal_t; /* We could go gcc-specific here and use typeof() instead of passing * in an explicit type. Answers on a postcard as to whether that's a * good plan or not. */ #define mmap_or_goto_error(type, var, start, length) \ { void *tmp = mmap(0, length, PROT_READ, MAP_SHARED, adb->fd, (start)); \ if(tmp == (void *) -1) { \ goto error; \ } \ var = (type) tmp; \ } #define maybe_munmap(table, length) \ { if(table) { \ munmap(table, length); \ } \ } #define malloc_and_fill_or_goto_error(type, var, start, length) \ { void *tmp = malloc(length); \ if(tmp == NULL) { \ goto error; \ } \ var = (type) tmp; \ lseek_set_or_goto_error(adb->fd, (start)); \ read_or_goto_error(adb->fd, var, length); \ } #define maybe_free(var) \ { if(var) { \ free(var); \ } \ } #define maybe_delete_array(pointer) \ { if(pointer) { \ delete [] pointer; \ pointer = NULL; \ } \ } #define write_or_goto_error(fd, buffer, size) \ { ssize_t tmp = size; \ if(write(fd, buffer, size) != tmp) { \ goto error; \ } \ } #define read_or_goto_error(fd, buffer, size) \ { ssize_t tmp = size; \ if(read(fd, buffer, size) != tmp) { \ goto error; \ } \ } #define lseek_set_or_goto_error(fd, offset) \ { if(lseek(fd, offset, SEEK_SET) == (off_t) -1) \ goto error; \ } \ static inline int audiodb_sync_header(adb_t *adb) { off_t pos; pos = lseek(adb->fd, (off_t) 0, SEEK_CUR); if(pos == (off_t) -1) { goto error; } if(lseek(adb->fd, (off_t) 0, SEEK_SET) == (off_t) -1) { goto error; } if(write(adb->fd, adb->header, ADB_HEADER_SIZE) != ADB_HEADER_SIZE) { goto error; } #if defined(WIN32) _commit(adb->fd); #elif defined(_POSIX_SYNCHRONIZED_IO) && (_POSIX_SYNCHRONIZED_IO > 0) fdatasync(adb->fd); #else fsync(adb->fd); #endif if(lseek(adb->fd, pos, SEEK_SET) == (off_t) -1) { goto error; } return 0; error: return 1; } static inline double audiodb_dot_product(double *p, double *q, size_t count) { double result = 0; while(count--) { result += *p++ * *q++; } return result; } static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) { while(nvectors--) { double *d1 = d; double *d2 = d; *l++ = audiodb_dot_product(d1, d2, dim); d += dim; } } // This is a common pattern in sequence queries: what we are doing is // taking a window of length seqlen over a buffer of length length, // and placing the sum of the elements in that window in the first // element of the window: thus replacing all but the last seqlen // elements in the buffer with the corresponding windowed sum. static inline void audiodb_sequence_sum(double *buffer, int length, int seqlen) { double tmp1, tmp2, *ps; int j, w; tmp1 = *buffer; j = 1; w = seqlen - 1; while(w--) { *buffer += buffer[j++]; } ps = buffer + 1; w = length - seqlen; // +1 - 1 while(w--) { tmp2 = *ps; if(isfinite(tmp1)) { *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1); } else { for(int i = 1; i < seqlen; i++) { *ps += *(ps + i); } } tmp1 = tmp2; ps++; } } // In contrast to audiodb_sequence_sum() above, // audiodb_sequence_sqrt() and audiodb_sequence_average() below are // simple mappers across the sequence. static inline void audiodb_sequence_sqrt(double *buffer, int length, int seqlen) { int w = length - seqlen + 1; while(w--) { *buffer = sqrt(*buffer); buffer++; } } static inline void audiodb_sequence_average(double *buffer, int length, int seqlen) { int w = length - seqlen + 1; while(w--) { *buffer /= seqlen; buffer++; } } static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) { std::map<std::string,uint32_t>::iterator it; it = adb->keymap->find(key); if(it == adb->keymap->end()) { return (uint32_t) -1; } else { return (*it).second; } } static inline const char *audiodb_index_key(adb_t *adb, uint32_t index) { return (*adb->keys)[index].c_str(); } static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){ off_t offset = (off_t)lshid*adb->header->dim*sizeof(double); std::vector<off_t>::iterator b = (*adb->track_offsets).begin(); std::vector<off_t>::iterator e = (*adb->track_offsets).end(); std::vector<off_t>::iterator p = std::upper_bound(b, e, offset); return p - b - 1; } static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) { uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double)); return lshid - trackIndexOffset; } static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) { uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double)); return trackIndexOffset + track_pos; } int audiodb_read_data(adb_t *, int, int, double **, size_t *); int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *); int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *); int audiodb_really_free_datum(adb_datum_t *); int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *); int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *); int audiodb_query_queue_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *, double *, adb_qpointers_internal_t *); int audiodb_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); char *audiodb_index_get_name(const char *, double, uint32_t); bool audiodb_index_exists(const char *, double, uint32_t); int audiodb_index_query_loop(adb_t *, const adb_query_spec_t *, adb_qstate_internal_t *); LSH *audiodb_index_allocate(adb_t *, char *, bool); vector<vector<float> > *audiodb_index_initialize_shingles(uint32_t, uint32_t, uint32_t); void audiodb_index_delete_shingles(vector<vector<float> > *); void audiodb_index_make_shingle(vector<vector<float> > *, uint32_t, double *, uint32_t, uint32_t); int audiodb_index_norm_shingles(vector<vector<float> > *, double *, double *, uint32_t, uint32_t, double, bool, bool, float); #define ADB_MAXSTR (512U) #define ADB_FILETABLE_ENTRY_SIZE (256U) #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t)) #define ADB_DISTANCE_TOLERANCE (1e-6) #define ADB_DEFAULT_DATASIZE (1355U) /* in MB */ #define ADB_DEFAULT_NTRACKS (20000U) #define ADB_DEFAULT_DATADIM (9U) #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1) #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1) #define ADB_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24) #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24) #define ADB_FORMAT_VERSION (4U) #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1)) #define align_down(x,w) ((x) & ~((1<<w)-1)) #if defined(WIN32) #define getpagesize() (64*1024) #endif #define align_page_up(x) (((x) + (getpagesize()-1)) & ~(getpagesize()-1)) #define align_page_down(x) ((x) & ~(getpagesize()-1))