Mercurial > hg > audiodb
changeset 752:1731a6c457d7 fewerQueryDatumReads
Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author | mas01mc |
---|---|
date | Thu, 25 Nov 2010 13:42:40 +0000 |
parents | bf89c80ec4cc |
children | |
files | Makefile audioDB-internals.h audioDB.cpp insert.cpp lshlib.cpp lshlib.h query.cpp |
diffstat | 7 files changed, 191 insertions(+), 54 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Sun Feb 08 15:53:57 2009 +0000 +++ b/Makefile Thu Nov 25 13:42:40 2010 +0000 @@ -17,7 +17,7 @@ MINORVERSION=0 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION) -override CFLAGS+=-g -O3 -fPIC +override CFLAGS+=-g -ggdb -fPIC # set to generate profile (gprof) and coverage (gcov) info #override CFLAGS+=-fprofile-arcs -ftest-coverage -pg
--- a/audioDB-internals.h Sun Feb 08 15:53:57 2009 +0000 +++ b/audioDB-internals.h Thu Nov 25 13:42:40 2010 +0000 @@ -284,7 +284,7 @@ } static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){ - off_t offset = lshid*adb->header->dim*sizeof(double); + off_t offset = (off_t)lshid*adb->header->dim*sizeof(double); std::vector<off_t>::iterator b = (*adb->track_offsets).begin(); std::vector<off_t>::iterator e = (*adb->track_offsets).end(); std::vector<off_t>::iterator p = std::upper_bound(b, e, offset); @@ -302,8 +302,8 @@ } int audiodb_read_data(adb_t *, int, int, double **, size_t *); -int audiodb_insert_create_datum_offset(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size, adb_fd_cache_t * cache=NULL); -int audiodb_track_id_datum_offset(adb_t *, uint32_t , adb_datum_t *, off_t vector_offset, size_t vector_size, adb_fd_cache_t * cache=NULL); +int audiodb_insert_create_datum_offset(adb_insert_t *, adb_datum_t *, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache=NULL); +int audiodb_track_id_datum_offset(adb_t *, uint32_t , adb_datum_t *, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache=NULL); int audiodb_insert_create_datum(adb_insert_t * insert, adb_datum_t *datum); int audiodb_track_id_datum(adb_t * adb, uint32_t track_id, adb_datum_t *datum); int audiodb_free_datum(adb_datum_t *);
--- a/audioDB.cpp Sun Feb 08 15:53:57 2009 +0000 +++ b/audioDB.cpp Thu Nov 25 13:42:40 2010 +0000 @@ -873,10 +873,7 @@ qspec.qid.flags = 0; qspec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE; qspec.qid.flags |= lsh_exact ? 0 : ADB_QID_FLAG_ALLOW_FALSE_POSITIVES; - if(!query_from_key && usingQueryPoint) // FIXME: query_from_key uses qspec.qid.sequence_start to locate data - qspec.qid.sequence_start = 0; - else - qspec.qid.sequence_start = queryPoint; + qspec.qid.sequence_start = queryPoint; switch(queryType) { case O2_POINT_QUERY:
--- a/insert.cpp Sun Feb 08 15:53:57 2009 +0000 +++ b/insert.cpp Thu Nov 25 13:42:40 2010 +0000 @@ -337,7 +337,7 @@ return audiodb_insert_create_datum_offset(insert, datum, 0, 0, 0); } -int audiodb_insert_create_datum_offset(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset, size_t data_size, adb_fd_cache_t *cache) { +int audiodb_insert_create_datum_offset(adb_insert_t *insert, adb_datum_t *datum, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache) { int fd = 0; FILE *file = NULL; struct stat st; @@ -351,8 +351,9 @@ } // STEP 1 check if we need to clear the cache - if(cache && (cache->fname && strncmp(cache->fname, insert->features, strlen(insert->features))!=0)) + if(cache && (cache->fname && strncmp(cache->fname, insert->features, strlen(insert->features))!=0)){ clear_cache = true; + } // STEP 2. Clear the cache if necessary if(cache && clear_cache){ @@ -388,10 +389,12 @@ } // STEP 5. Allocate data memory if necessary, read the requested amount of data - if(data_size) - size = data_size; - else + if(num_vectors){ + size = num_vectors*datum->dim*sizeof(double); + } + else{ size = st.st_size - sizeof(uint32_t); + } datum->nvectors = size / (sizeof(double) * datum->dim); @@ -403,13 +406,15 @@ goto error; } - if(data_offset) - lseek(fd, sizeof(uint32_t) + data_offset, SEEK_SET); + if(vector_offset){ + lseek(fd, sizeof(uint32_t) + vector_offset*datum->dim*sizeof(double), SEEK_SET); + } read_or_goto_error(fd, datum->data, size); // STEP 6. Close the file descriptor, unless we are caching it - if(!cache) + if(!cache){ close(fd); + } fd = 0; // we're done with the data if(insert->power) { @@ -422,8 +427,9 @@ } // Use the cached file descriptor or open a new file descriptor - if (cache && cache->power_fd) + if (cache && cache->power_fd){ fd = cache->power_fd; + } else if((fd = open(insert->power, O_RDONLY)) == -1) { goto error; } @@ -459,7 +465,7 @@ * I hate C. */ - if( (!data_size) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) { + if( (!num_vectors) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) { goto error; } @@ -469,8 +475,9 @@ if(dim != 1) { goto error; } - if(cache) + if(cache){ cache->power_fd = fd; + } } // Allocate data memory if necessary, read the requested amount of data @@ -480,13 +487,15 @@ goto error; } - if(data_offset) - lseek(fd, sizeof(uint32_t) + data_offset/datum->dim, SEEK_SET); + if(vector_offset){ + lseek(fd, sizeof(uint32_t) + vector_offset*sizeof(double), SEEK_SET); + } read_or_goto_error(fd, datum->power, size / datum->dim); - if(!cache) + if(!cache){ close(fd); + } fd = 0; } @@ -500,31 +509,38 @@ } // Use the cached file descriptor or open a new file descriptor and maybe cache - if (cache && cache->times_file) + if (cache && cache->times_file){ file = cache->times_file; + } else{ if(!(file = fopen(insert->times, "r"))) { goto error; } - if(cache) + if(cache){ cache->times_file = file; + } } // Allocate data memory if necessary, read the requested amount of data - if(!datum->times) + if(!datum->times){ datum->times = (double *) malloc(2 * size / datum->dim); + } if(!datum->times) { goto error; } rewind(file); + if(fscanf(file, " %lf", &t) != 1) { goto error; } - if(data_offset) - while(data_offset-- != 1 ) - if(fscanf(file, " %lf", &t) != 1) + if(vector_offset){ + while(vector_offset-- != 1 ){ + if(fscanf(file, " %lf", &t) != 1){ goto error; + } + } + } tp = datum->times; *tp++ = t; for(unsigned int n = 0; n < datum->nvectors - 1; n++) {
--- a/lshlib.cpp Sun Feb 08 15:53:57 2009 +0000 +++ b/lshlib.cpp Thu Nov 25 13:42:40 2010 +0000 @@ -1507,7 +1507,7 @@ if(!pointFound) error("State machine error: point", "unserialize_hashtable_row_format2()"); H::t2 = H::p; // Copy last found token to t2 - } + } return H::t2; // holds current token } @@ -1657,7 +1657,7 @@ // Retrieval is performed by generating a hash key query_t2 for query point q // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry // point for retrieve_from_core_hashtable_array -#define SKIP_BITS (0xC0000000) +#define SKIP_BITS (0xC0000000U) void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){ Uns32T skip; Uns32T t2; @@ -1674,9 +1674,9 @@ p2 = *p++; skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB); if( t2 == H::t2 ){ - add_point_callback(calling_instance, p1 ^ (p1 & SKIP_BITS), qpos, radius); + add_point_callback(calling_instance, p1 & ~SKIP_BITS, qpos, radius); if(skip--){ - add_point_callback(calling_instance, p2 ^ (p2 & SKIP_BITS), qpos, radius); + add_point_callback(calling_instance, p2 & ~SKIP_BITS, qpos, radius); while(skip-- ) add_point_callback(calling_instance, *p++, qpos, radius); } @@ -1710,6 +1710,124 @@ } } + void G::dump_core_row(Uns32T n){ + if(!(n<H::N)){ + printf("ROW OUT OF RANGE:%d (MAX:%d)\n", n, H::N-1); + return; + } + for(Uns32T j = 0 ; j < H::L ; j++ ){ + bucket* bPtr = h[j][n]; + if(bPtr){ + printf("C[%d,%d]", j, n); +#ifdef LSH_LIST_HEAD_COUNTERS + printf("[numBuckets=%d]",bPtr->snext.numBuckets); + if(bPtr->t2&LSH_CORE_ARRAY_BIT) { + dump_core_hashtable_array((Uns32T*)(bPtr->next)); + } + else { + dump_hashtable_row(bPtr->next); + } +#else + dump_hashtable_row(bPtr); +#endif + printf("\n"); + } + } + } + + void G::dump_disk_row(char* filename, Uns32T n){ + int dbfid = unserialize_lsh_header(filename); + if(dbfid<0){ + cerr << "Could not read header from " << filename << endl; + return; + } + FILE* dbFile = 0; + dbFile = fdopen(dbfid, "rb"); + if(!dbFile){ + cerr << "Could not create FILE pointer from file:" << filename << " with fid:" << dbfid << endl; + close(dbfid); + return; + } + + Uns32T x=0,y=0; + + // Seek to hashtable base offset + if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ + fclose(dbFile); + error("fSeek error in unserialize_lsh_hashtables_format2"); + } + Uns32T token = 0; + Uns32T pointID; + + // Read the hash tables into core (structure is given in header) + while( x < H::L){ + y=0; + if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error T1","unserialize_lsh_hashtables_format2()"); + } + while(token != O2_SERIAL_TOKEN_ENDTABLE){ + if(token == O2_SERIAL_TOKEN_T1){ + if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error t1","unserialize_lsh_hashtables_format2()"); + } + y=token; + if(y==n){ + printf("D[%d,%d]", x, y); + if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error 2","unserialize_lsh_hashtables_format2()"); + } + printf("[numElements=%d]", token); + if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error 3","unserialize_lsh_hashtables_format2()"); + } + while(!(token==O2_SERIAL_TOKEN_ENDTABLE || token==O2_SERIAL_TOKEN_T1)){ + // Check for T2 token + if(token!=O2_SERIAL_TOKEN_T2){ + printf("t2=%d",token); + fclose(dbFile); + error("State machine error T2 token", "unserialize_hashtable_row_format2()"); + } + // Read t2 value + if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error t2","unserialize_hashtable_row_format2"); + } + if(fread(&pointID, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error pointID","unserialize_hashtable_row_format2"); + } + while(!(pointID==O2_SERIAL_TOKEN_ENDTABLE || pointID==O2_SERIAL_TOKEN_T1 || pointID==O2_SERIAL_TOKEN_T2 )){ + printf("(%0X,%u)", token, pointID); + if(fread(&pointID, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error H::p","unserialize_hashtable_row_format2"); + } + } + token = pointID; // Copy last found token + } + printf("\n"); + } + else{ // gobble up rest of row + while(!(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE)){ + if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){ + fclose(dbFile); + error("Read error 4","unserialize_lsh_hashtables_format2()"); + } + } + } + } + } + if(token==O2_SERIAL_TOKEN_ENDTABLE){ + x++; + } + } + close(dbfid); + } + void G::dump_core_hashtable_array(Uns32T* p){ Uns32T skip; Uns32T t2; @@ -1721,11 +1839,11 @@ p1 = *p++; p2 = *p++; skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB); - printf("(%0x, %0x)", t2, p1 ^ (p1 & SKIP_BITS)); + printf("(%0X, %u)", t2, p1 & ~SKIP_BITS); if(skip--){ - printf("(%0x, %0x)", t2, p2 ^ (p2 & SKIP_BITS)); + printf("(%0X, %u)", t2, p2 & ~SKIP_BITS); while(skip-- ) - printf("(%0x, %0x)", t2, *p++); + printf("(%0X, %u)", t2, *p++); } }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN ); }
--- a/lshlib.h Sun Feb 08 15:53:57 2009 +0000 +++ b/lshlib.h Thu Nov 25 13:42:40 2010 +0000 @@ -393,7 +393,8 @@ float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; } char* get_indexName(){return indexName;} void dump_hashtables(); - + void dump_core_row(Uns32T n); + void dump_disk_row(char*, Uns32T n); }; typedef class G LSH;
--- a/query.cpp Sun Feb 08 15:53:57 2009 +0000 +++ b/query.cpp Thu Nov 25 13:42:40 2010 +0000 @@ -203,31 +203,35 @@ } int audiodb_track_id_datum_offset(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset, size_t num_vectors, adb_fd_cache_t* cache){ - off_t track_offset = (*adb->track_offsets)[track_id]; - if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) { /* create a reference/insert, then use adb_insert_create_datum() */ adb_reference_t *reference = NULL; if(! (cache && cache->reference) ){ reference = (adb_reference_t *) malloc(sizeof(adb_reference_t)); reference->features = (char*) malloc(ADB_MAXSTR*sizeof(char)); - if(adb->header->flags & ADB_HEADER_FLAG_POWER) + if(adb->header->flags & ADB_HEADER_FLAG_POWER) { reference->power = (char*) malloc(ADB_MAXSTR*sizeof(char)); - else + } + else{ reference->power = NULL; - if(adb->header->flags & ADB_HEADER_FLAG_TIMES) + } + if(adb->header->flags & ADB_HEADER_FLAG_TIMES){ reference->times = (char*)malloc(ADB_MAXSTR*sizeof(char)); - else + } + else{ reference->times = NULL; - if(cache) + } + if(cache){ cache->reference = reference; + } } - else + else{ reference = cache->reference; - + } if(! (cache && cache->track_id==track_id) ){ - if(cache) + if(cache){ cache->track_id = track_id; + } lseek(adb->fd, adb->header->dataOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); read_or_goto_error(adb->fd, (void *)reference->features, ADB_MAXSTR); if(adb->header->flags & ADB_HEADER_FLAG_POWER) { @@ -239,14 +243,15 @@ read_or_goto_error(adb->fd, (void *)reference->times, ADB_MAXSTR); } } - - int retval = audiodb_insert_create_datum_offset(reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double), cache); + int retval = audiodb_insert_create_datum_offset(reference, d, vector_offset, num_vectors, cache); if(!cache){ audiodb_free_datum_reference(reference); free(reference); } return retval; - } else { + } + else { + off_t track_offset = (*adb->track_offsets)[track_id]; /* initialize from sources of data that we already have */ if(num_vectors) d->nvectors = num_vectors; @@ -353,7 +358,7 @@ datum = spec->qid.datum; sequence_length = spec->qid.sequence_length; - sequence_start = spec->qid.sequence_start; + sequence_start = (spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) ? spec->qid.sequence_start : 0; if(datum->data) { if(datum->dim != adb->header->dim) { @@ -386,11 +391,11 @@ if(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) { /* the qpointers are already at the start, and so correct. */ } else { - /* adjust the qpointers to point to the correct place in the sequence */ - *vector = *vector_data + spec->qid.sequence_start * d.dim; - qpointers->l2norm = qpointers->l2norm_data + spec->qid.sequence_start; + /* qpointers are already at the start of the extracted sequence */ + *vector = *vector_data; + qpointers->l2norm = qpointers->l2norm_data; if(d.power) { - qpointers->power = qpointers->power_data + spec->qid.sequence_start; + qpointers->power = qpointers->power_data; } qpointers->nvectors = sequence_length; }