Mercurial > hg > audiodb
changeset 524:469b50a3dd84 multiprobeLSH
Fixed a bug in LSH hashtable writing to disk that didn't always sort the t2 entries into strict weak ordering. Now it does. Lots of informational debugging code was also inserted.
author | mas01mc |
---|---|
date | Wed, 28 Jan 2009 16:02:17 +0000 |
parents | 83e37b76b483 |
children | 11dd6eab15c8 |
files | Makefile lshlib.cpp lshlib.h |
diffstat | 3 files changed, 53 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Wed Jan 28 05:18:14 2009 +0000 +++ b/Makefile Wed Jan 28 16:02:17 2009 +0000 @@ -17,7 +17,7 @@ MINORVERSION=0 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION) -override CFLAGS+=-O3 -g -fPIC +override CFLAGS+=-ggdb -g -fPIC # set to DUMP hashtables on QUERY load #override CFLAGS+=-DLSH_DUMP_CORE_TABLES
--- a/lshlib.cpp Wed Jan 28 05:18:14 2009 +0000 +++ b/lshlib.cpp Wed Jan 28 16:02:17 2009 +0000 @@ -431,7 +431,7 @@ __sbucket_insert_point(p->snext.ptr); return; } - + /* if(p->next){ // Construct list in t2 order if(H::t2 < p->next->t2){ @@ -440,9 +440,23 @@ p->next = tmp; __bucket_insert_point(tmp); } - else - __bucket_insert_point(p->next); + */ + // insert bucket before current bucket + if(H::t2 < p->t2){ + bucket* tmp = new bucket(); + // copy current bucket contents into new bucket + tmp->next = p->next; + tmp->t2 = p->t2; + tmp->snext.ptr = p->snext.ptr; + p->next = tmp; + p->t2 = IFLAG; + p->snext.ptr=0; + __bucket_insert_point(p); + return; } + + if(p->next) + __bucket_insert_point(p->next); else { p->next = new bucket(); __bucket_insert_point(p->next); @@ -658,6 +672,7 @@ // // T1 - T1 hash token // t1 - t1 hash key (t1 range 0..2^29-1) +// %buckets+points% numElements in row for ARRAY encoding // T2 - T2 token // t2 - t2 hash key (range 1..2^32-6) // p - point identifier (range 0..2^32-1) @@ -669,7 +684,7 @@ // {...}^L - repeat argument L times // // FORMAT2 Regular expression: -// { [T1 t1 [T2 t2 p+]+ ]* E }^L +// { [T1 t1 %buckets+points% [T2 t2 p+]+ ]* E }^L // // Serial header constructors @@ -1099,17 +1114,29 @@ } void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){ +#ifdef _LSH_DEBUG_ + Uns32T last_t2 = 0; +#endif while(b && b->t2!=IFLAG){ if(!b->snext.ptr){ fclose(dbFile); error("Empty collision chain in serial_write_hashtable_row_format2()"); } t2 = O2_SERIAL_TOKEN_T2; + if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){ fclose(dbFile); error("write error in serial_write_hashtable_row_format2()"); } t2 = b->t2; +#ifdef _LSH_DEBUG_ + if(t2 < last_t2){ + fclose(dbFile); + error("t2<last_t2 in serial_write_hashtable_row_format2()"); + } + last_t2 = t2; +#endif + if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){ fclose(dbFile); error("write error in serial_write_hashtable_row_format2()"); @@ -1390,16 +1417,19 @@ 
error("Read error: numElements","unserialize_lsh_hashtables_format2()"); } +#ifdef _LSH_DEBUG_ + cout << "[" << x << "," << y << "] numElements(disk) = " << numElements; +#endif // BACKWARD COMPATIBILITY: check to see if T2 or END token was read if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){ forMerge=true; // Force use of dynamic linked list core format token = numElements; } - if(forMerge) + if(forMerge) // FOR INDEXING use dynamic linked list data structure // Use linked list CORE format token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token); - else + else // FOR QUERY use static array data structure // Use ARRAY CORE format with numElements counter token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements); #else @@ -1420,7 +1450,7 @@ H::t1 = token; } #ifdef _LSH_DEBUG_ - cout << "table " << x << " pointCount = " << tablesPointCount << endl; + cout << " pointCount = " << tablesPointCount << endl; sumTablesPointCount+=tablesPointCount; #endif } @@ -1494,7 +1524,7 @@ // // We store the values of numPoints and numBuckets in separate fields of the first bucket // rowPtr->t2 // numPoints -// (Uns32T)(rowPtr->snext) // numBuckets +// (rowPtr->snext.numBuckets) // numBuckets // // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*)) // To get to the fist bucket, we use @@ -1534,13 +1564,16 @@ // Initialize new row if(!*rowPP){ *rowPP = new bucket(); +#ifdef _LSH_DEBUG_ + cout << " () "; +#endif #ifdef LSH_LIST_HEAD_COUNTERS (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row (*rowPP)->next = 0; #endif } bucket* rowPtr = *rowPP; - + Uns32T last_t2 = 0; READ_UNS32T(&(H::t2),"t2"); TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2"); // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements @@ -1550,8 +1583,12 @@ while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){ 
numPointsThisBucket = 0;// reset bucket point counter secondPtr = 0; // reset second-point pointer - TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2"); - READ_UNS32T(&(H::t2), "Read error t2"); + TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2"); + READ_UNS32T(&(H::t2), "Read error t2"); + if(H::t2<last_t2) + cout << "last_t2=" << last_t2 << ", t2=" << H::t2 << endl; + TEST_TOKEN(H::t2<last_t2, "t2 tokens not in ascending order"); + last_t2 = H::t2; *ap++ = H::t2; // Insert t2 value into array numBuckets++; READ_UNS32T(&(H::p), "Read error H::p"); @@ -1586,6 +1623,9 @@ // Allocate a new dynamic list head at the end of the array bucket** listPtr = reinterpret_cast<bucket**> (ap); *listPtr = 0; +#ifdef _LSH_DEBUG_ + cout << " numBuckets=" << numBuckets << " numPoints=" << numPoints << " numElements(array) " << numBuckets+numPoints << " " << endl; +#endif H::tablesPointCount += numPoints; // Return current token return H::t2; // return H::t2 which holds current token [E or T1]
--- a/lshlib.h Wed Jan 28 05:18:14 2009 +0000 +++ b/lshlib.h Wed Jan 28 16:02:17 2009 +0000 @@ -94,7 +94,7 @@ #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\ fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);} -#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load +//#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load #define _LSH_DEBUG_ // turn on debugging information //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use