# HG changeset patch # User mas01mc # Date 1233158537 0 # Node ID 469b50a3dd842ea1357e61dc1ab13432e45afc73 # Parent 83e37b76b483040baf8d96cd02d17fa51247bc59 Fixed a bug in LSH hashtable writing to disk that doesn't always sort the t2 entries into strict weak ordering. Now it does. Lots of debugging informational code inserted. diff -r 83e37b76b483 -r 469b50a3dd84 Makefile --- a/Makefile Wed Jan 28 05:18:14 2009 +0000 +++ b/Makefile Wed Jan 28 16:02:17 2009 +0000 @@ -17,7 +17,7 @@ MINORVERSION=0 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION) -override CFLAGS+=-O3 -g -fPIC +override CFLAGS+=-ggdb -g -fPIC # set to DUMP hashtables on QUERY load #override CFLAGS+=-DLSH_DUMP_CORE_TABLES diff -r 83e37b76b483 -r 469b50a3dd84 lshlib.cpp --- a/lshlib.cpp Wed Jan 28 05:18:14 2009 +0000 +++ b/lshlib.cpp Wed Jan 28 16:02:17 2009 +0000 @@ -431,7 +431,7 @@ __sbucket_insert_point(p->snext.ptr); return; } - + /* if(p->next){ // Construct list in t2 order if(H::t2 < p->next->t2){ @@ -440,9 +440,23 @@ p->next = tmp; __bucket_insert_point(tmp); } - else - __bucket_insert_point(p->next); + */ + // insert bucket before current bucket + if(H::t2 < p->t2){ + bucket* tmp = new bucket(); + // copy current bucket contents into new bucket + tmp->next = p->next; + tmp->t2 = p->t2; + tmp->snext.ptr = p->snext.ptr; + p->next = tmp; + p->t2 = IFLAG; + p->snext.ptr=0; + __bucket_insert_point(p); + return; } + + if(p->next) + __bucket_insert_point(p->next); else { p->next = new bucket(); __bucket_insert_point(p->next); @@ -658,6 +672,7 @@ // // T1 - T1 hash token // t1 - t1 hash key (t1 range 0..2^29-1) +// %buckets+points% numElements in row for ARRAY encoding // T2 - T2 token // t2 - t2 hash key (range 1..2^32-6) // p - point identifier (range 0..2^32-1) @@ -669,7 +684,7 @@ // {...}^L - repeat argument L times // // FORMAT2 Regular expression: -// { [T1 t1 [T2 t2 p+]+ ]* E }^L +// { [T1 t1 %buckets+points% [T2 t2 p+]+ ]* E }^L // // Serial header constructors @@ -1099,17 +1114,29 @@ } void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){ +#ifdef _LSH_DEBUG_ + Uns32T last_t2 = 0; +#endif while(b && b->t2!=IFLAG){ if(!b->snext.ptr){ fclose(dbFile); error("Empty collision chain in serial_write_hashtable_row_format2()"); } t2 = O2_SERIAL_TOKEN_T2; + if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){ fclose(dbFile); error("write error in serial_write_hashtable_row_format2()"); } t2 = b->t2; +#ifdef _LSH_DEBUG_ + if(t2 < last_t2){ + fclose(dbFile); + error("t2t2 = 0; // Use t2 as a collision counter for the row (*rowPP)->next = 0; #endif } bucket* rowPtr = *rowPP; - + Uns32T last_t2 = 0; READ_UNS32T(&(H::t2),"t2"); TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2"); // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements @@ -1550,8 +1583,12 @@ while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){ numPointsThisBucket = 0;// reset bucket point counter secondPtr = 0; // reset second-point pointer - TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2"); - READ_UNS32T(&(H::t2), "Read error t2"); + TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2"); + READ_UNS32T(&(H::t2), "Read error t2"); + if(H::t2 (ap); *listPtr = 0; +#ifdef _LSH_DEBUG_ + cout << " numBuckets=" << numBuckets << " numPoints=" << numPoints << " numElements(array) " << numBuckets+numPoints << " " << endl; +#endif H::tablesPointCount += numPoints; // Return current token return H::t2; // return H::t2 which holds current token [E or T1] diff -r 83e37b76b483 -r 469b50a3dd84 lshlib.h --- a/lshlib.h Wed Jan 28 05:18:14 2009 +0000 +++ b/lshlib.h Wed Jan 28 16:02:17 2009 +0000 @@ -94,7 +94,7 @@ #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\ fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);} -#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load +//#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load #define _LSH_DEBUG_ // turn on debugging information //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use