changeset 524:469b50a3dd84 multiprobeLSH

Fixed a bug in LSH hashtable writing to disk that doesn't always sort the t2 entries into strict weak ordering. Now it does. Lots of debugging informational code inserted.
author mas01mc
date Wed, 28 Jan 2009 16:02:17 +0000
parents 83e37b76b483
children 11dd6eab15c8
files Makefile lshlib.cpp lshlib.h
diffstat 3 files changed, 53 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Wed Jan 28 05:18:14 2009 +0000
+++ b/Makefile	Wed Jan 28 16:02:17 2009 +0000
@@ -17,7 +17,7 @@
 MINORVERSION=0
 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION)
 
-override CFLAGS+=-O3 -g -fPIC
+override CFLAGS+=-ggdb -g -fPIC
 
 # set to DUMP hashtables on QUERY load
 #override CFLAGS+=-DLSH_DUMP_CORE_TABLES
--- a/lshlib.cpp	Wed Jan 28 05:18:14 2009 +0000
+++ b/lshlib.cpp	Wed Jan 28 16:02:17 2009 +0000
@@ -431,7 +431,7 @@
     __sbucket_insert_point(p->snext.ptr);
     return;
   }
-
+  /*
   if(p->next){
     // Construct list in t2 order
     if(H::t2 < p->next->t2){
@@ -440,9 +440,23 @@
       p->next = tmp;
       __bucket_insert_point(tmp);
     }
-    else
-      __bucket_insert_point(p->next);
+  */
+  // insert bucket before current bucket
+  if(H::t2 < p->t2){
+      bucket* tmp = new bucket();
+      // copy current bucket contents into new bucket
+      tmp->next = p->next;
+      tmp->t2 = p->t2;
+      tmp->snext.ptr = p->snext.ptr;
+      p->next = tmp;
+      p->t2 = IFLAG;
+      p->snext.ptr=0;
+      __bucket_insert_point(p);
+      return;
   }
+
+  if(p->next)
+    __bucket_insert_point(p->next);  
   else {
     p->next = new bucket();
     __bucket_insert_point(p->next);
@@ -658,6 +672,7 @@
 //
 // T1 - T1 hash token 
 // t1 - t1 hash key (t1 range 0..2^29-1) 
+// %buckets+points% numElements in row for ARRAY encoding
 // T2 - T2 token
 // t2 - t2 hash key (range 1..2^32-6)
 // p  - point identifier (range 0..2^32-1)
@@ -669,7 +684,7 @@
 // {...}^L - repeat argument L times
 //
 // FORMAT2 Regular expression:
-// { [T1 t1 [T2 t2 p+]+ ]* E }^L
+// { [T1 t1 %buckets+points% [T2 t2 p+]+ ]* E }^L
 //
 
 // Serial header constructors
@@ -1099,17 +1114,29 @@
 }
 
 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
+#ifdef _LSH_DEBUG_
+  Uns32T last_t2 = 0;
+#endif
   while(b && b->t2!=IFLAG){
     if(!b->snext.ptr){
       fclose(dbFile);
       error("Empty collision chain in serial_write_hashtable_row_format2()");
     }
     t2 = O2_SERIAL_TOKEN_T2;
+
     if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
       fclose(dbFile);
       error("write error in serial_write_hashtable_row_format2()");
     }
     t2 = b->t2;
+#ifdef _LSH_DEBUG_
+    if(t2 < last_t2){
+      fclose(dbFile);
+      error("t2<last_t2 in serial_write_hashtable_row_format2()");
+    }
+    last_t2 = t2;
+#endif      
+
     if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
       fclose(dbFile);
       error("write error in serial_write_hashtable_row_format2()");
@@ -1390,16 +1417,19 @@
 	  error("Read error: numElements","unserialize_lsh_hashtables_format2()");
 	}
 	
+#ifdef _LSH_DEBUG_
+	cout << "[" << x << "," << y << "] numElements(disk) = " << numElements;
+#endif    	
 	// BACKWARD COMPATIBILITY: check to see if T2 or END token was read
 	if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){
 	  forMerge=true; // Force use of dynamic linked list core format
 	  token = numElements;
 	}
 
-	if(forMerge)
+	if(forMerge) // FOR INDEXING use dynamic linked list data structure
 	  // Use linked list CORE format
 	  token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token);
-	else
+	else // FOR QUERY use static array data structure
 	  // Use ARRAY CORE format with numElements counter
 	  token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements);
 #else
@@ -1420,7 +1450,7 @@
 	  H::t1 = token;
       }
 #ifdef _LSH_DEBUG_
-    cout << "table " << x << " pointCount = " << tablesPointCount << endl;
+    cout << " pointCount = " << tablesPointCount << endl;
     sumTablesPointCount+=tablesPointCount;
 #endif    
   }
@@ -1494,7 +1524,7 @@
 //
 // We store the values of numPoints and numBuckets in separate fields of the first bucket
 // rowPtr->t2 // numPoints
-// (Uns32T)(rowPtr->snext) // numBuckets
+// (rowPtr->snext.numBuckets) // numBuckets
 //
 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*))
 // To get to the fist bucket, we use 
@@ -1534,13 +1564,16 @@
   // Initialize new row
   if(!*rowPP){
     *rowPP = new bucket();
+#ifdef _LSH_DEBUG_
+    cout << " () ";
+#endif
 #ifdef LSH_LIST_HEAD_COUNTERS
     (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row
     (*rowPP)->next = 0;
 #endif
   }
   bucket* rowPtr = *rowPP;
-
+  Uns32T last_t2 = 0;
   READ_UNS32T(&(H::t2),"t2");
   TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2");
   // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements
@@ -1550,8 +1583,12 @@
   while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){
     numPointsThisBucket = 0;// reset bucket point counter
     secondPtr = 0; // reset second-point pointer
-    TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");
-    READ_UNS32T(&(H::t2), "Read error t2");
+    TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");    
+    READ_UNS32T(&(H::t2), "Read error t2");    
+    if(H::t2<last_t2)
+      cout << "last_t2=" << last_t2 << ", t2=" << H::t2 << endl;
+    TEST_TOKEN(H::t2<last_t2, "t2 tokens not in ascending order");
+    last_t2 = H::t2;
     *ap++ = H::t2; // Insert t2 value into array
     numBuckets++;
     READ_UNS32T(&(H::p), "Read error H::p"); 
@@ -1586,6 +1623,9 @@
   // Allocate a new dynamic list head at the end of the array
   bucket** listPtr = reinterpret_cast<bucket**> (ap);
   *listPtr = 0;
+#ifdef _LSH_DEBUG_
+  cout << " numBuckets=" << numBuckets << " numPoints=" << numPoints << " numElements(array) " << numBuckets+numPoints << " " << endl;
+#endif
   H::tablesPointCount += numPoints;
   // Return current token
   return H::t2; // return H::t2 which holds current token [E or T1]
--- a/lshlib.h	Wed Jan 28 05:18:14 2009 +0000
+++ b/lshlib.h	Wed Jan 28 16:02:17 2009 +0000
@@ -94,7 +94,7 @@
 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\
   fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);}	
 
-#define LSH_DUMP_CORE_TABLES  // set to dump hashtables on load
+//#define LSH_DUMP_CORE_TABLES  // set to dump hashtables on load
 #define _LSH_DEBUG_             // turn on debugging information
 
 //#define USE_U_FUNCTIONS       // set to use partial hashfunction re-use