# HG changeset patch
# User mas01mc
# Date 1233158537 0
# Node ID 469b50a3dd842ea1357e61dc1ab13432e45afc73
# Parent  83e37b76b483040baf8d96cd02d17fa51247bc59
Fixed a bug in LSH hashtable writing to disk that doesn't always sort the t2 entries into strict weak ordering. Now it does. Lots of debugging informational code inserted.

diff -r 83e37b76b483 -r 469b50a3dd84 Makefile
--- a/Makefile	Wed Jan 28 05:18:14 2009 +0000
+++ b/Makefile	Wed Jan 28 16:02:17 2009 +0000
@@ -17,7 +17,7 @@
 MINORVERSION=0
 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION)
 
-override CFLAGS+=-O3 -g -fPIC
+override CFLAGS+=-ggdb -g -fPIC
 
 # set to DUMP hashtables on QUERY load
 #override CFLAGS+=-DLSH_DUMP_CORE_TABLES
diff -r 83e37b76b483 -r 469b50a3dd84 lshlib.cpp
--- a/lshlib.cpp	Wed Jan 28 05:18:14 2009 +0000
+++ b/lshlib.cpp	Wed Jan 28 16:02:17 2009 +0000
@@ -431,7 +431,7 @@
     __sbucket_insert_point(p->snext.ptr);
     return;
   }
-
+  /*
   if(p->next){
     // Construct list in t2 order
     if(H::t2 < p->next->t2){
@@ -440,9 +440,23 @@
       p->next = tmp;
       __bucket_insert_point(tmp);
     }
-    else
-      __bucket_insert_point(p->next);
+  */
+  // insert bucket before current bucket
+  if(H::t2 < p->t2){
+      bucket* tmp = new bucket();
+      // copy current bucket contents into new bucket
+      tmp->next = p->next;
+      tmp->t2 = p->t2;
+      tmp->snext.ptr = p->snext.ptr;
+      p->next = tmp;
+      p->t2 = IFLAG;
+      p->snext.ptr=0;
+      __bucket_insert_point(p);
+      return;
   }
+
+  if(p->next)
+    __bucket_insert_point(p->next);  
   else {
     p->next = new bucket();
     __bucket_insert_point(p->next);
@@ -658,6 +672,7 @@
 //
 // T1 - T1 hash token 
 // t1 - t1 hash key (t1 range 0..2^29-1) 
+// %buckets+points% numElements in row for ARRAY encoding
 // T2 - T2 token
 // t2 - t2 hash key (range 1..2^32-6)
 // p  - point identifier (range 0..2^32-1)
@@ -669,7 +684,7 @@
 // {...}^L - repeat argument L times
 //
 // FORMAT2 Regular expression:
-// { [T1 t1 [T2 t2 p+]+ ]* E }^L
+// { [T1 t1 %buckets+points% [T2 t2 p+]+ ]* E }^L
 //
 
 // Serial header constructors
@@ -1099,17 +1114,29 @@
 }
 
 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
+#ifdef _LSH_DEBUG_
+  Uns32T last_t2 = 0;
+#endif
   while(b && b->t2!=IFLAG){
     if(!b->snext.ptr){
       fclose(dbFile);
       error("Empty collision chain in serial_write_hashtable_row_format2()");
     }
     t2 = O2_SERIAL_TOKEN_T2;
+
     if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
       fclose(dbFile);
       error("write error in serial_write_hashtable_row_format2()");
     }
     t2 = b->t2;
+#ifdef _LSH_DEBUG_
+    if(t2 < last_t2){
+      fclose(dbFile);
+      error("t2<last_t2 in serial_write_hashtable_row_format2()");
+    }
+    last_t2 = t2;
+#endif      
+
     if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
       fclose(dbFile);
       error("write error in serial_write_hashtable_row_format2()");
@@ -1390,16 +1417,19 @@
 	  error("Read error: numElements","unserialize_lsh_hashtables_format2()");
 	}
 	
+#ifdef _LSH_DEBUG_
+	cout << "[" << x << "," << y << "] numElements(disk) = " << numElements;
+#endif    	
 	// BACKWARD COMPATIBILITY: check to see if T2 or END token was read
 	if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){
 	  forMerge=true; // Force use of dynamic linked list core format
 	  token = numElements;
 	}
 
-	if(forMerge)
+	if(forMerge) // FOR INDEXING use dynamic linked list data structure
 	  // Use linked list CORE format
 	  token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token);
-	else
+	else // FOR QUERY use static array data structure
 	  // Use ARRAY CORE format with numElements counter
 	  token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements);
 #else
@@ -1420,7 +1450,7 @@
 	  H::t1 = token;
       }
 #ifdef _LSH_DEBUG_
-    cout << "table " << x << " pointCount = " << tablesPointCount << endl;
+    cout << " pointCount = " << tablesPointCount << endl;
     sumTablesPointCount+=tablesPointCount;
 #endif    
   }
@@ -1494,7 +1524,7 @@
 //
 // We store the values of numPoints and numBuckets in separate fields of the first bucket
 // rowPtr->t2 // numPoints
-// (Uns32T)(rowPtr->snext) // numBuckets
+// (rowPtr->snext.numBuckets) // numBuckets
 //
 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*))
 // To get to the fist bucket, we use 
@@ -1534,13 +1564,16 @@
   // Initialize new row
   if(!*rowPP){
     *rowPP = new bucket();
+#ifdef _LSH_DEBUG_
+    cout << " () ";
+#endif
 #ifdef LSH_LIST_HEAD_COUNTERS
     (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row
     (*rowPP)->next = 0;
 #endif
   }
   bucket* rowPtr = *rowPP;
-
+  Uns32T last_t2 = 0;
   READ_UNS32T(&(H::t2),"t2");
   TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2");
   // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements
@@ -1550,8 +1583,12 @@
   while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){
     numPointsThisBucket = 0;// reset bucket point counter
     secondPtr = 0; // reset second-point pointer
-    TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");
-    READ_UNS32T(&(H::t2), "Read error t2");
+    TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");    
+    READ_UNS32T(&(H::t2), "Read error t2");    
+    if(H::t2<last_t2)
+      cout << "last_t2=" << last_t2 << ", t2=" << H::t2 << endl;
+    TEST_TOKEN(H::t2<last_t2, "t2 tokens not in ascending order");
+    last_t2 = H::t2;
     *ap++ = H::t2; // Insert t2 value into array
     numBuckets++;
     READ_UNS32T(&(H::p), "Read error H::p"); 
@@ -1586,6 +1623,9 @@
   // Allocate a new dynamic list head at the end of the array
   bucket** listPtr = reinterpret_cast<bucket**> (ap);
   *listPtr = 0;
+#ifdef _LSH_DEBUG_
+  cout << " numBuckets=" << numBuckets << " numPoints=" << numPoints << " numElements(array) " << numBuckets+numPoints << " " << endl;
+#endif
   H::tablesPointCount += numPoints;
   // Return current token
   return H::t2; // return H::t2 which holds current token [E or T1]
diff -r 83e37b76b483 -r 469b50a3dd84 lshlib.h
--- a/lshlib.h	Wed Jan 28 05:18:14 2009 +0000
+++ b/lshlib.h	Wed Jan 28 16:02:17 2009 +0000
@@ -94,7 +94,7 @@
 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\
   fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);}	
 
-#define LSH_DUMP_CORE_TABLES  // set to dump hashtables on load
+//#define LSH_DUMP_CORE_TABLES  // set to dump hashtables on load
 #define _LSH_DEBUG_             // turn on debugging information
 
 //#define USE_U_FUNCTIONS       // set to use partial hashfunction re-use