changeset 336:fe4d5b763086

converted read/write into fread/fwrite for LSH hashtable serialize and unserialize. INDEXING is now faster.
author mas01mc
date Fri, 05 Sep 2008 14:16:21 +0000
parents 69d5649d3e1c
children ccf15a8c3e46
files lshlib.cpp lshlib.h
diffstat 2 files changed, 72 insertions(+), 62 deletions(-) [+]
line wrap: on
line diff
--- a/lshlib.cpp	Tue Sep 02 17:10:59 2008 +0000
+++ b/lshlib.cpp	Fri Sep 05 14:16:21 2008 +0000
@@ -436,10 +436,13 @@
 
   // Format2 always needs unserializing
   if(lshHeader->flags&O2_SERIAL_FILEFORMAT2 && lshInCoreFlag){
-    unserialize_lsh_hashtables_format2(dbfid);
+    FILE* dbFile = fdopen(dbfid, "rb");
+    if(!dbFile)
+      error("Cannot open LSH file for reading", filename);
+    unserialize_lsh_hashtables_format2(dbFile);
   }
-
-  close(dbfid);}
+  serial_close(dbfid);
+}
 
 G::~G(){
   delete lshHeader;
@@ -658,10 +661,15 @@
   // Write the hashtables in the requested format
   if(serialFormat == O2_SERIAL_FILEFORMAT1)
     serialize_lsh_hashtables_format1(dbfid, !dbIsNew);
-  else
-    serialize_lsh_hashtables_format2(dbfid, !dbIsNew);
+  else{
+    FILE* dbFile = fdopen(dbfid, "r+b");
+    if(!dbFile)
+      error("Cannot open LSH file for writing",filename);
+    serialize_lsh_hashtables_format2(dbFile, !dbIsNew);
+    fflush(dbFile);
+  }
 
-  if(!dbIsNew){
+  if(!dbIsNew) { 
     db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
     //serial_get_header(db);           // read header
     cout << "maxp = " << H::maxp << endl;
@@ -671,9 +679,8 @@
       lshHeader->flags|=O2_SERIAL_FILEFORMAT1;
     memcpy((char*)db, (char*)lshHeader, sizeof(SerialHeaderT));
     serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
-  }
-
-  serial_close(dbfid);
+  }  
+    serial_close(dbfid);
 }
 
 // Test to see if core structure and requested format is
@@ -874,7 +881,7 @@
   return;
 }
 
-int G::serialize_lsh_hashtables_format2(int fid, int merge){
+int G::serialize_lsh_hashtables_format2(FILE* dbFile, int merge){
   Uns32T x,y;
 
   if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
@@ -882,10 +889,13 @@
 
   // We must pereform FORMAT2 merges in core
   if(merge)
-    unserialize_lsh_hashtables_format2(fid);
+    unserialize_lsh_hashtables_format2(dbFile);
 
   Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
-  lseek(fid, get_serial_hashtable_offset(), SEEK_SET);
+  if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
+    fclose(dbFile);
+    error("fSeek error in serialize_lsh_hashtables_format2");
+  }
 
   // Write the hash tables
   for( x = 0 ; x < H::L ; x++ ){
@@ -901,29 +911,29 @@
 	// Check for empty row (even though row was allocated)
 #ifdef LSH_BLOCK_FULL_ROWS
 	if(bPtr->next->t2==IFLAG){
-	  close(fid);
+	  fclose(dbFile);
 	  error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
 	}
 #else
 	if(bPtr->t2==IFLAG){
-	  close(fid);
+	  fclose(dbFile);
 	  error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
 	}
 #endif
 	t1 = O2_SERIAL_TOKEN_T1;
-	if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){
-	  close(fid);
+	if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){
+	  fclose(dbFile);
 	  error("write error in serial_write_hashtable_format2() [T1]");
 	}
 	t1 = y;
-	if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){
-	  close(fid);
+	if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){
+	  fclose(dbFile);
 	  error("write error in serial_write_hashtable_format2() [t1]");
 	}
 #ifdef LSH_BLOCK_FULL_ROWS
-	serial_write_hashtable_row_format2(fid, bPtr->next, colCount); // skip collision counter bucket
+	serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket
 #else
-      serial_write_hashtable_row_format2(fid, bPtr, colCount);
+      serial_write_hashtable_row_format2(dbFile, bPtr, colCount);
 #endif
       }
       if(colCount){
@@ -937,8 +947,8 @@
     }
     // Write END of table marker
     t1 = O2_SERIAL_TOKEN_ENDTABLE;
-    if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){
-      close(fid);
+    if( fwrite(&t1, sizeof(Uns32T), 1, dbFile ) != 1 ){
+      fclose(dbFile);
       error("write error in serial_write_hashtable_format2() [end]");
     }
 
@@ -952,31 +962,31 @@
   return 1;
 }
 
-void G::serial_write_hashtable_row_format2(int fid, bucket* b, Uns32T& colCount){
+void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
   while(b && b->t2!=IFLAG){
     if(!b->snext){
-      close(fid);
+      fclose(dbFile);
       error("Empty collision chain in serial_write_hashtable_row_format2()");
     }
     t2 = O2_SERIAL_TOKEN_T2;
-    if( write(fid, &t2, sizeof(Uns32T)) != sizeof(Uns32T) ){
-      close(fid);
+    if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
+      fclose(dbFile);
       error("write error in serial_write_hashtable_row_format2()");
     }
     t2 = b->t2;
-    if( write(fid, &t2, sizeof(Uns32T)) != sizeof(Uns32T) ){
-      close(fid);
+    if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
+      fclose(dbFile);
       error("write error in serial_write_hashtable_row_format2()");
     }
-    serial_write_element_format2(fid, b->snext, colCount);
+    serial_write_element_format2(dbFile, b->snext, colCount);
     b=b->next;
   }
 }
 
-void G::serial_write_element_format2(int fid, sbucket* sb, Uns32T& colCount){
+void G::serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount){
   while(sb){
-    if(write(fid, &sb->pointID, sizeof(Uns32T))!=sizeof(Uns32T)){
-      close(fid);
+    if(fwrite(&sb->pointID, sizeof(Uns32T), 1, dbFile) != 1){
+      fclose(dbFile);
       error("Write error in serial_write_element_format2()");
     }
     colCount++;
@@ -1197,19 +1207,19 @@
   }
 }
  
-void G::unserialize_lsh_hashtables_format2(int fid){
+void G::unserialize_lsh_hashtables_format2(FILE* dbFile){
   Uns32T x=0,y=0;
 
   // Seek to hashtable base offset
-  if(lseek(fid, get_serial_hashtable_offset(), SEEK_SET)!=get_serial_hashtable_offset()){
-    close(fid);
-    error("Seek error in unserialize_lsh_hashtables_format2");
+  if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
+    fclose(dbFile);
+    error("fSeek error in unserialize_lsh_hashtables_format2");
   }
 
   // Read the hash tables into core (structure is given in header) 
   while( x < H::L){
-    if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){
-      close(fid);
+    if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
+      fclose(dbFile);
       error("Read error","unserialize_lsh_hashtables_format2()");
     }
     if(H::t1==O2_SERIAL_TOKEN_ENDTABLE)
@@ -1218,19 +1228,19 @@
       while(y < H::N){
 	// Read a row and move file pointer to beginning of next row or _bittable
 	if(!(H::t1==O2_SERIAL_TOKEN_T1)){
-	  close(fid);
+	  fclose(dbFile);
 	  error("State matchine error T1","unserialize_lsh_hashtables_format2()");
 	}
-	if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){
-	  close(fid);
+	if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
+	  fclose(dbFile);
 	  error("Read error: t1","unserialize_lsh_hashtables_format2()");
 	}
 	y = H::t1;
 	if(y>=H::N){
-	  close(fid);
+	  fclose(dbFile);
 	  error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
 	}
-	Uns32T token = unserialize_hashtable_row_format2(fid, h[x]+y);
+	Uns32T token = unserialize_hashtable_row_format2(dbFile, h[x]+y);
 	
 #ifdef __LSH_DUMP_CORE_TABLES__
 	printf("C[%d,%d]", x, y);
@@ -1238,7 +1248,7 @@
 #endif
 	// Check that token is valid
 	if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
-	  close(fid);
+	  fclose(dbFile);
 	  error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()");
 	}
 	// Check for end of table flag
@@ -1253,37 +1263,37 @@
   }
 }
  
-Uns32T G::unserialize_hashtable_row_format2(int fid, bucket** b){
+Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b){
   bool pointFound = false;
-  if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){
-    close(fid);
+  if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
+    fclose(dbFile);
     error("Read error T2 token","unserialize_hashtable_row_format2");
   }
   if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
-    close(fid);
+    fclose(dbFile);
     error("State machine error: expected E or T2");
   }
   while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
     pointFound=false;
     // Check for T2 token
     if(H::t2!=O2_SERIAL_TOKEN_T2){
-      close(fid);
+      fclose(dbFile);
       error("State machine error T2 token", "unserialize_hashtable_row_format2()");
     }
     // Read t2 value
-    if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){
-      close(fid);
+    if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
+      fclose(dbFile);
       error("Read error t2","unserialize_hashtable_row_format2");
     }
-    if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){
-      close(fid);
+    if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
+      fclose(dbFile);
       error("Read error H::p","unserialize_hashtable_row_format2");
     }
     while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
       pointFound=true;
       bucket_insert_point(b);
-      if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){
-	close(fid);
+      if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
+	fclose(dbFile);
 	error("Read error H::p","unserialize_hashtable_row_format2");
       }
     }
@@ -1325,7 +1335,7 @@
   serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
 
   if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
-    close(dbfid);
+    serial_close(dbfid);
     error("serial_retrieve_point_set is for SERIAL_FILEFORMAT1 only");
   }
 
@@ -1362,7 +1372,7 @@
   serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
 
   if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
-    close(dbfid);
+    serial_close(dbfid);
     error("serial_retrieve_point is for SERIAL_FILEFORMAT1 only");
   }
 
--- a/lshlib.h	Tue Sep 02 17:10:59 2008 +0000
+++ b/lshlib.h	Fri Sep 05 14:16:21 2008 +0000
@@ -287,9 +287,9 @@
   int serial_can_merge(Uns32T requestedFormat); // Test to see whether core and on-disk structures are compatible
 
   // Functions to write hashtables to disk in format2 (optimized for in-core retrieval)
-  int serialize_lsh_hashtables_format2(int fid, int merge);
-  void serial_write_hashtable_row_format2(int fid, bucket* h, Uns32T& colCount);
-  void serial_write_element_format2(int fid, sbucket* sb, Uns32T& colCount);
+  int serialize_lsh_hashtables_format2(FILE* dbFile, int merge);
+  void serial_write_hashtable_row_format2(FILE* dbFile, bucket* h, Uns32T& colCount);
+  void serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount);
 
   // Functions to read serial header and hash functions (format1 and format2)
   int unserialize_lsh_header(char* filename);            // read lsh header from disk into core
@@ -300,8 +300,8 @@
   void unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b); // read lsh hash table row into core
 
   // Functions to read hashtables in format2
-  void unserialize_lsh_hashtables_format2(int fid);       // read FORMAT2 hash tables into core (core format)
-  Uns32T unserialize_hashtable_row_format2(int fid, bucket** b); // read lsh hash table row into core
+  void unserialize_lsh_hashtables_format2(FILE* dbFile);       // read FORMAT2 hash tables into core (core format)
+  Uns32T unserialize_hashtable_row_format2(FILE* dbFile, bucket** b); // read lsh hash table row into core
 
   // Helper functions
   void serial_print_header(Uns32T requestedFormat);