diff lshlib.cpp @ 296:f922c234462f

Fixed file size allocation for FORMAT2 files. Made the LSH index size() in bytes an unsigned long long. Renamed the lsh_inCore flag to lsh_on_disk (reversing the sense of the flag).
author mas01mc
date Fri, 01 Aug 2008 15:04:31 +0000
parents 071a108580a4
children b10ad7b6427f
line wrap: on
line diff
--- a/lshlib.cpp	Thu Jul 31 19:26:04 2008 +0000
+++ b/lshlib.cpp	Fri Aug 01 15:04:31 2008 +0000
@@ -2,7 +2,7 @@
 
 //#define __LSH_DUMP_CORE_TABLES__
 //#define USE_U_FUNCTIONS
-//#define LSH_BLOCK_FULL_ROWS
+#define LSH_BLOCK_FULL_ROWS
 
 void err(char*s){cout << s << endl;exit(2);}
 
@@ -53,12 +53,6 @@
     k++; // make sure k is even
     cout << "warning: setting k even" << endl;
   }
-
-  cout << "file size: ~" << (((unsigned long long)L*N*C*sizeof(SerialElementT))/1000000UL) << "MB" << endl;
-  if(((unsigned long long)L*N*C*sizeof(SerialElementT))>4000000000UL)
-    error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB");
-  else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL)
-    cout << "warning: hash tables exceed 1000MB." << endl;
   
   // We have the necessary parameters, so construct hashfunction datastructures
   initialize_lsh_functions();
@@ -338,7 +332,7 @@
 }
 
 Uns32T H::bucket_insert_point(bucket **pp){
-  Uns32T collisionCount = 0;
+  collisionCount = 0;
   if(!*pp){
     *pp = new bucket();
 #ifdef LSH_BLOCK_FULL_ROWS
@@ -355,8 +349,8 @@
     __bucket_insert_point((*pp)->next); // First bucket holds collision count
   }
 #else
-    pointCount++;
-    __bucket_insert_point(*pp); // No collision count storage
+  pointCount++;
+  __bucket_insert_point(*pp); // No collision count storage
 #endif
   return collisionCount;
 }
@@ -452,8 +446,9 @@
 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
   Uns32T collisionCount = 0;
   H::p = pp;
-  if(pp>H::maxp)
-    H::maxp=pp; // Store highest pointID in database
+  if(pp<=H::maxp)
+    error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
+  H::maxp=pp; // Store highest pointID in database
   H::compute_hash_functions( v );
   for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
     H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 
@@ -574,22 +569,30 @@
 
 // Serial header constructors
 SerialHeader::SerialHeader(){;}
-SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT):
+SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
   lshMagic(O2_SERIAL_MAGIC),
   binWidth(W),
   numTables(L),
   numRows(N),
   numCols(C),
   elementSize(O2_SERIAL_ELEMENT_SIZE),
-  version(O2_SERIAL_VERSION),
-  size(L * align_up(N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn())   // hash tables
-       + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
-		  L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
+  version(O2_SERIAL_VERSION),
+  size(0), // deprecated 32-bit size field; the file size now lives in size_long
   flags(FMT),
   dataDim(d),
   numFuns(k),
   radius(r),
-  maxp(p){;} // header
+  maxp(p),
+  size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn())   // hash tables
+	    + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
+		  (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
+  pointCount(pc){ // pc is stored in the header and sizes the FORMAT2 file below
+  // FORMAT2: header + hash functions (same per-function footprint as above) + 16 bytes per point, page-aligned
+  if(FMT==O2_SERIAL_FILEFORMAT2)
+    size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE 
+	     + (unsigned long long)L*k*(sizeof(float)*d+2*sizeof(Uns32T)
+	      +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
+} // header
 
 float* G::get_serial_hashfunction_base(char* db){
   if(db&&lshHeader)
@@ -620,7 +623,7 @@
   // Check requested serialFormat
   if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
     error("Unrecognized serial file format request: ", "serialize()");
-
+ 
   // Test to see if file exists
   if((dbfid = open (filename, O_RDONLY)) < 0)
     // If it doesn't, then create the file (CREATE)
@@ -676,12 +679,12 @@
   if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) 
       || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
       || !( this->w == that->binWidth &&
-	 this->L == that->numTables &&
-	 this->N == that->numRows &&
-	 this->k == that->numFuns &&
-	 this->d == that->dataDim &&
-	 sizeof(SerialElementT) == that->elementSize &&
-	 this->radius == that->radius)){
+	    this->L == that->numTables &&
+	    this->N == that->numRows &&
+	    this->k == that->numFuns &&
+	    this->d == that->dataDim &&
+	    sizeof(SerialElementT) == that->elementSize &&
+	    this->radius == that->radius)){
     serial_print_header(format);
     return 0;
   }
@@ -974,8 +977,12 @@
   get_lock(dbfid, 1);
   
   // Make header first to get size of serialized database
-  lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT);  
-  
+  lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);  
+
+  cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
+  if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
+    error("Maximum size of LSH file exceded: > 4000MB");
+
   // go to the location corresponding to the last byte
   if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
     error("lseek error in db file", "", "lseek");
@@ -985,11 +992,11 @@
     error("write error", "", "write");
   
   char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
-
+  
   memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
-
+  
   serial_munmap(db, O2_SERIAL_HEADER_SIZE);
-
+  
   close(dbfid);
 
   std::cout << "done initializing tables." << endl;
@@ -1069,6 +1076,7 @@
   H::w = lshHeader->binWidth;
   H::radius = lshHeader->radius;
   H::maxp = lshHeader->maxp;
+  H::pointCount = lshHeader->pointCount;
 
   return dbfid;
 }