changeset 296:f922c234462f

fixed file size allocation for FORMAT2 files. Made LSH index size() in bytes an unsigned long long. Changed the name of lsh_inCore flag to lsh_on_disk (to reverse the sense of the 'flag').
author mas01mc
date Fri, 01 Aug 2008 15:04:31 +0000
parents 9347d74a2578
children 7907b50d0995
files audioDB.cpp gengetopt.in index.cpp lshlib.cpp lshlib.h
diffstat 5 files changed, 54 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB.cpp	Thu Jul 31 19:26:04 2008 +0000
+++ b/audioDB.cpp	Fri Aug 01 15:04:31 2008 +0000
@@ -375,7 +375,7 @@
     dbName=args_info.database_arg;
 
     // Whether to store LSH hash tables for query in core (FORMAT2)
-    lsh_in_core = args_info.lsh_inCore_flag;
+    lsh_in_core = args_info.lsh_on_disk_flag; // This flag is set to 0 if on_disk requested
 
     lsh_param_w = args_info.lsh_w_arg;
     if(!(lsh_param_w>0 && lsh_param_w<=O2_SERIAL_MAX_BINWIDTH))
@@ -397,7 +397,9 @@
     if(!(lsh_param_b>0 && lsh_param_b<=O2_SERIAL_MAX_TRACKBATCH))
       error("Indexing parameter b out of range (1 <= b <= 10000)");
     
-    lsh_param_ncols = args_info.lsh_ncols_arg;
+    lsh_param_ncols = args_info.lsh_ncols_arg;    
+    if(lsh_in_core) // We don't want to block rows with FORMAT2 indexing
+      lsh_param_ncols = O2_SERIAL_MAX_COLS;
     if( !(lsh_param_ncols>0 && lsh_param_ncols<=O2_SERIAL_MAX_COLS))
       error("Indexing parameter ncols out of range (1 <= ncols <= 1000");
 
@@ -464,8 +466,8 @@
         error("queryPoint out of range: 0 <= queryPoint <= 10000");
     }
 
-    // Whether to pre-load LSH hash tables for query
-    lsh_in_core = args_info.lsh_inCore_flag;
+    // Whether to pre-load LSH hash tables for query (default on, if flag set then off)
+    lsh_in_core = args_info.lsh_on_disk_flag;
 
     // Whether to perform exact evaluation of points returned by LSH
     lsh_exact = args_info.lsh_exact_flag;
--- a/gengetopt.in	Thu Jul 31 19:26:04 2008 +0000
+++ b/gengetopt.in	Fri Aug 01 15:04:31 2008 +0000
@@ -69,7 +69,7 @@
 option "lsh_b" - "number of tracks per indexing iteration" int typestr="size" default="500" dependon="INDEX" optional
 option "lsh_ncols" - "number of columns (collisions) to allocate in LSH serialization" int typestr="size" default="250" dependon="INDEX" optional
 option "lsh_exact" - "use exact evaluation of points retrieved by LSH." flag off dependon="QUERY" optional
-option "lsh_inCore" - "LSH hash tables resident in core (INDEX/QUERY)" flag on optional
+option "lsh_on_disk" - "Construct LSH hash tables for on-disk query (INDEX/QUERY)" flag on optional
 option "lsh_use_u_functions" - "use m independent hash functions combinatorically to approximate L independent hash functions." flag off optional
 
 section "Normalization control parameters" sectiondesc="These parameters control the normalization of feaures at query time\n"
--- a/index.cpp	Thu Jul 31 19:26:04 2008 +0000
+++ b/index.cpp	Fri Aug 01 15:04:31 2008 +0000
@@ -122,6 +122,7 @@
   printf("INDEX: lsh_k %d\n", lsh_param_k);
   printf("INDEX: lsh_m %d\n", lsh_param_m);
   printf("INDEX: lsh_N %d\n", lsh_param_N);
+  printf("INDEX: lsh_C %d\n", lsh_param_ncols);
   printf("INDEX: lsh_b %d\n", lsh_param_b);
   printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); 
   fflush(stdout);
--- a/lshlib.cpp	Thu Jul 31 19:26:04 2008 +0000
+++ b/lshlib.cpp	Fri Aug 01 15:04:31 2008 +0000
@@ -2,7 +2,7 @@
 
 //#define __LSH_DUMP_CORE_TABLES__
 //#define USE_U_FUNCTIONS
-//#define LSH_BLOCK_FULL_ROWS
+#define LSH_BLOCK_FULL_ROWS
 
 void err(char*s){cout << s << endl;exit(2);}
 
@@ -53,12 +53,6 @@
     k++; // make sure k is even
     cout << "warning: setting k even" << endl;
   }
-
-  cout << "file size: ~" << (((unsigned long long)L*N*C*sizeof(SerialElementT))/1000000UL) << "MB" << endl;
-  if(((unsigned long long)L*N*C*sizeof(SerialElementT))>4000000000UL)
-    error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB");
-  else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL)
-    cout << "warning: hash tables exceed 1000MB." << endl;
   
   // We have the necessary parameters, so construct hashfunction datastructures
   initialize_lsh_functions();
@@ -338,7 +332,7 @@
 }
 
 Uns32T H::bucket_insert_point(bucket **pp){
-  Uns32T collisionCount = 0;
+  collisionCount = 0;
   if(!*pp){
     *pp = new bucket();
 #ifdef LSH_BLOCK_FULL_ROWS
@@ -355,8 +349,8 @@
     __bucket_insert_point((*pp)->next); // First bucket holds collision count
   }
 #else
-    pointCount++;
-    __bucket_insert_point(*pp); // No collision count storage
+  pointCount++;
+  __bucket_insert_point(*pp); // No collision count storage
 #endif
   return collisionCount;
 }
@@ -452,8 +446,9 @@
 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
   Uns32T collisionCount = 0;
   H::p = pp;
-  if(pp>H::maxp)
-    H::maxp=pp; // Store highest pointID in database
+  if(pp<=H::maxp)
+    error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
+  H::maxp=pp; // Store highest pointID in database
   H::compute_hash_functions( v );
   for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
     H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 
@@ -574,22 +569,30 @@
 
 // Serial header constructors
 SerialHeader::SerialHeader(){;}
-SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT):
+SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
   lshMagic(O2_SERIAL_MAGIC),
   binWidth(W),
   numTables(L),
   numRows(N),
   numCols(C),
   elementSize(O2_SERIAL_ELEMENT_SIZE),
-  version(O2_SERIAL_VERSION),
-  size(L * align_up(N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn())   // hash tables
-       + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
-		  L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
+  version(O2_SERIAL_VERSION),  
+  size(0), // we are deprecating this value
   flags(FMT),
   dataDim(d),
   numFuns(k),
   radius(r),
-  maxp(p){;} // header
+  maxp(p),
+  size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn())   // hash tables
+	    + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
+		  (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
+  pointCount(pc){
+  
+  if(FMT==O2_SERIAL_FILEFORMAT2)
+    size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE 
+	     + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T)
+	      +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
+} // header
 
 float* G::get_serial_hashfunction_base(char* db){
   if(db&&lshHeader)
@@ -620,7 +623,7 @@
   // Check requested serialFormat
   if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
     error("Unrecognized serial file format request: ", "serialize()");
-
+ 
   // Test to see if file exists
   if((dbfid = open (filename, O_RDONLY)) < 0)
     // If it doesn't, then create the file (CREATE)
@@ -676,12 +679,12 @@
   if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) 
       || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
       || !( this->w == that->binWidth &&
-	 this->L == that->numTables &&
-	 this->N == that->numRows &&
-	 this->k == that->numFuns &&
-	 this->d == that->dataDim &&
-	 sizeof(SerialElementT) == that->elementSize &&
-	 this->radius == that->radius)){
+	    this->L == that->numTables &&
+	    this->N == that->numRows &&
+	    this->k == that->numFuns &&
+	    this->d == that->dataDim &&
+	    sizeof(SerialElementT) == that->elementSize &&
+	    this->radius == that->radius)){
     serial_print_header(format);
     return 0;
   }
@@ -974,8 +977,12 @@
   get_lock(dbfid, 1);
   
   // Make header first to get size of serialized database
-  lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT);  
-  
+  lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);  
+
+  cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
+  if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
+    error("Maximum size of LSH file exceded: > 4000MB");
+
   // go to the location corresponding to the last byte
   if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
     error("lseek error in db file", "", "lseek");
@@ -985,11 +992,11 @@
     error("write error", "", "write");
   
   char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
-
+  
   memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
-
+  
   serial_munmap(db, O2_SERIAL_HEADER_SIZE);
-
+  
   close(dbfid);
 
   std::cout << "done initializing tables." << endl;
@@ -1069,6 +1076,7 @@
   H::w = lshHeader->binWidth;
   H::radius = lshHeader->radius;
   H::maxp = lshHeader->maxp;
+  H::pointCount = lshHeader->pointCount;
 
   return dbfid;
 }
--- a/lshlib.h	Thu Jul 31 19:26:04 2008 +0000
+++ b/lshlib.h	Fri Aug 01 15:04:31 2008 +0000
@@ -59,10 +59,11 @@
 #define O2_SERIAL_ELEMENT_SIZE sizeof(SerialElementT)
 #define O2_SERIAL_MAX_TABLES (200)
 #define O2_SERIAL_MAX_ROWS (1000000)
-#define O2_SERIAL_MAX_COLS (1000)
+#define O2_SERIAL_MAX_COLS (100000)
 #define O2_SERIAL_MAX_DIM (2000)
 #define O2_SERIAL_MAX_FUNS (100)
 #define O2_SERIAL_MAX_BINWIDTH (200)
+#define O2_SERIAL_MAXFILESIZE (4000000000UL)
 
 // Flags for Serial Header
 #define O2_SERIAL_FILEFORMAT1 (0x1U)       // Optimize for on-disk search
@@ -118,12 +119,11 @@
   Uns32T numFuns;     // number of independent hash functions
   float radius;       // 32-bit floating point radius
   Uns32T maxp;        // number of unique IDs in the database
-  Uns32T value_14;    // spare value
-  Uns32T value_15;    // spare value
-  Uns32T value_16;    // spare value
+  unsigned long long size_long; // long version of size
+  Uns32T pointCount;     // number of points in the database
 
   SerialHeader();
-  SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float radius, Uns32T p, Uns32T FMT);
+  SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float radius, Uns32T p, Uns32T FMT, Uns32T pointCount);
 
   float get_binWidth(){return binWidth;}  
   Uns32T get_numTables(){return numTables;}
@@ -132,10 +132,11 @@
   Uns32T get_elementSize(){return elementSize;}
   Uns32T get_version(){return version;}
   Uns32T get_flags(){return flags;}
-  Uns32T get_size(){return size;}
+  unsigned long long get_size(){return size_long;}
   Uns32T get_dataDim(){return dataDim;}
   Uns32T get_numFuns(){return numFuns;}
   Uns32T get_maxp(){return maxp;}
+  Uns32T get_pointCount(){return pointCount;}
 };
 
 #define IFLAG 0xFFFFFFFF
@@ -200,6 +201,7 @@
   Uns32T maxp; // highest pointID stored in database
   Uns32T bucketCount;  // count of number of point buckets allocated
   Uns32T pointCount;    // count of number of points inserted
+  Uns32T collisionCount; // number of points collided in a hash-table row
 
   Uns32T t1;       // first hash table key
   Uns32T t2;       // second hash table key