diff insert.cpp @ 324:c93be2f3a674

Merge of branches/large_adb -r 514:524 onto the trunk. No conflicts. Added LARGE_ADB support. Turn on with --ntracks 20001 or greater. Use --adb_feature_root to locate feature files at QUERY time. A bug fix in LSH indexing that was incorrectly thresholding large numbers of shingles.
author mas01mc
date Thu, 21 Aug 2008 21:28:33 +0000
parents 74824093c1c4
children 521812d63516
line wrap: on
line diff
--- a/insert.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/insert.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -11,7 +11,7 @@
 }
 
 bool audioDB::enough_data_space_free(off_t size) {
-  return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
+    return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
 }
 
 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
@@ -23,6 +23,9 @@
   forWrite = true;
   initTables(dbName, inFile);
 
+  if(dbH->flags & O2_FLAG_LARGE_ADB)
+    error("Single-feature inserts not allowed with LARGE audioDB instances");
+
   if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
     error("Must use timestamps with timestamped database","use --times");
 
@@ -49,6 +52,7 @@
 
   if(alreadyInserted) {
     VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
+    // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08
     return;
   }
   
@@ -64,7 +68,7 @@
     return;
   }
 
-  strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key));
+  INSERT_FILETABLE_STRING(fileTable, key);
 
   off_t insertoffset = dbH->length;// Store current state
 
@@ -153,14 +157,14 @@
 }
 
 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
-  if (usingPower) {
+  if(usingPower){
     if (!(dbH->flags & O2_FLAG_POWER)) {
       error("Cannot insert power data on non-power DB", dbName);
     }
-
+    
     int one;
     unsigned int count;
-
+    
     count = read(powerfd, &one, sizeof(unsigned int));
     if (count != sizeof(unsigned int)) {
       error("powerfd read failed", "int", "read");
@@ -168,7 +172,7 @@
     if (one != 1) {
       error("dimensionality of power file not 1", powerFileName);
     }
-
+    
     // FIXME: should check that the powerfile is the right size for
     // this.  -- CSR, 2007-10-30
     count = read(powerfd, powerdata, numVectors * sizeof(double));
@@ -183,6 +187,12 @@
   forWrite = true;
   initDBHeader(dbName);
 
+  // Treat large ADB instances differently
+  if( dbH->flags & O2_FLAG_LARGE_ADB ){
+    batchinsert_large_adb(dbName, inFile) ;
+    return;
+  }
+    
   if(!key)
     key=inFile;
   std::ifstream *filesIn = 0;
@@ -289,8 +299,9 @@
             close(thispowerfd);
           }
         }
-	strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey));
-  
+
+	INSERT_FILETABLE_STRING(fileTable, thisKey);
+
 	off_t insertoffset = dbH->length;// Store current state
 
 	// Increment file count
@@ -301,7 +312,7 @@
   
 	// Update track to file index map
 	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
-	
+
 	insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
 	
 	// Norm the vectors on input if the database is already L2 normed
@@ -334,3 +345,171 @@
   // Report status
   status(dbName);
 }
+
+
+// BATCHINSERT_LARGE_ADB
+//
+// This method inserts file pointers into the ADB instance rather than the actual feature data
+//
+// This method is intended for databases that are large enough to only support indexed query
+// So exhaustive searching across all feature vectors will not be performed
+//
+// We insert featureFileName, [powerFileName], [timesFileName]
+//
+// l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time
+//
+// LIMITS:
+//
+// We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles
+//
+void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) {
+
+  if(!key)
+    key=inFile;
+  std::ifstream *filesIn = 0;
+  std::ifstream *keysIn = 0;
+  std::ifstream* thisTimesFile = 0;
+  int thispowerfd = 0;
+
+  if(!(filesIn = new std::ifstream(inFile)))
+    error("Could not open batch in file", inFile);
+  if(key && key!=inFile)
+    if(!(keysIn = new std::ifstream(key)))
+      error("Could not open batch key file",key);
+  
+  if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
+    error("Must use timestamps with timestamped database","use --times");
+
+  if(!usingPower && (dbH->flags & O2_FLAG_POWER))
+    error("Must use power with power-enabled database", dbName);
+
+  unsigned totalVectors=0;
+  char *thisFile = new char[MAXSTR];
+  char *thisKey = 0;
+  if (key && (key != inFile)) {
+    thisKey = new char[MAXSTR];
+  }
+  char *thisTimesFileName = new char[MAXSTR];
+  char *thisPowerFileName = new char[MAXSTR];
+
+  std::set<std::string> s;
+
+  for (unsigned k = 0; k < dbH->numFiles; k++) {
+    s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
+  }
+
+  do {
+    filesIn->getline(thisFile,MAXSTR);
+    if(key && key!=inFile) {
+      keysIn->getline(thisKey,MAXSTR);
+    } else {
+      thisKey = thisFile;
+    }
+    if(usingTimes) {
+      timesFile->getline(thisTimesFileName,MAXSTR);
+    }
+    if(usingPower) {
+      powerFile->getline(thisPowerFileName, MAXSTR);
+    }
+    
+    if(filesIn->eof()) {
+      break;
+    }
+    
+    initInputFile(thisFile, false);
+
+    if(!enough_per_file_space_free()) {
+      error("batchinsert failed: no more room for metadata", thisFile);
+    }
+
+    if(s.count(thisKey)) {
+      VERB_LOG(0, "key already exists in database: %s\n", thisKey);
+    } else {
+      s.insert(thisKey);
+      // Make a track index table of features to file indexes
+      unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
+      if(!numVectors) {
+        VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
+      }
+      else{
+	// Check that time-stamp file exists
+	if(usingTimes){
+	  if(timesFile->eof()) {
+	    error("not enough timestamp files in timesList", timesFileName);
+	  }
+	  thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
+	  if(!thisTimesFile->is_open()) {
+	    error("Cannot open timestamp file", thisTimesFileName);
+	  }
+	  if(thisTimesFile)
+	    delete thisTimesFile;
+	}
+
+	// Check that power file exists        
+        if (usingPower) {
+          if(powerFile->eof()) {
+            error("not enough power files in powerList", powerFileName);
+          }
+          thispowerfd = open(thisPowerFileName, O_RDONLY);
+          if (thispowerfd < 0) {
+            error("failed to open power file", thisPowerFileName);
+          }
+          if (0 < thispowerfd) {
+            close(thispowerfd);
+          }
+        }
+
+	// persist links to the feature files for reading from filesystem later
+	
+	// Primary Keys
+	INSERT_FILETABLE_STRING(fileTable, thisKey);
+	
+	// Feature Vector fileNames
+	INSERT_FILETABLE_STRING(featureFileNameTable, thisFile);
+	
+	// Time Stamp fileNames
+	if(usingTimes)
+	  INSERT_FILETABLE_STRING(timesFileNameTable, thisTimesFileName);
+
+
+	// Power fileNames
+	if(usingPower)
+	  INSERT_FILETABLE_STRING(powerFileNameTable, thisPowerFileName);
+
+	// Increment file count
+	dbH->numFiles++;  
+  
+	// Update Header information
+	dbH->length+=(statbuf.st_size-sizeof(int));
+  
+	// Update track to file index map
+	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
+
+	totalVectors+=numVectors;
+
+	// Copy the header back to the database
+	memcpy (db, dbH, sizeof(dbTableHeaderT));  
+      }
+    }
+    // CLEAN UP
+    if(indata)
+      munmap(indata,statbuf.st_size);
+    if(infid>0)
+      close(infid);
+  } while(!filesIn->eof());
+
+  VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
+
+  delete [] thisPowerFileName;
+  if(key && (key != inFile)) {
+    delete [] thisKey;
+  }
+  delete [] thisFile;
+  delete [] thisTimesFileName;
+  
+  delete filesIn;
+  delete keysIn;
+
+  // Report status
+  status(dbName);
+}