diff insert.cpp @ 239:2cc06e5b05a5

Merge refactoring branch. Bug fixes: * 64-bit powertable bug; * -inf - -inf bug; * use new times information; * plus short track, O2_MAXFILES and structure padding ABI fixes (already backported) Major code changes: * split source into functional units, known as 'files'; * Reporter class for accumulating and reporting on query results; * much OAOOization, mostly from above: net 800 LOC (25%) shorter.
author mas01cr
date Thu, 13 Dec 2007 14:23:32 +0000
parents
children a6c9a1c68646 abfb26e08d9c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/insert.cpp	Thu Dec 13 14:23:32 2007 +0000
@@ -0,0 +1,304 @@
+#include "audioDB.h"
+
+bool audioDB::enough_data_space_free(off_t size) {
+  return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
+}
+
+void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
+  lseek(dbfid, dbH->dataOffset + offset, SEEK_SET);
+  write(dbfid, buffer, size);
+}
+
+void audioDB::insert(const char* dbName, const char* inFile) {
+  forWrite = true;
+  initTables(dbName, inFile);
+
+  if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
+    error("Must use timestamps with timestamped database","use --times");
+
+  if(!usingPower && (dbH->flags & O2_FLAG_POWER))
+    error("Must use power with power-enabled database", dbName);
+
+  if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
+    error("Insert failed: no more room in database", inFile);
+  }
+
+  if(!key)
+    key=inFile;
+  // Linear scan of filenames check for pre-existing feature
+  unsigned alreadyInserted=0;
+  for(unsigned k=0; k<dbH->numFiles; k++)
+    if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key)+1)==0){
+      alreadyInserted=1;
+      break;
+    }
+
+  if(alreadyInserted) {
+    VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
+    return;
+  }
+  
+  // Make a track index table of features to file indexes
+  unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
+  if(!numVectors) {
+    VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key);
+
+    // CLEAN UP
+    munmap(indata,statbuf.st_size);
+    munmap(db,dbH->dbSize);
+    close(infid);
+    return;
+  }
+
+  strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key));
+
+  off_t insertoffset = dbH->length;// Store current state
+
+  // Check times status and insert times from file
+  unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
+  double *timesdata = timesTable + 2*indexoffset;
+
+  if(2*(indexoffset + numVectors) > timesTableLength) {
+    error("out of space for times", key);
+  }
+  
+  if (usingTimes) {
+    insertTimeStamps(numVectors, timesFile, timesdata);
+  }
+
+  double *powerdata = powerTable + indexoffset;
+  insertPowerData(numVectors, powerfd, powerdata);
+
+  // Increment file count
+  dbH->numFiles++;
+
+  // Update Header information
+  dbH->length+=(statbuf.st_size-sizeof(int));
+
+  // Update track to file index map
+  memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned));  
+
+  insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
+  
+  // Norm the vectors on input if the database is already L2 normed
+  if(dbH->flags & O2_FLAG_L2NORM)
+    unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
+
+  // Report status
+  status(dbName);
+  VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int)));
+
+  // Copy the header back to the database
+  memcpy (db, dbH, sizeof(dbTableHeaderT));  
+
+  // CLEAN UP
+  munmap(indata,statbuf.st_size);
+  close(infid);
+}
+
+void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) {
+  assert(usingTimes);
+
+  unsigned numtimes = 0;
+
+  if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) {
+    dbH->flags=dbH->flags|O2_FLAG_TIMES;
+  } else if(!(dbH->flags & O2_FLAG_TIMES)) {
+    error("Timestamp file used with non-timestamped database", timesFileName);
+  }
+   
+  if(!timesFile->is_open()) {
+    error("problem opening times file on timestamped database", timesFileName);
+  }
+
+  double timepoint, next;
+  *timesFile >> timepoint;
+  if (timesFile->eof()) {
+    error("no entries in times file", timesFileName);
+  }
+  numtimes++;
+  do {
+    *timesFile >> next;
+    if (timesFile->eof()) {
+      break;
+    }
+    numtimes++;
+    timesdata[0] = timepoint;
+    timepoint = (timesdata[1] = next);
+    timesdata += 2;
+  } while (numtimes < numVectors + 1);
+
+  if (numtimes < numVectors + 1) {
+    error("too few timepoints in times file", timesFileName);
+  }
+
+  *timesFile >> next;
+  if (!timesFile->eof()) {
+    error("too many timepoints in times file", timesFileName);
+  }
+}
+
+void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
+  if (usingPower) {
+    if (!(dbH->flags & O2_FLAG_POWER)) {
+      error("Cannot insert power data on non-power DB", dbName);
+    }
+
+    int one;
+    unsigned int count;
+
+    count = read(powerfd, &one, sizeof(unsigned int));
+    if (count != sizeof(unsigned int)) {
+      error("powerfd read failed", "int", "read");
+    }
+    if (one != 1) {
+      error("dimensionality of power file not 1", powerFileName);
+    }
+
+    // FIXME: should check that the powerfile is the right size for
+    // this.  -- CSR, 2007-10-30
+    count = read(powerfd, powerdata, numVectors * sizeof(double));
+    if (count != numVectors * sizeof(double)) {
+      error("powerfd read failed", "double", "read");
+    }
+  }
+}
+
+void audioDB::batchinsert(const char* dbName, const char* inFile) {
+
+  forWrite = true;
+  initDBHeader(dbName);
+
+  if(!key)
+    key=inFile;
+  std::ifstream *filesIn = 0;
+  std::ifstream *keysIn = 0;
+  std::ifstream* thisTimesFile = 0;
+  int thispowerfd = 0;
+
+  if(!(filesIn = new std::ifstream(inFile)))
+    error("Could not open batch in file", inFile);
+  if(key && key!=inFile)
+    if(!(keysIn = new std::ifstream(key)))
+      error("Could not open batch key file",key);
+  
+  if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
+    error("Must use timestamps with timestamped database","use --times");
+
+  if(!usingPower && (dbH->flags & O2_FLAG_POWER))
+    error("Must use power with power-enabled database", dbName);
+
+  unsigned totalVectors=0;
+  char *thisKey = new char[MAXSTR];
+  char *thisFile = new char[MAXSTR];
+  char *thisTimesFileName = new char[MAXSTR];
+  char *thisPowerFileName = new char[MAXSTR];
+  
+  do{
+    filesIn->getline(thisFile,MAXSTR);
+    if(key && key!=inFile)
+      keysIn->getline(thisKey,MAXSTR);
+    else
+      thisKey = thisFile;
+    if(usingTimes)
+      timesFile->getline(thisTimesFileName,MAXSTR);	  
+    if(usingPower)
+      powerFile->getline(thisPowerFileName, MAXSTR);
+    
+    if(filesIn->eof())
+      break;
+
+    initInputFile(thisFile);
+
+    if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
+      error("batchinsert failed: no more room in database", thisFile);
+    }
+    
+    // Linear scan of filenames check for pre-existing feature
+    unsigned alreadyInserted=0;
+  
+    for(unsigned k=0; k<dbH->numFiles; k++)
+      if(strncmp(fileTable + k*O2_FILETABLESIZE, thisKey, strlen(thisKey)+1)==0){
+	alreadyInserted=1;
+	break;
+      }
+  
+    if(alreadyInserted) {
+      VERB_LOG(0, "key already exists in database: %s\n", thisKey);
+    } else {
+      // Make a track index table of features to file indexes
+      unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
+      if(!numVectors) {
+        VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
+      }
+      else{
+	if(usingTimes){
+	  if(timesFile->eof()) {
+	    error("not enough timestamp files in timesList", timesFileName);
+	  }
+	  thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
+	  if(!thisTimesFile->is_open()) {
+	    error("Cannot open timestamp file", thisTimesFileName);
+	  }
+	  off_t insertoffset = dbH->length;
+	  unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double));
+	  double *timesdata = timesTable + 2*indexoffset;
+          if(2*(indexoffset + numVectors) > timesTableLength) {
+            error("out of space for times", key);
+          }
+	  insertTimeStamps(numVectors, thisTimesFile, timesdata);
+	  if(thisTimesFile)
+	    delete thisTimesFile;
+	}
+        
+        if (usingPower) {
+          if(powerFile->eof()) {
+            error("not enough power files in powerList", powerFileName);
+          }
+          thispowerfd = open(thisPowerFileName, O_RDONLY);
+          if (thispowerfd < 0) {
+            error("failed to open power file", thisPowerFileName);
+          }
+          off_t insertoffset = dbH->length;
+          unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double));
+          double *powerdata = powerTable + poweroffset;
+          insertPowerData(numVectors, thispowerfd, powerdata);
+          if (0 < thispowerfd) {
+            close(thispowerfd);
+          }
+        }
+	strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey));
+  
+	off_t insertoffset = dbH->length;// Store current state
+
+	// Increment file count
+	dbH->numFiles++;  
+  
+	// Update Header information
+	dbH->length+=(statbuf.st_size-sizeof(int));
+  
+	// Update track to file index map
+	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
+	
+	insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
+	
+	// Norm the vectors on input if the database is already L2 normed
+	if(dbH->flags & O2_FLAG_L2NORM)
+	  unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
+	
+	totalVectors+=numVectors;
+
+	// Copy the header back to the database
+	memcpy (db, dbH, sizeof(dbTableHeaderT));  
+      }
+    }
+    // CLEAN UP
+    munmap(indata,statbuf.st_size);
+    close(infid);
+  } while(!filesIn->eof());
+
+  VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
+  
+  // Report status
+  status(dbName);
+}