changeset 404:1fb8bee777e5 api-inversion

Begin working towards inverting audioDB::insert() / audiodb_insert(). New data type adb_datum_t, roughly corresponding to a "track" in current audioDB parlance; it contains exactly the feature information and metadata to record. New function audiodb_insert_datum() to insert one of these adb_datum_t objects into the database; the intention is that not only can insertion of feature files be implemented in terms of this function, but that it will be a useful function in its own right, callable perhaps from PD, Max/MSP, and/or a VAMP plugin. This function is complicated enough that it actually gets a comment. Implement audioDB::insert() in terms of audiodb_insert_datum(), via a wrapper which handles the slightly wacky error/non-error case of attempting to insert features with a key that already exists in the database. Delete whole rafts of code. We can't quite delete everything because there's batchinsert / batchinsert_large_adb to sort out; the good news is that the batchinsert operation can simply be implemented as a loop around audiodb_insert_datum() without loss of efficiency. (There's also a stray extra audiodb_insert() in libtests/0027/, found through an earlier iteration of this patch.)
author mas01cr
date Fri, 05 Dec 2008 22:32:43 +0000
parents 7038f31124d1
children ef4792df8f93
files audioDB.h audioDB_API.h insert.cpp libtests/0027/prog1.c
diffstat 4 files changed, 240 insertions(+), 206 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB.h	Wed Dec 03 17:40:17 2008 +0000
+++ b/audioDB.h	Fri Dec 05 22:32:43 2008 +0000
@@ -358,9 +358,8 @@
   void release_lock(int fd);
   void create(const char* dbName);
   bool enough_per_file_space_free();
-  bool enough_data_space_free(off_t size);
-  void insert_data_vectors(off_t offset, void *buffer, size_t size);
   void insert(const char* dbName, const char* inFile);
+  void insertDatum(const char *inFile, std::ifstream *timesFile, int powerfd, const char *key);
   void batchinsert(const char* dbName, const char* inFile);
   void batchinsert_large_adb(const char* dbName, const char* inFile);
   void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0);
--- a/audioDB_API.h	Wed Dec 03 17:40:17 2008 +0000
+++ b/audioDB_API.h	Fri Dec 05 22:32:43 2008 +0000
@@ -21,6 +21,16 @@
    that we should prefer "audiodb_" */
 typedef struct adb adb_t, *adb_ptr;
 
+struct adb_datum {
+  uint32_t nvectors;  /* number of feature vectors in this datum */
+  uint32_t dim;       /* number of doubles in each feature vector */
+  const char *key;    /* NUL-terminated key the datum is stored under */
+  double *data;       /* nvectors * dim doubles of feature data */
+  double *power;      /* nvectors doubles, or null when there is no power data */
+  double *times;      /* 2 * nvectors doubles, or null when there are no times */
+};
+typedef struct adb_datum adb_datum_t;
+
 //used for both insert and batchinsert
 struct adbinsert {
 
@@ -102,6 +112,7 @@
 int audiodb_power(adb_ptr mydb);
 
 /* insert functions */
+int audiodb_insert_datum(adb_t *, adb_datum_t *);
 int audiodb_insert(adb_ptr mydb, adb_insert_ptr ins);
 int audiodb_batchinsert(adb_ptr mydb, adb_insert_ptr ins, unsigned int size);
 
--- a/insert.cpp	Wed Dec 03 17:40:17 2008 +0000
+++ b/insert.cpp	Fri Dec 05 22:32:43 2008 +0000
@@ -1,4 +1,162 @@
 #include "audioDB.h"
+extern "C" {
+#include "audioDB_API.h"
+}
+#include "audioDB-internals.h"
+
+static bool audiodb_enough_data_space_free(adb_t *adb, off_t size) {
+  /* FIXME: the times table is taken to be the region that follows the
+     data, which is not guaranteed.  Keeping the offsets in an array
+     we can iterate over would let us find the real successor... */
+  const adb_header_t *hdr = adb->header;
+  return (hdr->dataOffset + hdr->length + size <
+          hdr->timesTableOffset);
+}
+
+static bool audiodb_enough_per_file_space_free(adb_t *adb) {
+  /* FIXME: this assumes the file table, track table and data region
+     appear in that order in the database file. */
+  adb_header_t *hdr = adb->header;
+  off_t file_bytes = hdr->trackTableOffset - hdr->fileTableOffset;
+  off_t track_bytes = hdr->dataOffset - hdr->trackTableOffset;
+  int file_slots = file_bytes / O2_FILETABLE_ENTRY_SIZE;
+  int track_slots = track_bytes / O2_TRACKTABLE_ENTRY_SIZE;
+  /* capacity is set by whichever of the two tables fills up first */
+  unsigned int capacity = file_slots < track_slots ? file_slots : track_slots;
+  return (hdr->numFiles < capacity);
+}
+
+/*
+ * Hey, look, a comment.  Normally I wouldn't bother, as the code
+ * should be self-documenting, but a lot of logic is concentrated in
+ * this one place, so let's give an overview beforehand.  To insert a
+ * datum into the database, we:
+ *
+ *  1. check write permission;
+ *  2. check !O2_FLAG_LARGE_ADB;
+ *  3. check for enough space;
+ *  4. check that datum->dim and adb->header->dim agree (or that the
+ *     header dimension is zero, in which case write datum->dim to
+ *     adb->header->dim).
+ *  5. check for presence of datum->key in adb->keys;
+ *  6. check for consistency between power and O2_FLAG_POWER, and 
+ *     times and O2_FLAG_TIMES;
+ *  7. write in data, power, times as appropriate; add to track
+ *     and key tables too;
+ *  8. if O2_FLAG_L2NORM, compute norms and fill in table;
+ *  9. update adb->keys and adb->header;
+ * 10. sync adb->header with disk.
+ *
+ * Step 10 essentially commits the transaction; until we update
+ * header->length, nothing will recognize the newly-written data.
+ * In principle, if it fails, we should roll back, which we can in
+ * fact do on the assumption that nothing in step 9 can ever fail;
+ * on the other hand, if it's failed, then it's unlikely that
+ * rolling back by syncing the original header back to disk is going
+ * to work desperately well.
+ */
+int audiodb_insert_datum(adb_t *adb, adb_datum_t *datum) {
+  off_t size, offset, nfiles;
+  double *l2norm_buffer, *lp, *dp;
+
+  /* 1. check write permission; */
+  if(!(adb->flags & O_RDWR)) {
+    return 1;
+  }
+  /* 2. check !O2_FLAG_LARGE_ADB; */
+  if(adb->header->flags & O2_FLAG_LARGE_ADB) {
+    return 1;
+  }
+  /* 3. check for enough space; */
+  size = sizeof(double) * datum->nvectors * datum->dim;
+  if(!audiodb_enough_data_space_free(adb, size)) {
+    return 1;
+  }
+  if(!audiodb_enough_per_file_space_free(adb)) {
+    return 1;
+  }
+  /* 4. check that datum->dim and adb->header->dim agree (or that the
+   *    header dimension is zero; datum->dim is recorded in step 9).
+   *    A zero datum->dim is refused: the offsets below divide by it.
+   */
+  if(datum->dim == 0) {
+    return 1;
+  }
+  if((adb->header->dim != 0) && (adb->header->dim != datum->dim)) {
+    return 1;
+  }
+  /* 5. check for presence of datum->key in adb->keys; */
+  if(adb->keys->count(datum->key)) {
+    /* not part of an explicit API/ABI, but we need a distinguished
+       value in this circumstance to preserve somewhat wonky behaviour
+       of audioDB::batchinsert. */
+    return 2;
+  }
+  /* 6. check for consistency between power and O2_FLAG_POWER, and
+   *    times and O2_FLAG_TIMES (an empty database may gain times);
+   */
+  if((datum->power && !(adb->header->flags & O2_FLAG_POWER)) ||
+     ((adb->header->flags & O2_FLAG_POWER) && !datum->power)) {
+    return 1;
+  }
+  if(datum->times && !(adb->header->flags & O2_FLAG_TIMES)) {
+    if(adb->header->numFiles != 0) {
+      return 1;
+    }
+  } else if ((adb->header->flags & O2_FLAG_TIMES) && !datum->times) {
+    return 1;
+  }
+  /* 7. write in data, power, times as appropriate; add to track
+   *    and key tables too;
+   */
+  offset = adb->header->length;
+  nfiles = adb->header->numFiles;
+
+  /* FIXME: checking for all these lseek()s and write()s */
+  lseek(adb->fd, adb->header->dataOffset + offset, SEEK_SET);
+  write(adb->fd, datum->data, size);
+  if(datum->power) {
+    lseek(adb->fd, adb->header->powerTableOffset + offset / datum->dim, SEEK_SET);
+    write(adb->fd, datum->power, sizeof(double) * datum->nvectors);
+  }
+  if(datum->times) {
+    lseek(adb->fd, adb->header->timesTableOffset + offset / datum->dim * 2, SEEK_SET);
+    write(adb->fd, datum->times, sizeof(double) * datum->nvectors * 2);
+  }
+  lseek(adb->fd, adb->header->trackTableOffset + nfiles * O2_TRACKTABLE_ENTRY_SIZE, SEEK_SET);
+  write(adb->fd, &datum->nvectors, O2_TRACKTABLE_ENTRY_SIZE);
+  lseek(adb->fd, adb->header->fileTableOffset + nfiles * O2_FILETABLE_ENTRY_SIZE, SEEK_SET);
+  write(adb->fd, datum->key, strlen(datum->key)+1);
+  /* 8. if O2_FLAG_L2NORM, compute norms and fill in table; */
+  if(adb->header->flags & O2_FLAG_L2NORM) {
+    l2norm_buffer = (double *) malloc(datum->nvectors * sizeof(double));
+    if(!l2norm_buffer && datum->nvectors) {
+      return 1; /* nothing committed yet: the header is still untouched */
+    }
+    /* FIXME: shared code with audiodb_norm_existing() */
+    dp = datum->data;
+    lp = l2norm_buffer;
+    for(size_t i = 0; i < datum->nvectors; i++) {
+      *lp = 0;
+      for(unsigned int k = 0; k < datum->dim; k++) {
+        *lp += (*dp)*(*dp);
+        dp++;
+      }
+      lp++;
+    }
+    lseek(adb->fd, adb->header->l2normTableOffset + offset / datum->dim, SEEK_SET);
+    write(adb->fd, l2norm_buffer, sizeof(double) * datum->nvectors);
+    free(l2norm_buffer);
+  }
+  /* 9. update adb->keys and adb->header (only now that nothing above can fail); */
+  adb->keys->insert(datum->key);
+  adb->header->dim = datum->dim;
+  adb->header->flags |= (datum->times ? O2_FLAG_TIMES : 0);
+  adb->header->numFiles += 1;
+  adb->header->length += size;
+  /* 10. sync adb->header with disk. */
+  return audiodb_sync_header(adb);
+}
 
 bool audioDB::enough_per_file_space_free() {
   unsigned int fmaxfiles, tmaxfiles;
@@ -10,110 +168,66 @@
   return(dbH->numFiles < maxfiles);
 }
 
-bool audioDB::enough_data_space_free(off_t size) {
-    return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
-}
-
-void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
-  if(lseek(dbfid, dbH->dataOffset + offset, SEEK_SET) == (off_t) -1) {
-    error("error seeking to offset", "", "lseek");
+void audioDB::insertDatum(const char *inFile, std::ifstream *timesFile, int powerfd, const char *key) {
+  adb_datum_t datum;  /* assembled from the inputs below, then handed to audiodb_insert_datum() */
+  int fd;
+  struct stat statbuf;
+  off_t size;
+  int err;
+  /* a null times/power pointer tells audiodb_insert_datum() that there is none */
+  datum.times = 0;
+  datum.power = 0;
+  /* the feature file is a uint32_t dimension followed by the vector data */
+  if((fd = open(inFile, O_RDONLY)) == -1) {
+    error("failed to open input file", inFile);
   }
-  CHECKED_WRITE(dbfid, buffer, size);
+  if(fstat(fd, &statbuf)) {
+    error("failed to stat input file", inFile);
+  }
+  read(fd, &(datum.dim), sizeof(uint32_t));  /* NOTE(review): read() result is not checked */
+  size = statbuf.st_size - sizeof(uint32_t);  /* bytes left after the dimension word */
+  datum.nvectors = size / (sizeof(double) * datum.dim);  /* NOTE(review): no guard against datum.dim == 0 */
+  datum.data = (double *) malloc(size);
+  if(!datum.data) {
+    error("failed to allocate memory");
+  }
+  read(fd, datum.data, size);  /* NOTE(review): a short or failed read goes undetected */
+  close(fd);
+  if(timesFile) {
+    datum.times = (double *) malloc(sizeof(double) * datum.nvectors * 2);  /* two doubles per vector */
+    if(!datum.times) {
+      error("failed to allocate memory");
+    }
+    insertTimeStamps(datum.nvectors, timesFile, datum.times);
+  }
+  if(powerfd) {  /* NOTE(review): descriptor 0 is treated the same as "no power file" */
+    datum.power = (double *) malloc(sizeof(double) * datum.nvectors);
+    if(!datum.power) {
+      error("failed to allocate memory");
+    }
+    insertPowerData(datum.nvectors, powerfd, datum.power);
+  }
+  datum.key = key ? key : inFile;  /* fall back to the file name when no key is supplied */
+  err = audiodb_insert_datum(adb, &datum);
+  if(err && (err != 2)) {  /* 2 is audiodb_insert_datum()'s "key already present" value; not reported */
+    error("failed to insert data for file", inFile);
+  }  /* NOTE(review): datum.data, datum.times and datum.power are not freed in this function */
 }
 
 void audioDB::insert(const char* dbName, const char* inFile) {
-  forWrite = true;
-  initTables(dbName, inFile);
-
-  if(dbH->flags & O2_FLAG_LARGE_ADB)
-    error("Single-feature inserts not allowed with LARGE audioDB instances");
-
-  if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
-    error("Must use timestamps with timestamped database","use --times");
-
-  if(!usingPower && (dbH->flags & O2_FLAG_POWER))
-    error("Must use power with power-enabled database", dbName);
-
-  if(!enough_per_file_space_free()) {
-    error("Insert failed: no more room for metadata", inFile);
+  if(!adb) {  /* open the database read-write if it is not open yet */
+    if(!(adb = audiodb_open(dbName, O_RDWR))) {
+      error("failed to open database", dbName);
+    }
   }
-
-  if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
-    error("Insert failed: no more room in database", inFile);
+  if(adb->header->flags & O2_FLAG_LARGE_ADB) {
+    /* NOTE(review): empty branch; the removed code called error() for LARGE databases */
+  } else {
+    /* at this point, we have powerfd (an fd), timesFile (a
+     * std::ifstream *) and inFile (a char *).  Wacky, huh? */
+    insertDatum(inFile, timesFile, powerfd, key);
   }
-
-  if(!key)
-    key=inFile;
-  // Linear scan of filenames check for pre-existing feature
-  unsigned alreadyInserted=0;
-  for(unsigned k=0; k<dbH->numFiles; k++)
-    if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)+1)==0){
-      alreadyInserted=1;
-      break;
-    }
-
-  if(alreadyInserted) {
-    VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
-    // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08
-    return;
-  }
-  
-  // Make a track index table of features to file indexes
-  unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
-  if(!numVectors) {
-    VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key);
-
-    // CLEAN UP
-    munmap(indata,statbuf.st_size);
-    munmap(db,dbH->dbSize);
-    close(infid);
-    return;
-  }
-
-  INSERT_FILETABLE_STRING(fileTable, key);
-
-  off_t insertoffset = dbH->length;// Store current state
-
-  // Check times status and insert times from file
-  unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
-  double *timesdata = timesTable + 2*indexoffset;
-
-  if(2*(indexoffset + numVectors) > timesTableLength) {
-    error("out of space for times", key);
-  }
-  
-  if (usingTimes) {
-    insertTimeStamps(numVectors, timesFile, timesdata);
-  }
-
-  double *powerdata = powerTable + indexoffset;
-  insertPowerData(numVectors, powerfd, powerdata);
-
-  // Increment file count
-  dbH->numFiles++;
-
-  // Update Header information
-  dbH->length+=(statbuf.st_size-sizeof(int));
-
-  // Update track to file index map
-  memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned));  
-
-  insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
-  
-  // Norm the vectors on input if the database is already L2 normed
-  if(dbH->flags & O2_FLAG_L2NORM)
-    unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors);
-
-  // Report status
   status(dbName);
-  VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int)));
-
-  // Copy the header back to the database
-  memcpy (db, dbH, sizeof(dbTableHeaderT));  
-
-  // CLEAN UP
-  munmap(indata,statbuf.st_size);
-  close(infid);
 }
 
 void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) {
@@ -121,12 +235,6 @@
 
   unsigned numtimes = 0;
 
-  if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) {
-    dbH->flags=dbH->flags|O2_FLAG_TIMES;
-  } else if(!(dbH->flags & O2_FLAG_TIMES)) {
-    error("Timestamp file used with non-timestamped database", timesFileName);
-  }
-   
   if(!timesFile->is_open()) {
     error("problem opening times file on timestamped database", timesFileName);
   }
@@ -160,10 +268,6 @@
 
 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
   if(usingPower){
-    if (!(dbH->flags & O2_FLAG_POWER)) {
-      error("Cannot insert power data on non-power DB", dbName);
-    }
-    
     int one;
     unsigned int count;
     
@@ -185,7 +289,6 @@
 }
 
 void audioDB::batchinsert(const char* dbName, const char* inFile) {
-
   forWrite = true;
   initDBHeader(dbName);
 
@@ -207,12 +310,6 @@
   if(key && key!=inFile)
     if(!(keysIn = new std::ifstream(key)))
       error("Could not open batch key file",key);
-  
-  if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
-    error("Must use timestamps with timestamped database","use --times");
-
-  if(!usingPower && (dbH->flags & O2_FLAG_POWER))
-    error("Must use power with power-enabled database", dbName);
 
   unsigned totalVectors=0;
   char *thisFile = new char[MAXSTR];
@@ -223,12 +320,6 @@
   char *thisTimesFileName = new char[MAXSTR];
   char *thisPowerFileName = new char[MAXSTR];
 
-  std::set<std::string> s;
-
-  for (unsigned k = 0; k < dbH->numFiles; k++) {
-    s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
-  }
-
   do {
     filesIn->getline(thisFile,MAXSTR);
     if(key && key!=inFile) {
@@ -246,93 +337,26 @@
     if(filesIn->eof()) {
       break;
     }
-    initInputFile(thisFile);
-
-    if(!enough_per_file_space_free()) {
-      error("batchinsert failed: no more room for metadata", thisFile);
+    if(usingTimes){
+      if(timesFile->eof()) {
+        error("not enough timestamp files in timesList", timesFileName);
+      }
+      thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
     }
-
-    if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
-      error("batchinsert failed: no more room in database", thisFile);
-    }
-    
-    if(s.count(thisKey)) {
-      VERB_LOG(0, "key already exists in database: %s\n", thisKey);
-    } else {
-      s.insert(thisKey);
-      // Make a track index table of features to file indexes
-      unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
-      if(!numVectors) {
-        VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
+    if (usingPower) {
+      if(powerFile->eof()) {
+        error("not enough power files in powerList", powerFileName);
       }
-      else{
-	if(usingTimes){
-	  if(timesFile->eof()) {
-	    error("not enough timestamp files in timesList", timesFileName);
-	  }
-	  thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
-	  if(!thisTimesFile->is_open()) {
-	    error("Cannot open timestamp file", thisTimesFileName);
-	  }
-	  off_t insertoffset = dbH->length;
-	  unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double));
-	  double *timesdata = timesTable + 2*indexoffset;
-          if(2*(indexoffset + numVectors) > timesTableLength) {
-            error("out of space for times", key);
-          }
-	  insertTimeStamps(numVectors, thisTimesFile, timesdata);
-	  if(thisTimesFile)
-	    delete thisTimesFile;
-	}
-        
-        if (usingPower) {
-          if(powerFile->eof()) {
-            error("not enough power files in powerList", powerFileName);
-          }
-          thispowerfd = open(thisPowerFileName, O_RDONLY);
-          if (thispowerfd < 0) {
-            error("failed to open power file", thisPowerFileName);
-          }
-          off_t insertoffset = dbH->length;
-          unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double));
-          double *powerdata = powerTable + poweroffset;
-          insertPowerData(numVectors, thispowerfd, powerdata);
-          if (0 < thispowerfd) {
-            close(thispowerfd);
-          }
-        }
-
-	INSERT_FILETABLE_STRING(fileTable, thisKey);
-
-	off_t insertoffset = dbH->length;// Store current state
-
-	// Increment file count
-	dbH->numFiles++;  
-  
-	// Update Header information
-	dbH->length+=(statbuf.st_size-sizeof(int));
-  
-	// Update track to file index map
-	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
-
-	insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
-	
-	// Norm the vectors on input if the database is already L2 normed
-	if(dbH->flags & O2_FLAG_L2NORM)
-	  unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors);
-	
-	totalVectors+=numVectors;
-
-	// Copy the header back to the database
-	memcpy (db, dbH, sizeof(dbTableHeaderT));  
+      thispowerfd = open(thisPowerFileName, O_RDONLY);
+      if (thispowerfd < 0) {
+        error("failed to open power file", thisPowerFileName);
       }
     }
-
-    // CLEAN UP
-    munmap(indata,statbuf.st_size);
-    indata = NULL;
-    close(infid);
-    infid = 0;
+    insertDatum(thisFile, thisTimesFile, thispowerfd, thisKey);
+    if(thisTimesFile)
+      delete thisTimesFile;
+    if(thispowerfd)
+      close(thispowerfd);
   } while(!filesIn->eof());
 
   VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
--- a/libtests/0027/prog1.c	Wed Dec 03 17:40:17 2008 +0000
+++ b/libtests/0027/prog1.c	Fri Dec 05 22:32:43 2008 +0000
@@ -76,7 +76,7 @@
 //${AUDIODB} -d testdb -I -f testfeature -w testpower
     myinsert.features="testfeature";
     myinsert.power="testpower";
-    myerr=audiodb_insert(mydbp,&myinsert);   
+    //myerr=audiodb_insert(mydbp,&myinsert);   
     if (audiodb_insert(mydbp,&myinsert)){ returnval=-1; }