changeset 108:bc141fd1dc41

New on-disk format! * new magic number: "o2db" rather than "O2DB". Check for the old one and give a helpful error message. (We could in principle handle the old databases transparently, but only MC has actually used them and he has indicated that this is not desperately necessary, so we don't.) * add fields to header: a file format version (currently 0; we have 2^32-1 revisions to make before we have to worry...) and fields for the offsets of the various tables. This is hopefully a little bit more futureproof: we can move the default locations of the tables around, and even adding new optional tables can be done easily in a fairly transparent manner (if the relevant header field is 0, don't use that feature). * align regions to the appropriate 32-word boundary. This gives us some room to breathe in the header (admittedly only 15 words on 64-bit architectures...)
author mas01cr
date Fri, 05 Oct 2007 14:21:43 +0000
parents a0e422e3c553
children 74fb7524999c
files audioDB.cpp audioDB.h
diffstat 2 files changed, 53 insertions(+), 45 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB.cpp	Fri Oct 05 11:37:56 2007 +0000
+++ b/audioDB.cpp	Fri Oct 05 14:21:43 2007 +0000
@@ -420,11 +420,17 @@
   assert(dbH);
 
   // Initialize header
-  dbH->magic=O2_MAGIC;
-  dbH->numFiles=0;
-  dbH->length=0;
-  dbH->dim=0;
-  dbH->flags=0; //O2_FLAG_L2NORM;
+  dbH->magic = O2_MAGIC;
+  dbH->version = O2_FORMAT_VERSION;
+  dbH->numFiles = 0;
+  dbH->dim = 0;
+  dbH->flags = 0;
+  dbH->length = 0;
+  dbH->fileTableOffset = ALIGN_UP(O2_HEADERSIZE, 8);
+  dbH->trackTableOffset = ALIGN_UP(dbH->fileTableOffset + O2_FILETABLESIZE*O2_MAXFILES, 8);
+  dbH->dataOffset = ALIGN_UP(dbH->trackTableOffset + O2_TRACKTABLESIZE*O2_MAXFILES, 8);
+  dbH->l2normTableOffset = ALIGN_DOWN(O2_DEFAULTDBSIZE - O2_MAXFILES*O2_MEANNUMVECTORS*sizeof(double), 8);
+  dbH->timesTableOffset = ALIGN_DOWN(dbH->l2normTableOffset - O2_MAXFILES*O2_MEANNUMVECTORS*sizeof(double), 8);
 
   memcpy (db, dbH, O2_HEADERSIZE);
   if(verbosity) {
@@ -434,8 +440,7 @@
 
 
 void audioDB::drop(){
-    
-    
+  // FIXME: drop something?  Should we even allow this?
 }
 
 // initTables - memory map files passed as arguments
@@ -462,15 +467,19 @@
     error("error reading db header", dbName, "read");
   }
 
-  fileTableOffset = O2_HEADERSIZE;
-  trackTableOffset = fileTableOffset + O2_FILETABLESIZE*O2_MAXFILES;
-  dataoffset = trackTableOffset + O2_TRACKTABLESIZE*O2_MAXFILES;
-  l2normTableOffset = O2_DEFAULTDBSIZE - O2_MAXFILES*O2_MEANNUMVECTORS*sizeof(double);
-  timesTableOffset = l2normTableOffset - O2_MAXFILES*O2_MEANNUMVECTORS*sizeof(double);
+  if(dbH->magic == O2_OLD_MAGIC) {
+    // FIXME: if anyone ever complains, write the program to convert
+    // from the old audioDB format to the new...
+    error("database file has old O2 header", dbName);
+  }
 
   if(dbH->magic != O2_MAGIC) {
     cerr << "expected: " << O2_MAGIC << ", got: " << dbH->magic << endl;
-    error("database file has incorrect header",dbName);
+    error("database file has incorrect header", dbName);
+  }
+
+  if(dbH->version != O2_FORMAT_VERSION) {
+    error("database file has incorect version", dbName);
   }
 
   if(inFile)
@@ -497,11 +506,11 @@
     error("mmap error for initting tables of database", "", "mmap");
 
   // Make some handy tables with correct types
-  fileTable= (char*)(db+fileTableOffset);
-  trackTable = (unsigned*)(db+trackTableOffset);
-  dataBuf  = (double*)(db+dataoffset);
-  l2normTable = (double*)(db+l2normTableOffset);
-  timesTable = (double*)(db+timesTableOffset);
+  fileTable= (char*)(db+dbH->fileTableOffset);
+  trackTable = (unsigned*)(db+dbH->trackTableOffset);
+  dataBuf  = (double*)(db+dbH->dataOffset);
+  l2normTable = (double*)(db+dbH->l2normTableOffset);
+  timesTable = (double*)(db+dbH->timesTableOffset);
 }
 
 void audioDB::insert(const char* dbName, const char* inFile){
@@ -569,11 +578,11 @@
   memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
 
   // Update the feature database
-  memcpy (db+dataoffset+insertoffset, indata+sizeof(int), statbuf.st_size-sizeof(int));
+  memcpy (db+dbH->dataOffset+insertoffset, indata+sizeof(int), statbuf.st_size-sizeof(int));
   
   // Norm the vectors on input if the database is already L2 normed
   if(dbH->flags & O2_FLAG_L2NORM)
-    unitNormAndInsertL2((double*)(db+dataoffset+insertoffset), dbH->dim, numVectors, 1); // append
+    unitNormAndInsertL2((double*)(db+dbH->dataOffset+insertoffset), dbH->dim, numVectors, 1); // append
 
   // Report status
   status(dbName);
@@ -668,17 +677,10 @@
   if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
     error("Must use timestamps with timestamped database","use --times");
 
-  fileTableOffset = O2_HEADERSIZE;
-  trackTableOffset = fileTableOffset + O2_FILETABLESIZE*O2_MAXFILES;
-  dataoffset = trackTableOffset + O2_TRACKTABLESIZE*O2_MAXFILES;
-  l2normTableOffset = O2_DEFAULTDBSIZE - O2_MAXFILES*O2_MEANNUMVECTORS*sizeof(double);
-  timesTableOffset = l2normTableOffset - O2_MAXFILES*O2_MEANNUMVECTORS*sizeof(double);
-
   if(dbH->magic!=O2_MAGIC){
     cerr << "expected:" << O2_MAGIC << ", got:" << dbH->magic << endl;
     error("database file has incorrect header",dbName);
   }
-
   
   unsigned totalVectors=0;
   char *thisKey = new char[MAXSTR];
@@ -711,11 +713,11 @@
       error("mmap error for batchinsert into database", "", "mmap");
     
     // Make some handy tables with correct types
-    fileTable= (char*)(db+fileTableOffset);
-    trackTable = (unsigned*)(db+trackTableOffset);
-    dataBuf  = (double*)(db+dataoffset);
-    l2normTable = (double*)(db+l2normTableOffset);
-    timesTable = (double*)(db+timesTableOffset);
+    fileTable= (char*)(db+dbH->fileTableOffset);
+    trackTable = (unsigned*)(db+dbH->trackTableOffset);
+    dataBuf  = (double*)(db+dbH->dataOffset);
+    l2normTable = (double*)(db+dbH->l2normTableOffset);
+    timesTable = (double*)(db+dbH->timesTableOffset);
 
     // Check that there is room for at least 1 more file
     if((char*)timesTable<((char*)dataBuf+(dbH->length+statbuf.st_size-sizeof(int))))
@@ -795,11 +797,11 @@
 	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
 	
 	// Update the feature database
-	memcpy (db+dataoffset+insertoffset, indata+sizeof(int), statbuf.st_size-sizeof(int));
+	memcpy (db+dbH->dataOffset+insertoffset, indata+sizeof(int), statbuf.st_size-sizeof(int));
 	
 	// Norm the vectors on input if the database is already L2 normed
 	if(dbH->flags & O2_FLAG_L2NORM)
-	  unitNormAndInsertL2((double*)(db+dataoffset+insertoffset), dbH->dim, numVectors, 1); // append
+	  unitNormAndInsertL2((double*)(db+dbH->dataOffset+insertoffset), dbH->dim, numVectors, 1); // append
 	
 	totalVectors+=numVectors;
       }
@@ -895,11 +897,11 @@
     cout << "data dim:" << dbH->dim <<endl;
     if(dbH->dim>0){
       cout << "total vectors:" << dbH->length/(sizeof(double)*dbH->dim)<<endl;
-      cout << "vectors available:" << (timesTableOffset-(dataoffset+dbH->length))/(sizeof(double)*dbH->dim) << endl;
+      cout << "vectors available:" << (dbH->timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << endl;
     }
-    cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(timesTableOffset-dataoffset) << "%)" << endl;
-    cout << "bytes available:" << timesTableOffset-(dataoffset+dbH->length) << " (" <<
-      (100.0*(timesTableOffset-(dataoffset+dbH->length)))/(timesTableOffset-dataoffset) << "%)" << endl;
+    cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << endl;
+    cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" <<
+      (100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << endl;
     cout << "flags:" << dbH->flags << endl;
     
     cout << "null count: " << nullCount << " small sequence count " << dudCount-nullCount << endl;    
--- a/audioDB.h	Fri Oct 05 11:37:56 2007 +0000
+++ b/audioDB.h	Fri Oct 05 14:21:43 2007 +0000
@@ -44,7 +44,9 @@
 #define COM_KEYLIST "--keyList"
 #define COM_TIMES "--times"
 
-#define O2_MAGIC 1111765583 // 'B'<<24|'D'<<16|'2'<<8|'O' reads O2DB in little endian order
+#define O2_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24)
+#define O2_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24)
+#define O2_FORMAT_VERSION (0U)
 
 #define O2_DEFAULT_POINTNN (10U)
 #define O2_DEFAULT_TRACKNN  (10U)
@@ -78,15 +80,24 @@
 // Macros
 #define O2_ACTION(a) (strcmp(command,a)==0)
 
+#define ALIGN_UP(x,w) ((x) + ((1<<w)-1) & ~((1<<w)-1))
+#define ALIGN_DOWN(x,w) ((x) & ~((1<<w)-1))
+
 using namespace std;
 
 // 64 byte header
 typedef struct dbTableHeader{
   unsigned magic;
+  unsigned version;
   unsigned numFiles;
   unsigned dim;
-  unsigned length;
   unsigned flags;
+  size_t length;
+  size_t fileTableOffset;
+  size_t trackTableOffset;
+  size_t dataOffset;
+  size_t l2normTableOffset;
+  size_t timesTableOffset;
 } dbTableHeaderT, *dbTableHeaderPtr;
 
 
@@ -111,11 +122,6 @@
   char* indata;
   struct stat statbuf;  
   dbTableHeaderPtr dbH;
-  size_t fileTableOffset;
-  size_t trackTableOffset;
-  size_t dataoffset;
-  size_t l2normTableOffset;
-  size_t timesTableOffset;
   
   char *fileTable;
   unsigned* trackTable;