changeset 324:c93be2f3a674

Merge of branches/large_adb -r 514:524 onto the trunk. No conflicts. Added LARGE_ADB support. Turn on with --ntracks 20001 or greater. Use --adb_feature_root to locate feature files at QUERY time. A bug fix in LSH indexing that was incorrectly thresholding large numbers of shingles.
author mas01mc
date Thu, 21 Aug 2008 21:28:33 +0000
parents d2c56d4f841e
children 57fde215d913
files audioDB.cpp audioDB.h common.cpp create.cpp dump.cpp gengetopt.in index.cpp insert.cpp lshlib.cpp lshlib.h query.cpp reporter.h sample.cpp soap.cpp
diffstat 14 files changed, 793 insertions(+), 228 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/audioDB.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -1,19 +1,21 @@
 #include "audioDB.h"
 
 LSH* SERVER_LSH_INDEX_SINGLETON;
+char* SERVER_ADB_ROOT;
+char* SERVER_ADB_FEATURE_ROOT;
 
 PointPair::PointPair(Uns32T a, Uns32T b, Uns32T c):trackID(a),qpos(b),spos(c){};
 
 bool operator<(const PointPair& a, const PointPair& b){
-  return ( (a.qpos<b.qpos) || 
-	   ((a.qpos==b.qpos) && 
-	    ( (a.trackID<b.trackID)) || ((a.trackID==b.trackID)&&(a.spos<b.spos)) ) );	    
+  return ( (a.trackID<b.trackID) ||
+	   ( (a.trackID==b.trackID) &&  
+	     ( (a.spos<b.spos) || ( (a.spos==b.spos) && (a.qpos < b.qpos) )) ) );
 }
 
 bool operator>(const PointPair& a, const PointPair& b){
-  return ( (a.qpos>b.qpos) || 
-	   ((a.qpos==b.qpos) && 
-	    ( (a.trackID>b.trackID)) || ((a.trackID==b.trackID)&&(a.spos>b.spos)) ) );
+  return ( (a.trackID>b.trackID) ||
+	   ( (a.trackID==b.trackID) &&  
+	     ( (a.spos>b.spos) || ( (a.spos==b.spos) && (a.qpos > b.qpos) )) ) );
 }
 
 bool operator==(const PointPair& a, const PointPair& b){
@@ -34,6 +36,10 @@
     error("No command found");
   }
 
+  // Perform database prefix substitution
+  if(adb_root)
+    prefix_name((char** const)&dbName, adb_root);
+
   if(O2_ACTION(COM_SERVER))
     startServer();
 
@@ -86,6 +92,9 @@
   try {
     isServer = 1; // FIXME: Hack
     processArgs(argc, argv);
+    // Perform database prefix substitution
+    if(adb_root)
+      prefix_name((char** const)&dbName, adb_root);
     assert(O2_ACTION(COM_QUERY));
     query(dbName, inFile, adbQueryResponse);
   } catch(char *err) {
@@ -99,6 +108,9 @@
   try {
     isServer = 1; // FIXME: Hack
     processArgs(argc, argv);
+    // Perform database prefix substitution
+    if(adb_root)
+      prefix_name((char** const)&dbName, adb_root);
     assert(O2_ACTION(COM_STATUS));
     status(dbName, adbStatusResponse);
   } catch(char *err) {
@@ -125,6 +137,12 @@
     munmap(powerTable, powerTableLength);
   if(l2normTable)
     munmap(l2normTable, l2normTableLength);
+  if(featureFileNameTable)
+    munmap(featureFileNameTable, fileTableLength);
+  if(timesFileNameTable)
+    munmap(timesFileNameTable, fileTableLength);
+  if(powerFileNameTable)
+    munmap(powerFileNameTable, fileTableLength);
   if(trackOffsetTable)
     delete trackOffsetTable;
   if(reporter)
@@ -237,6 +255,20 @@
     relative_threshold = args_info.relative_threshold_arg;
   }
 
+  if (args_info.adb_root_given){
+    adb_root = args_info.adb_root_arg;
+  }
+
+  if (args_info.adb_feature_root_given){
+    adb_feature_root = args_info.adb_feature_root_arg;
+  }
+
+  // perform dbName path prefix SERVER-side substitution
+  if(SERVER_ADB_ROOT && !adb_root)
+    adb_root = SERVER_ADB_ROOT;
+  if(SERVER_ADB_FEATURE_ROOT && !adb_feature_root)
+    adb_feature_root = SERVER_ADB_FEATURE_ROOT;
+  
   if(args_info.SERVER_given){
     command=COM_SERVER;
     port=args_info.SERVER_arg;
@@ -527,15 +559,23 @@
     std::cout << "data dim:" << dbH->dim <<std::endl;
     if(dbH->dim>0){
       std::cout << "total vectors:" << dbH->length/(sizeof(double)*dbH->dim)<<std::endl;
-      std::cout << "vectors available:" << (dbH->timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << std::endl;
+      if(dbH->flags & O2_FLAG_LARGE_ADB)
+	std::cout << "vectors available:" << O2_MAX_VECTORS - (dbH->length / (sizeof(double)*dbH->dim)) << std::endl;
+      else
+	std::cout << "vectors available:" << (dbH->timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << std::endl;
     }
-    std::cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl;
-    std::cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" <<
-      (100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl;
+    if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
+      std::cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl;
+      std::cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" <<
+	(100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl;
+    }
     std::cout << "flags:" << " l2norm[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_L2NORM)
 	      << "] minmax[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_MINMAX)
 	      << "] power[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_POWER)
-	      << "] times[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_TIMES) << "]" << endl;    
+	      << "] times[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_TIMES) 
+	      << "] largeADB[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_LARGE_ADB)
+	      << "]" << endl;    
+              
     std::cout << "null count: " << nullCount << " small sequence count " << dudCount-nullCount << std::endl;    
   } else {
     adbStatusResponse->result.numFiles = dbH->numFiles;
@@ -550,7 +590,7 @@
 void audioDB::l2norm(const char* dbName) {
   forWrite = true;
   initTables(dbName, 0);
-  if(dbH->length>0){
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB ) && (dbH->length>0) ){
     /* FIXME: should probably be uint64_t */
     unsigned numVectors = dbH->length/(sizeof(double)*dbH->dim);
     CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength);
@@ -563,8 +603,8 @@
 
 void audioDB::power_flag(const char *dbName) {
   forWrite = true;
-  initTables(dbName, 0);
-  if (dbH->length > 0) {
+  initTables(dbName, 0);  
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB ) && (dbH->length>0) ){
     error("cannot turn on power storage for non-empty database", dbName);
   }
   dbH->flags |= O2_FLAG_POWER;
@@ -583,7 +623,7 @@
 
   assert(l2normTable);
 
-  if( !append && (dbH->flags & O2_FLAG_L2NORM) )
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB) && !append && (dbH->flags & O2_FLAG_L2NORM) )
     error("Database is already L2 normed", "automatic norm on insert is enabled");
 
   VERB_LOG(2, "norming %u vectors...", n);
@@ -624,5 +664,7 @@
 // so it is a good place to set any global state variables
 int main(const unsigned argc, char* const argv[]){
   SERVER_LSH_INDEX_SINGLETON = 0; // Initialize global variables
+  SERVER_ADB_ROOT = 0;            // Server-side database root prefix
+  SERVER_ADB_FEATURE_ROOT = 0;    // Server-side features root prefix
   audioDB(argc, argv);
 }
--- a/audioDB.h	Tue Aug 12 14:25:51 2008 +0000
+++ b/audioDB.h	Thu Aug 21 21:28:33 2008 +0000
@@ -80,16 +80,20 @@
 #define O2_DEFAULTDBSIZE (2000000000) // 2GB table size
 
 // Bit masks for packing (trackID,pointID) into 32-bit unsigned int
-#define LSH_N_POINT_BITS 14
-#define LSH_TRACK_MASK 0xFFFFC000U // 2^18 = 262144 tracks
-#define LSH_POINT_MASK 0x00003FFFU // 2^14 = 16384 points per track
+// This can be controlled at compile time
+#define O2_DEFAULT_LSH_N_POINT_BITS 14
+
+// Override the default point bit width for large database support
+#ifndef LSH_N_POINT_BITS
+#define LSH_N_POINT_BITS O2_DEFAULT_LSH_N_POINT_BITS
+#endif
 
 // LIMIT PARAMETERS
 #define O2_DEFAULT_DATASIZE (1355U) // in MB
 #define O2_DEFAULT_NTRACKS (20000U)
 #define O2_DEFAULT_DATADIM (9U)
 #define O2_REALTYPE (double)
-#define O2_MAXFILES (20000U)
+#define O2_MAXFILES (1000000U)
 #define O2_MAXFILESTR (256U)
 #define O2_FILETABLE_ENTRY_SIZE (O2_MAXFILESTR)
 #define O2_TRACKTABLE_ENTRY_SIZE (sizeof(unsigned))
@@ -98,17 +102,21 @@
 #define O2_MAXDIM (2000U)
 #define O2_MAXNN (1000000U)
 #define O2_MAXSEQLEN (8000U)            // maximum feature vectors in a sequence
-#define O2_MAXTRACKS (10000U)           // maximum number of tracks
-#define O2_MAXTRACKLEN ((LSH_POINT_MASK+1)) // maximum shingles in a track
+#define O2_MAXTRACKS (1000000U)           // maximum number of tracks
+#define O2_MAXTRACKLEN (1<<LSH_N_POINT_BITS) // maximum shingles in a track
 #define O2_MAXDOTPRODUCTMEMORY (sizeof(O2_REALTYPE)*O2_MAXSEQLEN*O2_MAXSEQLEN) // 512MB
 #define O2_DISTANCE_TOLERANCE (1e-6)
-#define O2_SERIAL_MAX_TRACKBATCH (10000)
+#define O2_SERIAL_MAX_TRACKBATCH (1000000)
+#define O2_LARGE_ADB_SIZE (O2_DEFAULT_DATASIZE+1) // datasize at which features are kept externally (in Mbytes)
+#define O2_LARGE_ADB_NTRACKS (O2_DEFAULT_NTRACKS+1) // ntracks at which features are kept externally
+#define O2_MAX_VECTORS ( O2_MEANNUMVECTORS * O2_MAXTRACKS )
 
 // Flags
 #define O2_FLAG_L2NORM (0x1U)
 #define O2_FLAG_MINMAX (0x2U)
 #define O2_FLAG_POWER (0x4U)
 #define O2_FLAG_TIMES (0x20U)
+#define O2_FLAG_LARGE_ADB (0x40U)
 #define DISPLAY_FLAG(x) (x?"on":"off")
 
 // Query types
@@ -146,7 +154,17 @@
     fflush(stderr); \
   }
 
+// We will only use this in a 32-bit address space
+// So map the off_t down to 32-bits first
+#define INSERT_FILETABLE_STRING(TABLE, STR) \
+    strncpy(TABLE + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, STR, strlen(STR));
+
+#define SAFE_DELETE(PTR) delete PTR; PTR=0;
+#define SAFE_DELETE_ARRAY(PTR) delete[] PTR; PTR=0;
+
 extern LSH* SERVER_LSH_INDEX_SINGLETON;
+extern char* SERVER_ADB_ROOT;
+extern char* SERVER_ADB_FEATURE_ROOT;
 
 typedef struct dbTableHeader {
   uint32_t magic;
@@ -192,8 +210,10 @@
   std::ifstream *timesFile;
   const char *powerFileName;
   std::ifstream *powerFile;
+  const char* adb_root;
+  const char* adb_feature_root;
+
   int powerfd;
-
   int dbfid;
   int lshfid;
   bool forWrite;
@@ -205,15 +225,19 @@
 
   gsl_rng *rng;
   
-  char *fileTable;
+  char* fileTable;
   unsigned* trackTable;
-  off_t *trackOffsetTable;
+  off_t* trackOffsetTable;
   double* dataBuf;
   double* inBuf;
   double* l2normTable;
   double* timesTable;
   double* powerTable;
 
+  char* featureFileNameTable;
+  char* timesFileNameTable;
+  char* powerFileNameTable;
+
   size_t fileTableLength;
   size_t trackTableLength;
   off_t dataBufLength;
@@ -269,7 +293,7 @@
 
   void initialize_arrays(int track, unsigned int numVectors, double *query, double *data_buffer, double **D, double **DD);
   void delete_arrays(int track, unsigned int numVectors, double **D, double **DD);
-  void read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p);
+  void read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p);
   void set_up_query(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned int *nvp);
   void set_up_query_from_key(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned *nvp, Uns32T queryIndex);
   void set_up_db(double **snp, double **vsnp, double **spp, double **vspp, double **mddp, unsigned int *dvp);
@@ -278,7 +302,7 @@
   double dot_product_points(double* q, double* p, Uns32T  L);
   void initRNG();
   void initDBHeader(const char *dbName);
-  void initInputFile(const char *inFile);
+  void initInputFile(const char *inFile, bool loadData = true);
   void initTables(const char* dbName, const char* inFile = 0);
   void initTablesFromKey(const char* dbName, const Uns32T queryIndex);
   void unitNorm(double* X, unsigned d, unsigned n, double* qNorm);
@@ -286,6 +310,8 @@
   void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata);
   void insertPowerData(unsigned n, int powerfd, double *powerdata);
   unsigned getKeyPos(char* key);
+  void prefix_name(char** const name, const char* prefix);
+
  public:
   audioDB(const unsigned argc, char* const argv[]);
   audioDB(const unsigned argc, char* const argv[], adb__queryResponse *adbQueryResponse);
@@ -301,6 +327,7 @@
   void insert_data_vectors(off_t offset, void *buffer, size_t size);
   void insert(const char* dbName, const char* inFile);
   void batchinsert(const char* dbName, const char* inFile);
+  void batchinsert_large_adb(const char* dbName, const char* inFile);
   void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0);
   void status(const char* dbName, adb__statusResponse *adbStatusResponse=0);
   unsigned random_track(unsigned *propTable, unsigned total);
@@ -322,6 +349,8 @@
   Uns32T lsh_param_N; // Number of rows per hash table
   Uns32T lsh_param_b; // Batch size, in number of tracks, per indexing iteration
   Uns32T lsh_param_ncols; // Maximum number of collision in a hash-table row
+  Uns32T lsh_n_point_bits; // How many bits to use to encode point ID within a track
+
 
   // LSH vector<> containers for one in-core copy of a set of feature vectors
   vector<float>::iterator vi; // feature vector iterator
@@ -342,13 +371,14 @@
   char* index_get_name(const char*dbName, double radius, Uns32T sequenceLength);
   static void index_add_point_approximate(void* instance, Uns32T pointID, Uns32T qpos, float dist); // static point reporter callback method
   static void index_add_point_exact(void* instance, Uns32T pointID, Uns32T qpos, float dist); // static point reporter callback method
-  static Uns32T index_to_trackID(Uns32T lshID);  // Convert lsh point index to audioDB trackID
-  static Uns32T index_to_trackPos(Uns32T lshID); // Convert lsh point index to audioDB trackPos (spos)
-  static Uns32T index_from_trackInfo(Uns32T, Uns32T); // Convert audioDB trackID and trackPos to an lsh point index
+  static Uns32T index_to_trackID(Uns32T lshID, Uns32T nPntBits);  // Convert lsh point index to audioDB trackID
+  static Uns32T index_to_trackPos(Uns32T lshID, Uns32T nPntBits); // Convert lsh point index to audioDB trackPos (spos)
+  static Uns32T index_from_trackInfo(Uns32T trackID, Uns32T pntID, Uns32T nPntBits); // Convert audioDB trackID and trackPos to an lsh point index
   void initialize_exact_evalutation_queue();
   void index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos);
   LSH* index_allocate(char* indexName, bool load_hashTables);
-
+  void init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp);
+  
   // Web Services
   void startServer();
   void ws_status(const char*dbName, char* hostport);
@@ -370,7 +400,9 @@
     timesFile(0),				\
     powerFileName(0),				\
     powerFile(0),				\
-    powerfd(0),					\
+    adb_root(0),                                \
+    adb_feature_root(0),                        \
+    powerfd(0),                                 \
     dbfid(0),					\
     lshfid(0),					\
     forWrite(false),				\
@@ -386,6 +418,9 @@
     l2normTable(0),				\
     timesTable(0),				\
     powerTable(0),                              \
+    featureFileNameTable(0),                    \
+    timesFileNameTable(0),                      \
+    powerFileNameTable(0),                      \
     fileTableLength(0),				\
     trackTableLength(0),			\
     dataBufLength(0),				\
@@ -431,5 +466,6 @@
     lsh_param_N(0),				\
     lsh_param_b(0),				\
     lsh_param_ncols(0),                         \
+    lsh_n_point_bits(0),                        \
     vv(0)
 #endif
--- a/common.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/common.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -126,10 +126,18 @@
     } else {
       fileTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE);
       trackTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_TRACKTABLE_ENTRY_SIZE);
-      dataBufLength = ALIGN_PAGE_UP(dbH->length);
-      timesTableLength = ALIGN_PAGE_UP(2*(dbH->length / dbH->dim));
-      powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim);
-      l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim);
+      if( dbH->flags & O2_FLAG_LARGE_ADB ){
+	dataBufLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE);
+	timesTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE);
+	powerTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE);
+	l2normTableLength = 0;
+      }
+      else{
+	dataBufLength = ALIGN_PAGE_UP(dbH->length);
+	timesTableLength = ALIGN_PAGE_UP(2*(dbH->length / dbH->dim));
+	powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim);
+	l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim);
+      }
     }
     CHECKED_MMAP(char *, fileTable, dbH->fileTableOffset, fileTableLength);
     CHECKED_MMAP(unsigned *, trackTable, dbH->trackTableOffset, trackTableLength);
@@ -143,9 +151,18 @@
      *
      * CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength);
      */
-    CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength);
-    CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength);
-    CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength);
+    if( dbH->flags & O2_FLAG_LARGE_ADB ){
+      CHECKED_MMAP(char *, featureFileNameTable, dbH->dataOffset, fileTableLength);
+      if( dbH->flags & O2_FLAG_TIMES )
+	CHECKED_MMAP(char *, timesFileNameTable, dbH->timesTableOffset, fileTableLength);
+      if( dbH->flags & O2_FLAG_POWER )
+	CHECKED_MMAP(char *, powerFileNameTable, dbH->powerTableOffset, fileTableLength);
+    }
+    else{
+      CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength);
+      CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength);
+      CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength);
+    }
   }
 
   // build track offset table
@@ -154,10 +171,15 @@
   for(Uns32T k = 0; k < dbH->numFiles; k++){
     trackOffsetTable[k] = cumTrack;
     cumTrack += trackTable[k] * dbH->dim;
-  }  
+  }
+
+  // Assign correct number of point bits per track in LSH indexing / retrieval
+  lsh_n_point_bits = dbH->flags >> 28;
+  if( !lsh_n_point_bits )
+    lsh_n_point_bits = O2_DEFAULT_LSH_N_POINT_BITS;
 }
 
-void audioDB::initInputFile (const char *inFile) {
+void audioDB::initInputFile (const char *inFile, bool loadData) {
   if (inFile) {
     if ((infid = open(inFile, O_RDONLY)) < 0) {
       error("can't open input file for reading", inFile, "open");
@@ -189,7 +211,7 @@
       }
     }
     
-    if ((indata = (char *) mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, infid, 0)) == (caddr_t) -1) {
+    if (loadData && ((indata = (char *) mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, infid, 0)) == (caddr_t) -1)) {
       error("mmap error for input", inFile, "mmap");
     }
   }
@@ -208,3 +230,21 @@
     initInputFile(inFile);
 }
 
+// If *name is a relative path, point *name at a newly malloc'd "prefix/name".
+// The original string is not freed. A null prefix or an absolute path is a no-op.
+void audioDB::prefix_name(char** const name, const char* prefix){
+  if(!prefix)
+    return;
+  assert(name && *name);
+  // Do not prefix absolute path+filename (so no length limit applies to it here)
+  if(**name=='/')
+    return;
+  // Result is prefix + '/' + name + NUL, which must fit in O2_MAXFILESTR bytes
+  if (strlen(*name) + strlen(prefix) + 2 > O2_MAXFILESTR)
+    error("error: path prefix + filename too long",prefix);
+  char* prefixedName = (char*) malloc(O2_MAXFILESTR);
+  if(!prefixedName)
+    error("error: could not allocate memory for prefixed filename",prefix);
+  snprintf(prefixedName, O2_MAXFILESTR, "%s/%s", prefix, *name);
+  *name = prefixedName; // side effect new name to old name
+}
--- a/create.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/create.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -2,6 +2,7 @@
 
 /* Make a new database.
 
+IF size(featuredata) < O2_LARGE_ADB_SIZE 
    The database consists of:
 
    * a header (see dbTableHeader struct definition);
@@ -12,6 +13,16 @@
    * timesTable: (start,end) time points for each feature vector;
    * powerTable: associated power for each feature vector;
    * l2normTable: squared l2norms for each feature vector.
+   
+ELSE the database consists of:
+   
+   * a header (see dbTableHeader struct definition);
+   * keyTable: list of keys of tracks
+   * trackTable: sizes of tracks
+   * featureTable: list of feature file names
+   * timesTable: list of times file names
+   * powerTable: list of power file names
+
 */
 
 void audioDB::create(const char* dbName){
@@ -41,10 +52,31 @@
   off_t databytes = ((off_t) datasize) * 1024 * 1024;
   off_t auxbytes = databytes / datadim;
 
-  dbH->timesTableOffset = ALIGN_PAGE_UP(dbH->dataOffset + databytes);
-  dbH->powerTableOffset = ALIGN_PAGE_UP(dbH->timesTableOffset + 2*auxbytes);
-  dbH->l2normTableOffset = ALIGN_PAGE_UP(dbH->powerTableOffset + auxbytes);
-  dbH->dbSize = ALIGN_PAGE_UP(dbH->l2normTableOffset + auxbytes);
+  // For backward-compatibility, record the point-encoding parameter for LSH indexing in the top 4 bits of the adb header flags
+  // If this value is 0 when the header is read back, the default of 14 is used
+
+#if LSH_N_POINT_BITS > 15
+#error "AudioDB Compile ERROR: consistency check of LSH_N_POINT_BITS failed (>15)"
+#endif
+  
+  dbH->flags |= ((uint32_t)LSH_N_POINT_BITS) << 28; // shift as unsigned: 14<<28 does not fit in a signed int
+
+  // If database will fit in a single file the vectors are copied into the AudioDB instance
+  // Else all the vectors are left on the FileSystem and we use the dataOffset as storage
+  // for the location of the features, powers and times files (assuming that arbitrary keys are used for the fileTable)
+  if(ntracks<O2_LARGE_ADB_NTRACKS && datasize<O2_LARGE_ADB_SIZE){
+    dbH->timesTableOffset = ALIGN_PAGE_UP(dbH->dataOffset + databytes);
+    dbH->powerTableOffset = ALIGN_PAGE_UP(dbH->timesTableOffset + 2*auxbytes);
+    dbH->l2normTableOffset = ALIGN_PAGE_UP(dbH->powerTableOffset + auxbytes);
+    dbH->dbSize = ALIGN_PAGE_UP(dbH->l2normTableOffset + auxbytes);
+  }
+  else{ // Create LARGE_ADB, features and powers kept on filesystem 
+    dbH->flags |= O2_FLAG_LARGE_ADB;
+    dbH->timesTableOffset = ALIGN_PAGE_UP(dbH->dataOffset + O2_FILETABLE_ENTRY_SIZE*ntracks);
+    dbH->powerTableOffset = ALIGN_PAGE_UP(dbH->timesTableOffset + O2_FILETABLE_ENTRY_SIZE*ntracks);
+    dbH->l2normTableOffset = ALIGN_PAGE_UP(dbH->powerTableOffset + O2_FILETABLE_ENTRY_SIZE*ntracks);
+    dbH->dbSize = dbH->l2normTableOffset;
+  } 
 
   write(dbfid, dbH, O2_HEADERSIZE);
 
--- a/dump.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/dump.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -5,6 +5,10 @@
     initTables(dbName, 0);
   }
 
+  if(dbH->flags & O2_FLAG_LARGE_ADB){
+    error("error: dump not supported for LARGE_ADB");
+  }
+
   if((mkdir(output, S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
     error("error making output directory", output, "mkdir");
   }
--- a/gengetopt.in	Tue Aug 12 14:25:51 2008 +0000
+++ b/gengetopt.in	Thu Aug 21 21:28:33 2008 +0000
@@ -8,6 +8,7 @@
 section "Database Operations" sectiondesc="All database operations require a database argument."
 
 option "database" d "database file required by Database commands." string typestr="filename" optional
+option "adb_root" - "path prefix for database" string typestr="path" dependon="database" optional
 
 section "Database Creation" sectiondesc="Creating a new database file."
 
@@ -23,7 +24,7 @@
 option "output" - "output directory" string dependon="DUMP" default="audioDB.dump" optional
 option "L2NORM" L "unit norm vectors and norm all future inserts." dependon="database" optional
 option "POWER"  P "turn on power flag for database." dependon="database" optional
-option "INDEX"  X "build an index for -d database at -R radius" dependon="database" dependon="radius" optional
+
 section "Database Information" sectiondesc="Information about databases."
 
 option "STATUS" S "output database information to stdout." dependon="database" optional
@@ -33,7 +34,7 @@
 section "Database Insertion" sectiondesc="The following commands insert feature files, with optional keys and timestamps.\n"
 
 option "INSERT"      I "add feature vectors to an existing database." dependon="features" optional
-option "UPDATE"      U "replace inserted vectors associated with key with new input vectors." dependon="features" dependon="key" dependon="database" optional hidden
+option "adb_feature_root" - "path prefix for feature files, times files and power files" string typestr="path" optional
 option "features" f "binary series of vectors file {int sz:ieee double[][sz]:eof}." string typestr="filename" dependon="database" optional
 option "times"    t "list of time points (ascii) for feature vectors." string typestr="filename" dependon="features" optional
 option "power"    w "binary power feature file." string typestr="filename" dependon="database" optional
@@ -62,6 +63,7 @@
 
 section "Locality-sensitive hashing (LSH) parameters" sectiondesc="These parameters control LSH indexing and retrieval\n"
 
+option "INDEX"  X "build an index for -d database at -R radius and -l sequenceLength" dependon="database" dependon="radius" optional
 option "lsh_w" - "width of LSH hash-function bins. " double default="4.0" dependon="INDEX" optional hidden
 option "lsh_k" - "even number of independent hash functions to employ with LSH" int typestr="size" default="8" dependon="INDEX" optional
 option "lsh_m" - "number of hash tables is m(m-1)/2" int typestr="size" default="5" dependon="INDEX" optional
@@ -79,9 +81,10 @@
 section "Web Services" sectiondesc="These commands enable the database process to establish a connection via the internet and operate as separate client and server processes.\n"
 
 option "SERVER" s "run as standalone web service on named port." int typestr="port" default="14475" optional
+option "load_index" - "make web service with memory-resident hashtables" flag off dependon="radius" optional
 option "client" c "run as a client using named host service." string typestr="hostname:port" optional
-option "load_index" - "make web service with memory-resident hashtables" flag off dependon="radius" optional
+
 
 text "
-Copyright (c) 2007 Michael Casey, Christophe Rhodes
+Copyright (c) 2007-2008 Michael Casey, Christophe Rhodes
                   Goldsmiths, University of London"
--- a/index.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/index.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -8,22 +8,27 @@
 //
 // Author: Michael Casey
 //   Date: 23 June 2008
+//
+// 19th August 2008 - added O2_FLAG_LARGE_ADB support
 
 #include "audioDB.h"
 #include "ReporterBase.h"
 
 
 /************************* LSH point index to audioDB conversion  *****************/
-Uns32T audioDB::index_to_trackID(Uns32T lshID){
-  return lshID>>LSH_N_POINT_BITS;
+Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){
+  assert(nPntBits);
+  return lshID>>nPntBits;
 }
 
-Uns32T audioDB::index_to_trackPos(Uns32T lshID){
-  return lshID&LSH_POINT_MASK;
+Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){
+  assert(nPntBits);
+  return lshID&((1<<nPntBits)-1);
 }
 
-Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){
-  return (trackID << LSH_N_POINT_BITS) | spos;
+Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){
+  assert(nPntBits);
+  return (trackID << nPntBits) | spos;
 }
 
 /************************* LSH indexing and query initialization  *****************/
@@ -52,6 +57,10 @@
     return true;
 }
 
+// If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName())
+// If they match, i.e. path+dbName_resident == path+dbName_requested, use
+// the memory-resident index.
+// Else allocate a new LSH instance and load the index from disk
 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
   LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
   if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
@@ -78,19 +87,20 @@
 
 // Prepare the AudioDB database for read access and allocate auxillary memory
 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
+  if (!(dbH->flags & O2_FLAG_POWER)) {
+    error("INDEXed database must be power-enabled", dbName);
+  }
+
+  double *snpp = *snp, *sppp = 0;
+
   *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
   *snp = new double[*dvp];  // songs norm pointer: L2 norm table for each vector
-
-  double *snpp = *snp, *sppp = 0;
-  memcpy(*snp, l2normTable, *dvp * sizeof(double));
-
-  if (!(dbH->flags & O2_FLAG_POWER)) {
-    error("database not power-enabled", dbName);
-  }
   *spp = new double[*dvp]; // song powertable pointer
   sppp = *spp;
+  memcpy(*snp, l2normTable, *dvp * sizeof(double));
   memcpy(*spp, powerTable, *dvp * sizeof(double));
-
+  
+  
   for(Uns32T i = 0; i < dbH->numFiles; i++){
     if(trackTable[i] >= sequenceLength) {
       sequence_sum(snpp, trackTable[i], sequenceLength);
@@ -102,10 +112,10 @@
     snpp += trackTable[i];
     sppp += trackTable[i];
   }
-
+  
   *vsnp = *snp;
   *vspp = *spp;
-
+  
   // Move the feature vector read pointer to start of fetures in database
   lseek(dbfid, dbH->dataOffset, SEEK_SET);
 }
@@ -113,22 +123,28 @@
 
 /************************ LSH indexing ***********************************/
 void audioDB::index_index_db(const char* dbName){
-  
   char* newIndexName;
   double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
   Uns32T dbVectors = 0;
 
+
   printf("INDEX: initializing header\n");
   // Check if audioDB exists, initialize header and open database for read
   forWrite = false;
   initDBHeader(dbName);
 
+  if(dbH->flags & O2_FLAG_POWER)
+    usingPower = true;
+  
+  if(dbH->flags & O2_FLAG_TIMES)
+    usingTimes = true;
+
   newIndexName = index_get_name(dbName, radius, sequenceLength);
 
   // Set unit norming flag override
   audioDB::normalizedDistance = !audioDB::no_unit_norming;
 
-  printf("INDEX: dim %d\n", dbH->dim);
+  printf("INDEX: dim %d\n", (int)dbH->dim);
   printf("INDEX: R %f\n", radius);
   printf("INDEX: seqlen %d\n", sequenceLength);
   printf("INDEX: lsh_w %f\n", lsh_param_w);
@@ -141,8 +157,6 @@
   fflush(stdout);
 
 
-  index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
-  
   if((lshfid = open(newIndexName,O_RDONLY))<0){
     printf("INDEX: constructing new LSH index\n");  
     printf("INDEX: making index file %s\n", newIndexName);
@@ -160,7 +174,10 @@
     if( endTrack > dbH->numFiles)
       endTrack = dbH->numFiles;
     // Insert up to lsh_param_b tracks
-    index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);    
+    if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
+      index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);  
+    }
+    index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
     lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
     
     // Clean up
@@ -177,7 +194,7 @@
     // Get the lsh header info and find how many tracks are inserted already
     lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here
     assert(lsh);
-    Uns32T maxs = index_to_trackID(lsh->get_maxp())+1;
+    Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
     delete lsh;
     lsh = 0;
 
@@ -211,14 +228,66 @@
     exit(1);
   }
     
+  delete[] newIndexName;
+  delete[] sNorm;
+  delete[] sPower;
+}
 
-  delete[] newIndexName;
-  if(sNorm)
-    delete[] sNorm;
-  if(sPower)
-    delete[] sPower;
 
+// Initialize auxiliary (power and l2norm) data for one track from the filesystem.
+// pre-conditions:
+// dbH->flags & O2_FLAG_LARGE_ADB (error() is called otherwise)
+// feature data allocated and copied (fvp)
+//
+// post-conditions (only when trackTable[trackID] >= sequenceLength):
+// *sPowerp and *spPtrp point to a new[] array of trackTable[trackID] power data
+// *sNormpp and *snPtrp point to a new[] array of trackTable[trackID] l2norm data
+// for shorter tracks nothing is allocated and the output pointers are left untouched
+void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){  
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
+    error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
 
+  // Allocate and read the power sequence (tracks shorter than sequenceLength are skipped entirely)
+  if(trackTable[trackID]>=sequenceLength){
+    
+    char* prefixedString = new char[O2_MAXFILESTR]; // NOTE(review): not released anywhere in this function - confirm ownership
+    char* tmpStr = prefixedString; // remember the original buffer so a substitution by prefix_name can be detected
+    // Open and check dimensions of power file named in this track's powerFileNameTable entry
+    strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
+    prefix_name((char ** const)&prefixedString, adb_feature_root);
+    if(prefixedString!=tmpStr) // pointer was replaced by prefix_name: free the buffer allocated above
+      delete[] tmpStr;
+    powerfd = open(prefixedString, O_RDONLY);
+    if (powerfd < 0) {
+      error("failed to open power file", prefixedString);
+    }
+    if (fstat(powerfd, &statbuf) < 0) {
+      error("fstat error finding size of power file", prefixedString, "fstat");
+    }
+    
+    if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) // file = one int header + one double per vector
+      error("Dimension mismatch: numPowers != numVectors", prefixedString);
+   
+    *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
+    assert(*sPowerp);
+    *spPtrp = *sPowerp; // cursor starts at the head of the new power array
+    insertPowerData(trackTable[trackID], powerfd, *sPowerp); // reads the file into *sPowerp; does nothing unless usingPower is set
+    if (0 < powerfd) {
+      close(powerfd);
+    }
+    
+    sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
+    sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
+    powerTable = 0; // NOTE(review): clears the member powerTable; the reason is not evident here - confirm
+
+    // Allocate and calculate the l2norm sequence from the feature data in fvp
+    *sNormpp = new double[trackTable[trackID]];
+    assert(*sNormpp);
+    *snPtrp = *sNormpp; // cursor starts at the head of the new l2norm array
+    unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp);
+    sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
+    sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
+  }
 }
 
 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
@@ -230,13 +299,35 @@
 
   VERB_LOG(1, "indexing tracks...");
 
-
+  int trackfd = dbfid; // default source of track data: the database file itself
   for(trackID = start_track ; trackID < end_track ; trackID++ ){
-    read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv
+    if( dbH->flags & O2_FLAG_LARGE_ADB ){ // LARGE_ADB: features are read from the per-track feature file instead
+      char* prefixedString = new char[O2_MAXFILESTR]; // NOTE(review): not released in this loop - confirm ownership
+      char* tmpStr = prefixedString; // remember the original buffer so a substitution by prefix_name can be detected
+      // Open and check dimensions of feature file named in this track's featureFileNameTable entry
+      strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
+      prefix_name((char ** const) &prefixedString, adb_feature_root);
+      if(prefixedString!=tmpStr) // pointer was replaced by prefix_name: free the buffer allocated above
+	delete[] tmpStr;
+      initInputFile(prefixedString, false); // nommap, file pointer at correct position
+      trackfd = infid; // read this track from the file initInputFile just opened
+    }
+    read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv
     *fvpp = fvp; // Protect memory allocation and free() for track data
+    
+    if( dbH->flags & O2_FLAG_LARGE_ADB )
+      // Load power and calculate power and l2norm sequence sums
+      init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
+    
     if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
       break;    
-  }
+    if ( dbH->flags & O2_FLAG_LARGE_ADB ){ // per-track cleanup; not reached when the break above fires
+      close(infid);
+      delete[] *sNormpp;
+      delete[] *sPowerp;
+      *sNormpp = *sPowerp = *snPtrp = *spPtrp = 0; // reset all four: the arrays were just freed, so neither cursor may keep pointing into them
+    }
+  } // end for(trackID = start_track ; ... )
   std::cout << "finished inserting." << endl;
 }
 
@@ -256,13 +347,17 @@
       numVecs = trackTable[trackID] - sequenceLength + 1;
     }
   }
-  vv = index_initialize_shingles(numVecs);
-
-  for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
-    index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
   
-  Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
-  Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp);
+  Uns32T numVecsAboveThreshold = 0, collisionCount = 0; 
+  if(numVecs){
+    vv = index_initialize_shingles(numVecs);
+    
+    for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
+      index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
+    
+    numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
+    collisionCount = index_insert_shingles(vv, trackID, *sppp);
+  }
   float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
 
   /* index_norm_shingles() only goes as far as the end of the
@@ -273,9 +368,11 @@
    * So let's be certain the pointers are in the correct place
    */
 
-  *snpp += trackTable[trackID];
-  *sppp += trackTable[trackID];
-  *fvpp += trackTable[trackID] * dbH->dim;
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
+    *snpp += trackTable[trackID];
+    *sppp += trackTable[trackID];
+    *fvpp += trackTable[trackID] * dbH->dim;
+  }
 
   std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
   std::cout.flush();  
@@ -285,10 +382,10 @@
 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
   Uns32T collisionCount = 0;
   cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
-  for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop)
-    if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){
-      collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID));
-      spp+=sequenceHop;
+  for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ // visit every sequenceHop-th shingle in vv
+    if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) // when thresholding is on, insert only shingles whose *spp reaches absolute_threshold
+      collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits)); // accumulate insert_point's return value
+    spp+=sequenceHop; // advance on every iteration, inserted or not, so spp stays in step with pointID
     }
   return collisionCount;
 }
@@ -386,14 +483,14 @@
   if(lsh!=SERVER_LSH_INDEX_SINGLETON){  
     if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
       printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
-    printf("INDEX: dim %d\n", dbH->dim);
+    printf("INDEX: dim %d\n", (int)dbH->dim);
     printf("INDEX: R %f\n", lsh->get_radius());
     printf("INDEX: seqlen %d\n", sequenceLength);
     printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
     printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
     printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
     printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
-    printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp()));
+    printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
     printf("INDEX: Opened LSH index file %s\n", indexName);
     fflush(stdout);
   }
@@ -415,8 +512,8 @@
 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
   assert(instancePtr); // We need an instance for this callback
   audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
-  Uns32T trackID = index_to_trackID(pointID);
-  Uns32T spos = index_to_trackPos(pointID);
+  Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
+  Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
   // Skip identity in query_from_key
   if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
     myself->reporter->add_point(trackID, qpos, spos, dist);
@@ -427,8 +524,8 @@
 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
   assert(instancePtr); // We need an instance for this callback
   audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance  
-  Uns32T trackID = index_to_trackID(pointID);
-  Uns32T spos = index_to_trackPos(pointID);
+  Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
+  Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
   // Skip identity in query_from_key
   if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
     myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
@@ -449,10 +546,10 @@
 // return nqv: if index exists
 int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) {
   
-  unsigned int numVectors;
-  double *query, *query_data;
-  double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0;
-  double meanQdur;
+  unsigned int numVectors = 0;
+  double *query = 0, *query_data = 0;
+  double *qNorm = 0, *qnPtr = 0, *qPower = 0, *qpPtr = 0;
+  double meanQdur = 0;
   void (*add_point_func)(void*,Uns32T,Uns32T,float);
 
   // Set the point-reporter callback based on the value of lsh_exact
--- a/insert.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/insert.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -11,7 +11,7 @@
 }
 
 bool audioDB::enough_data_space_free(off_t size) {
-  return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
+    return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); // true while dataOffset + length + size stays strictly below timesTableOffset
 }
 
 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
@@ -23,6 +23,9 @@
   forWrite = true;
   initTables(dbName, inFile);
 
+  if(dbH->flags & O2_FLAG_LARGE_ADB)
+    error("Single-feature inserts not allowed with LARGE audioDB instances");
+
   if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
     error("Must use timestamps with timestamped database","use --times");
 
@@ -49,6 +52,7 @@
 
   if(alreadyInserted) {
     VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
+    // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08
     return;
   }
   
@@ -64,7 +68,7 @@
     return;
   }
 
-  strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key));
+  INSERT_FILETABLE_STRING(fileTable, key);
 
   off_t insertoffset = dbH->length;// Store current state
 
@@ -153,14 +157,14 @@
 }
 
 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
-  if (usingPower) {
+  if(usingPower){
     if (!(dbH->flags & O2_FLAG_POWER)) {
       error("Cannot insert power data on non-power DB", dbName);
     }
-
+    
     int one;
     unsigned int count;
-
+    
     count = read(powerfd, &one, sizeof(unsigned int));
     if (count != sizeof(unsigned int)) {
       error("powerfd read failed", "int", "read");
@@ -168,7 +172,7 @@
     if (one != 1) {
       error("dimensionality of power file not 1", powerFileName);
     }
-
+    
     // FIXME: should check that the powerfile is the right size for
     // this.  -- CSR, 2007-10-30
     count = read(powerfd, powerdata, numVectors * sizeof(double));
@@ -183,6 +187,12 @@
   forWrite = true;
   initDBHeader(dbName);
 
+  // Treat large ADB instances differently
+  if( dbH->flags & O2_FLAG_LARGE_ADB ){
+    batchinsert_large_adb(dbName, inFile) ;
+    return;
+  }
+    
   if(!key)
     key=inFile;
   std::ifstream *filesIn = 0;
@@ -289,8 +299,9 @@
             close(thispowerfd);
           }
         }
-	strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey));
-  
+
+	INSERT_FILETABLE_STRING(fileTable, thisKey);
+
 	off_t insertoffset = dbH->length;// Store current state
 
 	// Increment file count
@@ -301,7 +312,7 @@
   
 	// Update track to file index map
 	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
-	
+
 	insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
 	
 	// Norm the vectors on input if the database is already L2 normed
@@ -334,3 +345,171 @@
   // Report status
   status(dbName);
 }
+
+
+// BATCHINSERT_LARGE_ADB
+//
+// Inserts file names (not feature data) into the ADB instance: for every line of the
+// batch list inFile it stores the key, featureFileName, [powerFileName], [timesFileName]
+// and the track's vector count. Keys are read from the `key` list when one is given,
+// otherwise the feature file name is the key. Duplicate keys and empty files are skipped.
+//
+// This method is intended for databases that are large enough to only support indexed query
+// So exhaustive searching across all feature vectors will not be performed
+//
+// l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time
+//
+// LIMITS:
+// We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles
+//
+void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) {
+
+  if(!key)
+    key=inFile;
+  std::ifstream *filesIn = 0;
+  std::ifstream *keysIn = 0;
+  std::ifstream* thisTimesFile = 0;
+  int thispowerfd = 0;
+
+  if(!(filesIn = new std::ifstream(inFile)) || !filesIn->is_open()) // new never yields null; is_open() is what detects a failed open
+    error("Could not open batch in file", inFile);
+  if(key && key!=inFile)
+    if(!(keysIn = new std::ifstream(key)) || !keysIn->is_open()) // same check for the optional key list
+      error("Could not open batch key file",key);
+  
+  if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
+    error("Must use timestamps with timestamped database","use --times");
+
+  if(!usingPower && (dbH->flags & O2_FLAG_POWER))
+    error("Must use power with power-enabled database", dbName);
+
+  unsigned totalVectors=0;
+  char *thisFile = new char[MAXSTR];
+  char *thisKey = 0;
+  if (key && (key != inFile)) { // separate key buffer only when a distinct key list is in use
+    thisKey = new char[MAXSTR];
+  }
+  char *thisTimesFileName = new char[MAXSTR];
+  char *thisPowerFileName = new char[MAXSTR];
+
+  std::set<std::string> s; // keys already present, used to reject duplicates
+
+  for (unsigned k = 0; k < dbH->numFiles; k++) {
+    s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
+  }
+
+  do {
+    filesIn->getline(thisFile,MAXSTR);
+    if(key && key!=inFile) {
+      keysIn->getline(thisKey,MAXSTR);
+    } else {
+      thisKey = thisFile; // no key list: thisKey aliases thisFile (and is therefore not deleted separately below)
+    }
+    if(usingTimes) {
+      timesFile->getline(thisTimesFileName,MAXSTR);
+    }
+    if(usingPower) {
+      powerFile->getline(thisPowerFileName, MAXSTR);
+    }
+    
+    if(filesIn->eof()) {
+      break;
+    }
+    
+    initInputFile(thisFile, false);
+
+    if(!enough_per_file_space_free()) {
+      error("batchinsert failed: no more room for metadata", thisFile);
+    }
+
+    if(s.count(thisKey)) {
+      VERB_LOG(0, "key already exists in database: %s\n", thisKey);
+    } else {
+      s.insert(thisKey);
+      // Make a track index table of features to file indexes
+      unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); // file = one int header + dim doubles per vector
+      if(!numVectors) {
+        VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
+      }
+      else{
+	// Check that time-stamp file exists
+	if(usingTimes){
+	  if(timesFile->eof()) {
+	    error("not enough timestamp files in timesList", timesFileName);
+	  }
+	  thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
+	  if(!thisTimesFile->is_open()) {
+	    error("Cannot open timestamp file", thisTimesFileName);
+	  }
+	  if(thisTimesFile)
+	    delete thisTimesFile;
+	}
+
+	// Check that power file exists        
+        if (usingPower) {
+          if(powerFile->eof()) {
+            error("not enough power files in powerList", powerFileName);
+          }
+          thispowerfd = open(thisPowerFileName, O_RDONLY);
+          if (thispowerfd < 0) {
+            error("failed to open power file", thisPowerFileName);
+          }
+          if (0 < thispowerfd) {
+            close(thispowerfd);
+          }
+        }
+
+	// persist links to the feature files for reading from filesystem later
+	
+	// Primary Keys
+	INSERT_FILETABLE_STRING(fileTable, thisKey);
+	
+	// Feature Vector fileNames
+	INSERT_FILETABLE_STRING(featureFileNameTable, thisFile);
+	
+	// Time Stamp fileNames
+	if(usingTimes)
+	  INSERT_FILETABLE_STRING(timesFileNameTable, thisTimesFileName);
+
+
+	// Power fileNames
+	if(usingPower)
+	  INSERT_FILETABLE_STRING(powerFileNameTable, thisPowerFileName);
+
+	// Increment file count
+	dbH->numFiles++;  
+  
+	// Update Header information
+	dbH->length+=(statbuf.st_size-sizeof(int));
+  
+	// Update track to file index map
+	memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));  
+
+	totalVectors+=numVectors;
+
+	// Copy the header back to the database
+	memcpy (db, dbH, sizeof(dbTableHeaderT));  
+      }
+    }
+    // CLEAN UP
+    if(indata)
+      munmap(indata,statbuf.st_size);
+    if(infid>0)
+      close(infid);
+  } while(!filesIn->eof());
+
+  VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (uintmax_t) (totalVectors * dbH->dim * sizeof(double)));
+
+  delete [] thisPowerFileName;
+  if(key && (key != inFile)) {
+    delete [] thisKey;
+  }
+  delete [] thisFile;
+  delete [] thisTimesFileName;
+  
+  delete filesIn;
+  delete keysIn;
+
+  // Report status
+  status(dbName);
+}
--- a/lshlib.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/lshlib.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -771,9 +771,12 @@
     // Align each hash table to page boundary
     char* dbtable = serial_mmap(fid, hashTableSize, 1, 
 				align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
+#ifdef __CYGWIN__
+    // No madvise in CYGWIN
+#else
     if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
       error("could not advise hashtable memory","","madvise");
-    
+#endif
     maxColCount=0;
     minColCount=O2_SERIAL_MAX_COLS;
     meanColCount=0;
@@ -1161,8 +1164,12 @@
     // Align each hash table to page boundary
     char* dbtable = serial_mmap(fid, hashTableSize, 0, 
 				align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
+#ifdef __CYGWIN__
+    // No madvise in CYGWIN
+#else
     if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
       error("could not advise hashtable memory","","madvise");    
+#endif
     pt=(SerialElementT*)dbtable;
     for( y = 0 ; y < H::N ; y++ ){
       // Move disk pointer to beginning of row
@@ -1331,8 +1338,12 @@
     // memory map a single hash table for random access
     char* db = serial_mmap(dbfid, hashTableSize, 0, 
 			   align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
+#ifdef __CYGWIN__
+    // No madvise in CYGWIN
+#else
     if(madvise(db, hashTableSize, MADV_RANDOM)<0)
       error("could not advise local hashtable memory","","madvise");
+#endif
     SerialElementT* pe = (SerialElementT*)db ;
     for(Uns32T qpos=0; qpos<vv.size(); qpos++){
       H::compute_hash_functions(vv[qpos]);
@@ -1364,8 +1375,12 @@
     // memory map a single hash table for random access
     char* db = serial_mmap(dbfid, hashTableSize, 0, 
 			   align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
+#ifdef __CYGWIN__
+    // No madvise in CYGWIN
+#else
     if(madvise(db, hashTableSize, MADV_RANDOM)<0)
       error("could not advise local hashtable memory","","madvise");
+#endif
     SerialElementT* pe = (SerialElementT*)db ;
     H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j)); 
     serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
@@ -1384,8 +1399,12 @@
     // memory map a single hash table for random access
     char* db = serial_mmap(dbfid, hashTableSize, 0, 
 			   align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
+#ifdef __CYGWIN__
+    // No madvise in CYGWIN
+#else
     if(madvise(db, hashTableSize, MADV_SEQUENTIAL)<0)
       error("could not advise local hashtable memory","","madvise");
+#endif
     SerialElementT* pe = (SerialElementT*)db ;
     printf("*********** TABLE %d ***************\n", j);
     fflush(stdout);
--- a/lshlib.h	Tue Aug 12 14:25:51 2008 +0000
+++ b/lshlib.h	Thu Aug 21 21:28:33 2008 +0000
@@ -58,23 +58,25 @@
 #define O2_SERIAL_HEADER_SIZE sizeof(SerialHeaderT)
 #define O2_SERIAL_ELEMENT_SIZE sizeof(SerialElementT)
 #define O2_SERIAL_MAX_TABLES (200)
-#define O2_SERIAL_MAX_ROWS (1000000)
-#define O2_SERIAL_MAX_COLS (100000)
+#define O2_SERIAL_MAX_ROWS (1000000000)
+#define O2_SERIAL_MAX_COLS (1000000)
 #define O2_SERIAL_MAX_DIM (2000)
 #define O2_SERIAL_MAX_FUNS (100)
 #define O2_SERIAL_MAX_BINWIDTH (200)
 #define O2_SERIAL_MAXFILESIZE (4000000000UL)
 
 // Flags for Serial Header
-#define O2_SERIAL_FILEFORMAT1 (0x1U)       // Optimize for on-disk search
-#define O2_SERIAL_FILEFORMAT2 (0x2U)       // Optimize for in-core search
+#define O2_SERIAL_FILEFORMAT1 (0x1U)       // Optimize disk format for on-disk search
+#define O2_SERIAL_FILEFORMAT2 (0x2U)       // Optimize disk format for in-core search
+#define O2_SERIAL_COREFORMAT1 (0x4U)
+#define O2_SERIAL_COREFORMAT2 (0x8U)
 
 // Flags for serialization fileformat2: use high 3 bits of Uns32T
-#define O2_SERIAL_TOKEN_T1 (0xFFFFFFFC)
+#define O2_SERIAL_TOKEN_T1 (0xFFFFFFFCU)
 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU)
 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU)
 
-#define O2_INDEX_MAXSTR (512)
+#define O2_INDEX_MAXSTR (256)
 
 unsigned align_up(unsigned x, unsigned w);
 
@@ -320,7 +322,7 @@
 
   // Callback Function for point reporting
   void* calling_instance; // store calling object instance for member-function callback
-  void (*add_point_callback)(void*, Uns32T, Uns32T, float); // The callback
+  ReporterCallbackPtr add_point_callback; // Pointer to the callback function
 
  public:
   G(char* lshFile, bool lshInCore = false); // unserialize constructor
--- a/query.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/query.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -46,7 +46,7 @@
       if(index_exists(dbName, radius, sequenceLength)){
 	char* indexName = index_get_name(dbName, radius, sequenceLength);
 	lsh = index_allocate(indexName, false);
-	reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp())+1);
+	reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1);
 	delete[] indexName;
       }
       else
@@ -62,7 +62,7 @@
       if(index_exists(dbName, radius, sequenceLength)){
 	char* indexName = index_get_name(dbName, radius, sequenceLength);
 	lsh = index_allocate(indexName, false);
-	reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp())+1);
+	reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1);
 	delete[] indexName;
       }
       else
@@ -220,7 +220,7 @@
   }
 }
 
-void audioDB::read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p) {
+void audioDB::read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p) {
   if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) {
     if(*data_buffer_p) {
       free(*data_buffer_p);
@@ -235,7 +235,7 @@
     }
   }
 
-  read(dbfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim);
+  read(trkfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim);
 }
 
 // These names deserve some unpicking.  The names starting with a "q"
@@ -341,48 +341,70 @@
   
   VERB_LOG(1, "performing norms... ");
 
-  // Read query feature vectors from database
-  *qp = NULL;
-  lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET);
-  size_t allocatedSize = 0;
-  read_data(queryIndex, qp, &allocatedSize);
-  // Consistency check on allocated memory and query feature size
-  if(*nvp*sizeof(double)*dbH->dim != allocatedSize)
-    error("Query memory allocation failed consitency check","set_up_query_from_key");
-
-  Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors
-  // Copy L2 norm partial-sum coefficients
-  assert(*qnp = new double[*nvp]);
-  memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double));
-  sequence_sum(*qnp, *nvp, sequenceLength);
-  sequence_sqrt(*qnp, *nvp, sequenceLength);
-
-  if( usingPower ){
-    // Copy Power partial-sum coefficients
-    assert(*qpp = new double[*nvp]);
-    memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double));
-    sequence_sum(*qpp, *nvp, sequenceLength);
-    sequence_average(*qpp, *nvp, sequenceLength);
+  // For LARGE_ADB load query features from file
+  if( dbH->flags & O2_FLAG_LARGE_ADB ){
+    if(infid>0)
+      close(infid);
+    char* prefixedString = new char[O2_MAXFILESTR];
+    char* tmpStr = prefixedString;
+    strncpy(prefixedString, featureFileNameTable+queryIndex*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
+    prefix_name(&prefixedString, adb_feature_root);
+    if(tmpStr!=prefixedString)
+      delete[] tmpStr;
+    initInputFile(prefixedString, false); // nommap, file pointer at correct position
+    size_t allocatedSize = 0;
+    read_data(infid, queryIndex, qp, &allocatedSize); // over-writes qp and allocatedSize
+    // Consistency check on allocated memory and query feature size
+    if(*nvp*sizeof(double)*dbH->dim != allocatedSize)
+      error("Query memory allocation failed consitency check","set_up_query_from_key");
+    // Allocate and calculate auxiliary sequences: l2norm and power
+    init_track_aux_data(queryIndex, *qp, qnp, vqnp, qpp, vqpp);
+  }
+  else{ // Load from self-contained ADB database
+    // Read query feature vectors from database
+    *qp = NULL;
+    lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET);
+    size_t allocatedSize = 0;
+    read_data(dbfid, queryIndex, qp, &allocatedSize);
+    // Consistency check on allocated memory and query feature size
+    if(*nvp*sizeof(double)*dbH->dim != allocatedSize)
+      error("Query memory allocation failed consitency check","set_up_query_from_key");
+    
+    Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors
+    // Copy L2 norm partial-sum coefficients
+    assert(*qnp = new double[*nvp]);
+    memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double));
+    sequence_sum(*qnp, *nvp, sequenceLength);
+    sequence_sqrt(*qnp, *nvp, sequenceLength);
+    
+    if( usingPower ){
+      // Copy Power partial-sum coefficients
+      assert(*qpp = new double[*nvp]);
+      memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double));
+      sequence_sum(*qpp, *nvp, sequenceLength);
+      sequence_average(*qpp, *nvp, sequenceLength);
+    }
+    
+    if (usingTimes) {
+      unsigned int k;
+      *mqdp = 0.0;
+      double *querydurs = new double[*nvp];
+      double *timesdata = new double[*nvp*2];
+      assert(querydurs && timesdata);
+      memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double));    
+      for(k = 0; k < *nvp; k++) {
+	querydurs[k] = timesdata[2*k+1] - timesdata[2*k];
+	*mqdp += querydurs[k];
+      }
+      *mqdp /= k;
+      
+      VERB_LOG(1, "mean query file duration: %f\n", *mqdp);
+      
+      delete [] querydurs;
+      delete [] timesdata;
+    }
   }
 
-  if (usingTimes) {
-    unsigned int k;
-    *mqdp = 0.0;
-    double *querydurs = new double[*nvp];
-    double *timesdata = new double[*nvp*2];
-    assert(querydurs && timesdata);
-    memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double));    
-    for(k = 0; k < *nvp; k++) {
-      querydurs[k] = timesdata[2*k+1] - timesdata[2*k];
-      *mqdp += querydurs[k];
-    }
-    *mqdp /= k;
-    
-    VERB_LOG(1, "mean query file duration: %f\n", *mqdp);
-    
-    delete [] querydurs;
-    delete [] timesdata;
-  }
   // Defaults, for exhaustive search (!usingQueryPoint)
   *vqp = *qp;
   *vqnp = *qnp;
@@ -487,7 +509,8 @@
   // Compute database info
   // FIXME: we more than likely don't need very much of the database
   // so make a new method to build these values per-track or, even better, per-point
-  set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors);
+  if( !( dbH->flags & O2_FLAG_LARGE_ADB) )
+    set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors);
 
   VERB_LOG(1, "matching points...");
 
@@ -495,48 +518,82 @@
   assert(trackNN>0 && trackNN<=O2_MAXNN);
 
   // We are guaranteed that the order of points is sorted by:
-  // qpos, trackID, spos
+  // trackID, spos, qpos
   // so we can be relatively efficient in initialization of track data.
   // Here we assume that points don't overlap, so we will use exhaustive dot
-  // product evaluation over the sequence
+  // product evaluation instead of memoization of partial sums which is used
+  // for exhaustive brute-force evaluation from smaller databases: e.g. query_loop()
   double dist;
   size_t data_buffer_size = 0;
   double *data_buffer = 0;
-  Uns32T trackOffset;
-  Uns32T trackIndexOffset;
+  Uns32T trackOffset = 0;
+  Uns32T trackIndexOffset = 0;
   Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range
   Uns32T npairs = exact_evaluation_queue->size();
   while(npairs--){
     PointPair pp = exact_evaluation_queue->top();
-    trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset
-    trackIndexOffset=trackOffset/dbH->dim;    // num vectors offset
-    if((!(usingPower) || powers_acceptable(qpPtr[usingQueryPoint?0:pp.qpos], sPower[trackIndexOffset+pp.spos])) &&
-       ((usingQueryPoint?0:pp.qpos) < numVectors-sequenceLength+1 && pp.spos < trackTable[pp.trackID]-sequenceLength+1)){
+    // Large ADB track data must be loaded here for sPower
+    if(dbH->flags & O2_FLAG_LARGE_ADB){
+      trackOffset=0;
+      trackIndexOffset=0;
       if(currentTrack!=pp.trackID){
+	char* prefixedString = new char[O2_MAXFILESTR];
+	char* tmpStr = prefixedString;
+	// On currentTrack change, allocate and load track data
 	currentTrack=pp.trackID;
-        lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET);
-	read_data(currentTrack, &data_buffer, &data_buffer_size);
+	SAFE_DELETE_ARRAY(sNorm);
+	SAFE_DELETE_ARRAY(sPower);
+	if(infid>0)
+	  close(infid);
+	// Open and check dimensions of feature file
+	strncpy(prefixedString, featureFileNameTable+pp.trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
+	prefix_name((char ** const) &prefixedString, adb_feature_root);
+	if (prefixedString!=tmpStr)
+	  delete[] tmpStr;
+	initInputFile(prefixedString, false); // nommap, file pointer at correct position
+	// Load the feature vector data for current track into data_buffer
+	read_data(infid, pp.trackID, &data_buffer, &data_buffer_size);	
+	// Load power and calculate power and l2norm sequence sums
+	init_track_aux_data(pp.trackID, data_buffer, &sNorm, &snPtr, &sPower, &spPtr);
       }
-      dist = dot_product_points(query+(usingQueryPoint?0:pp.qpos*dbH->dim), data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength);
+    }
+    else{
+      // These offsets are w.r.t. the entire database of feature vectors and auxiliary variables
+      trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset
+      trackIndexOffset=trackOffset/dbH->dim;    // num vectors offset
+    }    
+    Uns32T qPos = usingQueryPoint?0:pp.qpos;// index for query point
+    Uns32T sPos = trackIndexOffset+pp.spos; // index into l2norm table
+    // Test power thresholds before computing distance
+    if( ( !usingPower || powers_acceptable(qpPtr[qPos], sPower[sPos])) &&
+	( qPos<numVectors-sequenceLength+1 && pp.spos<trackTable[pp.trackID]-sequenceLength+1 ) ){
+      // Non-large ADB track data is loaded inside power test for efficiency
+      if( !(dbH->flags & O2_FLAG_LARGE_ADB) && (currentTrack!=pp.trackID) ){
+	// On currentTrack change, allocate and load track data
+	currentTrack=pp.trackID;
+	lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET);
+	read_data(dbfid, currentTrack, &data_buffer, &data_buffer_size);
+      }
+      // Compute distance
+      dist = dot_product_points(query+qPos*dbH->dim, data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength);
+      double qn = qnPtr[qPos];
+      double sn = sNorm[sPos];
       if(normalizedDistance) 
-	dist = 2-(2/(qnPtr[usingQueryPoint?0:pp.qpos]*sNorm[trackIndexOffset+pp.spos]))*dist;
+	dist = 2 - (2/(qn*sn))*dist;
       else 
 	if(no_unit_norming)
-	  dist = qnPtr[usingQueryPoint?0:pp.qpos]*qnPtr[usingQueryPoint?0:pp.qpos]+sNorm[trackIndexOffset+pp.spos]*sNorm[trackIndexOffset+pp.spos] - 2*dist;
+	  dist = qn*qn + sn*sn - 2*dist;
       // else
       // dist = dist;      
       if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) 
-	reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist);
+	reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist);    
     }
     exact_evaluation_queue->pop();
   }
   // Cleanup
-  if(sNorm)
-    delete sNorm;
-  if(sPower)
-    delete sPower;
-  if(meanDBdur)
-    delete meanDBdur;
+  SAFE_DELETE_ARRAY(sNorm);
+  SAFE_DELETE_ARRAY(sPower);
+  SAFE_DELETE_ARRAY(meanDBdur);
 }
 
 // A completely unprotected dot-product method
@@ -555,6 +612,9 @@
   double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0;
   double meanQdur;
 
+  if( dbH->flags & O2_FLAG_LARGE_ADB )
+    error("error: LARGE_ADB requires indexed query");
+
   if(query_from_key)
     set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
   else
@@ -618,7 +678,7 @@
 
     trackIndexOffset=trackOffset/dbH->dim; // numVectors offset
 
-    read_data(track, &data_buffer, &data_buffer_size);
+    read_data(dbfid, track, &data_buffer, &data_buffer_size);
     if(sequenceLength <= trackTable[track]) {  // test for short sequences
       
       VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]);
--- a/reporter.h	Tue Aug 12 14:25:51 2008 +0000
+++ b/reporter.h	Thu Aug 21 21:28:33 2008 +0000
@@ -292,6 +292,7 @@
   }
   std::vector<NNresult>::reverse_iterator rit;
   std::priority_queue< NNresult, std::vector< NNresult>, std::greater<NNresult> > point_queue;      
+  NNresult rk;
 
   if(adbQueryResponse==0) {
     for(rit = v.rbegin(); rit < v.rend(); rit++) {
@@ -309,31 +310,57 @@
       }
       
       for(unsigned int k = 0; k < qsize; k++) {
-	NNresult rk = point_queue.top();
+	rk = point_queue.top();
 	std::cout << rk.dist << " " << rk.qpos << " " << rk.spos << std::endl;
 	point_queue.pop();
       }
     }
   } else {
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size];
-    ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size];
-    ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size];
-    ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size];
+   ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size*pointNN];
+    ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size*pointNN];
+    ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size*pointNN];
+    ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size*pointNN];
     unsigned int k = 0;
-    for(rit = v.rbegin(); rit < v.rend(); rit++, k++) {
+    // Loop over returned tracks
+    for(rit = v.rbegin(); rit < v.rend(); rit++) {
       r = *rit;
-      ((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR];
-      ((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = r.dist;
-      ((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = r.qpos;
-      ((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = r.spos;
-      if(fileTable)
-	snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE);
-      else
-	snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID);
+      // Reverse the order of the points stored in point_queues
+      unsigned int qsize=point_queues[r.trackID].size();
+      while(qsize--){
+	point_queue.push(point_queues[r.trackID].top());
+	point_queues[r.trackID].pop();
+      }
+      qsize=point_queue.size();
+      unsigned int numReports = pointNN;
+      while(numReports--){ // emit exactly pointNN reports per track, padding with sentinel entries when the queue runs out
+	if(qsize)
+	  rk = point_queue.top(); // Take one point from the top of the queue
+	else{
+	  rk.dist = 1000000000.0;
+	  rk.qpos = 0xFFFFFFFF;
+	  rk.spos = 0xFFFFFFFF;
+	}
+	  
+	((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR];
+	((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = rk.dist;
+	((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = rk.qpos;
+	((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = rk.spos;
+	if(qsize){
+	  if(fileTable)
+	    snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE);
+	  else
+	    snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID);	
+	  point_queue.pop();
+	  qsize--;
+	}
+	else
+	  snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "NULL");		  
+	k++;
+      }
     }
   }
   // clean up
@@ -641,17 +668,17 @@
     }
   }
  else {
-   ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size;
-    ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size];
-    ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size];
-    ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size];
-    ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size];
+   ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size*pointNN;
+    ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size*pointNN];
+    ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size*pointNN];
+    ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size*pointNN];
+    ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size*pointNN];
     unsigned int k = 0;
     // Loop over returned tracks
-    for(rit = v.rbegin(); rit < v.rend(); rit++, k++) {
+    for(rit = v.rbegin(); rit < v.rend(); rit++) {
       r = *rit;
       // Reverse the order of the points stored in point_queues
       unsigned int qsize=point_queues[r.trackID].size();
@@ -660,17 +687,32 @@
 	point_queues[r.trackID].pop();
       }
       qsize=point_queue.size();
-      rk = point_queue.top(); // Take one point from the top of the queue
-      ((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR];
-      ((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = rk.dist;
-      ((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = rk.qpos;
-      ((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = rk.spos;
-      if(fileTable)
-	snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE);
-      else
-	snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID);
-      while(qsize--) // pop the rest of the points
-	point_queue.pop();
+      unsigned int numReports = pointNN;
+      while(numReports--){ // emit exactly pointNN reports per track, padding with sentinel entries when the queue runs out
+	if(qsize)
+	  rk = point_queue.top(); // Take one point from the top of the queue
+	else{
+	  rk.dist = 1000000000.0;
+	  rk.qpos = 0xFFFFFFFF;
+	  rk.spos = 0xFFFFFFFF;
+	}
+	  
+	((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR];
+	((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = rk.dist;
+	((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = rk.qpos;
+	((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = rk.spos;
+	if(qsize){
+	  if(fileTable)
+	    snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE);
+	  else
+	    snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID);	
+	  point_queue.pop();
+	  qsize--;
+	}
+	else
+	  snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "NULL");
+	k++;
+      }
     }
  }
   delete[] point_queues;
--- a/sample.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/sample.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -56,7 +56,10 @@
 
 void audioDB::sample(const char *dbName) {
   initTables(dbName, 0);
-
+  if(dbH->flags & O2_FLAG_LARGE_ADB){
+    error("error: sample not yet supported for LARGE_ADB");
+  }
+    
   // build track offset table (FIXME: cut'n'pasted from query.cpp)
   off_t *trackOffsetTable = new off_t[dbH->numFiles];
   unsigned cumTrack=0;
--- a/soap.cpp	Tue Aug 12 14:25:51 2008 +0000
+++ b/soap.cpp	Thu Aug 21 21:28:33 2008 +0000
@@ -18,7 +18,7 @@
     std::cout << "length = " << adbStatusResponse.result.length << std::endl;
     std::cout << "dudCount = " << adbStatusResponse.result.dudCount << std::endl;
     std::cout << "nullCount = " << adbStatusResponse.result.nullCount << std::endl;
-    std::cout << "flags = " << adbStatusResponse.result.flags << std::endl;
+    std::cout << "flags = " << (adbStatusResponse.result.flags & 0x00FFFFFF) << std::endl;
   } else {
     soap_print_fault(&soap,stderr);
   }
@@ -126,8 +126,8 @@
     strncpy(queryType, "sequence", strlen("sequence"));
   else if(qType == O2_TRACK_QUERY)
     strncpy(queryType,"track", strlen("track"));
-  else
-    strncpy(queryType, "", strlen(""));
+  else if(qType == O2_N_SEQUENCE_QUERY)
+    strncpy(queryType,"nsequence", strlen("nsequence"));
 
   if(pointNN==0)
     pointNN=10;
@@ -285,6 +285,12 @@
 	fflush(stderr);
 	delete[] indexName;
       }
+      
+      // Server-side path prefix to databases and features
+      if(adb_root)
+	SERVER_ADB_ROOT = (char*)adb_root; // Server-side database root
+      if(adb_feature_root)
+	SERVER_ADB_FEATURE_ROOT = (char*)adb_feature_root; // Server-side features root
 
       for (int i = 1; ; i++)
 	{