diff index.cpp @ 319:b9eff6896943 large_adb

Added indexing support for O2_FLAG_LARGE_ADB. Tested on indexed query by features. No indexed query-by-key yet. No --lsh_exact yet.
author mas01mc
date Tue, 19 Aug 2008 20:27:15 +0000
parents cac5b3465318
children a995e5ad999a
line wrap: on
line diff
--- a/index.cpp	Tue Aug 19 15:50:26 2008 +0000
+++ b/index.cpp	Tue Aug 19 20:27:15 2008 +0000
@@ -8,22 +8,27 @@
 //
 // Author: Michael Casey
 //   Date: 23 June 2008
+//
+// 19th August 2008 - added O2_FLAG_LARGE_ADB support
 
 #include "audioDB.h"
 #include "ReporterBase.h"
 
 
 /************************* LSH point index to audioDB conversion  *****************/
-Uns32T audioDB::index_to_trackID(Uns32T lshID){
-  return lshID>>LSH_N_POINT_BITS;
+Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){ // track ID = the bits of lshID above the low nPntBits bits
+  assert(nPntBits); // a zero point-bit width is rejected when assertions are enabled
+  return lshID>>nPntBits;
 }
 
-Uns32T audioDB::index_to_trackPos(Uns32T lshID){
-  return lshID&LSH_POINT_MASK;
+Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){ // track position = the low nPntBits bits of lshID
+  assert(nPntBits); // a zero point-bit width is rejected when assertions are enabled
+  return lshID&((1U<<nPntBits)-1); // unsigned literal: a signed 1<<31 would overflow int while building the mask
 }
 
-Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){
-  return (trackID << LSH_N_POINT_BITS) | spos;
+Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){ // packs trackID above the low nPntBits bits and spos into them
+  assert(nPntBits); // a zero point-bit width is rejected when assertions are enabled
+  return (trackID << nPntBits) | spos; // NOTE(review): spos is not masked; presumably callers keep it below 1<<nPntBits - confirm
 }
 
 /************************* LSH indexing and query initialization  *****************/
@@ -78,19 +83,20 @@
 
 // Prepare the AudioDB database for read access and allocate auxillary memory
 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
+  if (!(dbH->flags & O2_FLAG_POWER)) {
+    error("INDEXed database must be power-enabled", dbName);
+  }
+
+  double *snpp = 0, *sppp = 0; // per-track cursors; bound only once the tables below have been allocated
+
   *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
   *snp = new double[*dvp];  // songs norm pointer: L2 norm table for each vector
-
-  double *snpp = *snp, *sppp = 0;
-  memcpy(*snp, l2normTable, *dvp * sizeof(double));
-
-  if (!(dbH->flags & O2_FLAG_POWER)) {
-    error("database not power-enabled", dbName);
-  }
   *spp = new double[*dvp]; // song powertable pointer
   sppp = *spp;
+  memcpy(*snp, l2normTable, *dvp * sizeof(double));
   memcpy(*spp, powerTable, *dvp * sizeof(double));
-
+  snpp = *snp; // must follow the allocation: reading *snp earlier would capture the caller's old pointer, not the new table
+  
   for(Uns32T i = 0; i < dbH->numFiles; i++){
     if(trackTable[i] >= sequenceLength) {
       sequence_sum(snpp, trackTable[i], sequenceLength);
@@ -102,10 +108,10 @@
     snpp += trackTable[i];
     sppp += trackTable[i];
   }
-
+  
   *vsnp = *snp;
   *vspp = *spp;
-
+  
   // Move the feature vector read pointer to start of fetures in database
   lseek(dbfid, dbH->dataOffset, SEEK_SET);
 }
@@ -141,8 +147,6 @@
   fflush(stdout);
 
 
-  index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
-  
   if((lshfid = open(newIndexName,O_RDONLY))<0){
     printf("INDEX: constructing new LSH index\n");  
     printf("INDEX: making index file %s\n", newIndexName);
@@ -160,7 +164,12 @@
     if( endTrack > dbH->numFiles)
       endTrack = dbH->numFiles;
     // Insert up to lsh_param_b tracks
-    index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);    
+    if( dbH->flags & O2_FLAG_LARGE_ADB ){
+    }
+    else{
+      index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);  
+      index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
+    }
     lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
     
     // Clean up
@@ -177,7 +186,7 @@
     // Get the lsh header info and find how many tracks are inserted already
     lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here
     assert(lsh);
-    Uns32T maxs = index_to_trackID(lsh->get_maxp())+1;
+    Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
     delete lsh;
     lsh = 0;
 
@@ -221,6 +230,57 @@
 
 }
 
+
+// Initialize auxiliary (power and l2norm) data for one track from the filesystem.
+// pre-conditions:
+//   dbH->flags & O2_FLAG_LARGE_ADB; feature data allocated and copied (fvp)
+// post-conditions (only when trackTable[trackID] >= sequenceLength):
+//   *sPowerp / *spPtrp: new[]-allocated power sequence, summed and averaged
+//   *sNormpp / *snPtrp: new[]-allocated l2norm sequence, summed and square-rooted
+//   the caller releases both arrays
+// Tracks shorter than sequenceLength leave all four output pointers untouched.
+//
+void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){  
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
+    error("error: init_track_aux_data required O2_FLAG_LARGE_ADB");
+
+  // Allocate and read the power sequence
+  if(trackTable[trackID]>=sequenceLength){
+
+    // Open and check dimensions of power file
+    powerfd = open(powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O_RDONLY);
+    if (powerfd < 0) {
+      error("failed to open power file", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE);
+    }
+    if (fstat(powerfd, &statbuf) < 0) {
+      error("fstat error finding size of power file", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, "fstat");
+    }
+    
+    if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) // size minus one int must hold one double per vector
+      error("Dimension mismatch: numPowers != numVectors", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE);
+    
+    *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
+    assert(*sPowerp);
+    *spPtrp = *sPowerp;
+    insertPowerData(trackTable[trackID], powerfd, *sPowerp);
+    if (0 <= powerfd) { // descriptor 0 is a valid open() result and must be closed too
+      close(powerfd);
+    }
+    
+    sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
+    sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
+    powerTable = 0; // NOTE(review): clears the member without freeing it here; presumably unused for LARGE_ADB - confirm
+
+    // Allocate and calculate the l2norm sequence
+    *sNormpp = new double[trackTable[trackID]];
+    assert(*sNormpp);
+    *snPtrp = *sNormpp;
+    unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp);
+    sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
+    sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
+  }
+}
+
 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
 				  double** fvpp, double** sNormpp,double** snPtrp, 
 				  double** sPowerp, double** spPtrp){  
@@ -230,13 +290,29 @@
 
   VERB_LOG(1, "indexing tracks...");
 
-
+  int trackfd = dbfid; // default: read features from the database file itself
   for(trackID = start_track ; trackID < end_track ; trackID++ ){
-    read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv
+    if( dbH->flags & O2_FLAG_LARGE_ADB ){
+      // Open and check dimensions of feature file
+      initInputFile(featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, false); // nommap, file pointer at correct position
+      trackfd = infid;
+    }
+    read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv
     *fvpp = fvp; // Protect memory allocation and free() for track data
+    
+    if( dbH->flags & O2_FLAG_LARGE_ADB )
+      // Load power and calculate power and l2norm sequence sums
+      init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
+    
     if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
       break;    
-  }
+    if ( dbH->flags & O2_FLAG_LARGE_ADB ){ // NOTE(review): the break above skips this per-track cleanup - confirm that is intended
+      close(infid);
+      delete[] *sNormpp; // allocated with new double[] in init_track_aux_data, so the array form is required
+      delete[] *sPowerp; // deleting a null pointer is a no-op, which covers tracks too short to allocate
+      *sNormpp = *sPowerp = *snPtrp = *spPtrp = 0; // clear both owners and both cursors so none dangles into freed memory
+    }
+  } // end for(trackID = start_track ; ... )
   std::cout << "finished inserting." << endl;
 }
 
@@ -256,13 +332,17 @@
       numVecs = trackTable[trackID] - sequenceLength + 1;
     }
   }
-  vv = index_initialize_shingles(numVecs);
-
-  for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
-    index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
   
-  Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
-  Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp);
+  Uns32T numVecsAboveThreshold = 0, collisionCount = 0; 
+  if(numVecs){
+    vv = index_initialize_shingles(numVecs);
+    
+    for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
+      index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
+    
+    numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
+    collisionCount = index_insert_shingles(vv, trackID, *sppp);
+  }
   float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
 
   /* index_norm_shingles() only goes as far as the end of the
@@ -273,9 +353,11 @@
    * So let's be certain the pointers are in the correct place
    */
 
-  *snpp += trackTable[trackID];
-  *sppp += trackTable[trackID];
-  *fvpp += trackTable[trackID] * dbH->dim;
+  if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
+    *snpp += trackTable[trackID];
+    *sppp += trackTable[trackID];
+    *fvpp += trackTable[trackID] * dbH->dim;
+  }
 
   std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
   std::cout.flush();  
@@ -287,7 +369,7 @@
   cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
   for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop)
     if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){
-      collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID));
+      collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
       spp+=sequenceHop;
     }
   return collisionCount;
@@ -393,7 +475,7 @@
     printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
     printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
     printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
-    printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp()));
+    printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
     printf("INDEX: Opened LSH index file %s\n", indexName);
     fflush(stdout);
   }
@@ -415,8 +497,8 @@
 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
   assert(instancePtr); // We need an instance for this callback
   audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
-  Uns32T trackID = index_to_trackID(pointID);
-  Uns32T spos = index_to_trackPos(pointID);
+  Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
+  Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
   // Skip identity in query_from_key
   if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
     myself->reporter->add_point(trackID, qpos, spos, dist);
@@ -427,8 +509,8 @@
 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
   assert(instancePtr); // We need an instance for this callback
   audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance  
-  Uns32T trackID = index_to_trackID(pointID);
-  Uns32T spos = index_to_trackPos(pointID);
+  Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
+  Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
   // Skip identity in query_from_key
   if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
     myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);