changeset 534:57e459f62788

Removed LSH_N_POINT_BITS coding for LSH index. Now uses binary search via STL lower_bound to locate tracks and positions from global pointID by searching over cumulative track lengths. MERGE of branches/multiprobeLSH -r 819:821 onto trunk. This is a non-backward-compatible change; WARNING generated on attempt to use INDEXING with older audioDB databases. Only INDEXES are broken, not ADB instances.
author mas01mc
date Wed, 04 Feb 2009 10:45:57 +0000
parents c7bdb7913762
children 77e63d5c6de0
files Makefile audioDB-internals.h audioDB.h create.cpp index.cpp lshlib.cpp query-indexed.cpp
diffstat 7 files changed, 36 insertions(+), 51 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sat Jan 24 09:39:39 2009 +0000
+++ b/Makefile	Wed Feb 04 10:45:57 2009 +0000
@@ -17,7 +17,10 @@
 MINORVERSION=0
 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION)
 
-override CFLAGS+=-O3 -g -fPIC 
+override CFLAGS+=-g -O3 -fPIC 
+
+# set to generate profile (gprof) and coverage (gcov) info
+#override CFLAGS+=-fprofile-arcs -ftest-coverage -pg
 
 # set to DUMP hashtables on QUERY load
 #override CFLAGS+=-DLSH_DUMP_CORE_TABLES
--- a/audioDB-internals.h	Sat Jan 24 09:39:39 2009 +0000
+++ b/audioDB-internals.h	Wed Feb 04 10:45:57 2009 +0000
@@ -270,26 +270,25 @@
   return (*adb->keys)[index].c_str();
 }
 
-static inline uint32_t audiodb_index_to_track_id(uint32_t lshid, uint32_t n_point_bits) {
-  return (lshid >> n_point_bits);
+static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){
+  std::vector<off_t>::iterator it_b = (*adb->track_offsets).begin();
+  std::vector<off_t>::iterator it_e = (*adb->track_offsets).end();  
+  off_t test_id = lshid*adb->header->dim*sizeof(double);
+  std::vector<off_t>::iterator point_p = std::lower_bound(it_b, it_e, test_id);
+  if(*point_p == test_id)
+    return point_p - it_b; // lshid is first point in found track  
+  else    
+    return point_p - it_b - 1; // lshid is a point in the previous track  
 }
 
-static inline uint32_t audiodb_index_to_track_pos(uint32_t lshid, uint32_t n_point_bits) {
-  return (lshid & ((1 << n_point_bits) - 1));
+static inline uint32_t audiodb_index_to_track_pos(adb_t *adb, uint32_t track_id, uint32_t lshid) {
+  uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
+  return lshid - trackIndexOffset;
 }
 
-static inline uint32_t audiodb_index_from_trackinfo(uint32_t track_id, uint32_t track_pos, uint32_t n_point_bits) {
-  return ((track_id << n_point_bits) | track_pos);
-}
-
-#define ADB_FIXME_DEFAULT_LSH_N_POINT_BITS 14
-#ifndef ADB_FIXME_LSH_N_POINT_BITS
-#define ADB_FIXME_LSH_N_POINT_BITS ADB_FIXME_DEFAULT_LSH_N_POINT_BITS
-#endif
-
-static inline uint32_t audiodb_lsh_n_point_bits(adb_t *adb) {
-  uint32_t nbits = adb->header->flags >> 28;
-  return (nbits ? nbits : ADB_FIXME_LSH_N_POINT_BITS);
+static inline uint32_t audiodb_index_from_trackinfo(adb_t *adb, uint32_t track_id, uint32_t track_pos) {
+  uint32_t trackIndexOffset = (*adb->track_offsets)[track_id] / (adb->header->dim * sizeof(double));
+  return trackIndexOffset + track_pos;
 }
 
 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
@@ -325,8 +324,6 @@
 #define ADB_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24)
 #define ADB_FORMAT_VERSION (4U)
 
-#define ADB_LSH_MAXTRACKLEN (1 << ADB_FIXME_LSH_N_POINT_BITS)
-
 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1))
 #define align_down(x,w) ((x) & ~((1<<w)-1))
 
--- a/audioDB.h	Sat Jan 24 09:39:39 2009 +0000
+++ b/audioDB.h	Wed Feb 04 10:45:57 2009 +0000
@@ -92,7 +92,7 @@
 #define O2_MAXNN (1000000U)
 #define O2_MAXSEQLEN (8000U)            // maximum feature vectors in a sequence
 #define O2_MAXTRACKS (1000000U)           // maximum number of tracks
-#define O2_MAXTRACKLEN ADB_LSH_MAXTRACKLEN
+
 #define O2_MAXDOTPRODUCTMEMORY (sizeof(O2_REALTYPE)*O2_MAXSEQLEN*O2_MAXSEQLEN) // 512MB
 #define O2_SERIAL_MAX_TRACKBATCH (1000000)
 #define O2_LARGE_ADB_SIZE (O2_DEFAULT_DATASIZE+1) // datasize at which features are kept externally (in Mbytes)
--- a/create.cpp	Sat Jan 24 09:39:39 2009 +0000
+++ b/create.cpp	Wed Feb 04 10:45:57 2009 +0000
@@ -73,15 +73,6 @@
   databytes = ((off_t) datasize) * 1024 * 1024;
   auxbytes = databytes / datadim;
 
-  // For backward-compatibility, Record the point-encoding parameter for LSH indexing in the adb header
-  // If this value is 0 then it will be set to 14
-
-#if ADB_FIXME_LSH_N_POINT_BITS > 15
-#error "consistency check of ADB_FIXME_LSH_N_POINT_BITS failed (>31)"
-#endif
-
-  header->flags |= ADB_FIXME_LSH_N_POINT_BITS << 28;
-
   // If database will fit in a single file the vectors are copied into the AudioDB instance
   // Else all the vectors are left on the FileSystem and we use the dataOffset as storage
   // for the location of the features, powers and times files (assuming that arbitrary keys are used for the fileTable)
--- a/index.cpp	Sat Jan 24 09:39:39 2009 +0000
+++ b/index.cpp	Wed Feb 04 10:45:57 2009 +0000
@@ -128,7 +128,7 @@
     // Get the lsh header info and find how many tracks are inserted already
     lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
     assert(lsh);
-    Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
+    Uns32T maxs = audiodb_index_to_track_id(adb, lsh->get_maxp())+1;
     delete lsh;
     lsh = 0;
 
@@ -295,19 +295,13 @@
 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
   // Loop over the current input track's vectors
   Uns32T numVecs = 0;
-  if (trackTable[trackID] > O2_MAXTRACKLEN) {
-    if (O2_MAXTRACKLEN < sequenceLength - 1) {
-      numVecs = 0;
-    } else {
-      numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
-    }
+  
+  if (trackTable[trackID] < sequenceLength - 1) {
+    numVecs = 0;
   } else {
-    if (trackTable[trackID] < sequenceLength - 1) {
-      numVecs = 0;
-    } else {
-      numVecs = trackTable[trackID] - sequenceLength + 1;
-    }
+    numVecs = trackTable[trackID] - sequenceLength + 1;
   }
+
   
   Uns32T numVecsAboveThreshold = 0, collisionCount = 0; 
   if(numVecs){
@@ -351,7 +345,7 @@
   cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
   for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
     if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
-      collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
+      collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(adb, trackID, pointID));
     spp+=sequenceHop;
     }
   return collisionCount;
--- a/lshlib.cpp	Sat Jan 24 09:39:39 2009 +0000
+++ b/lshlib.cpp	Wed Feb 04 10:45:57 2009 +0000
@@ -138,7 +138,7 @@
     }
   }
 
-  // Storage for whole or partial function evaluation depdenting on USE_U_FUNCTIONS
+  // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS
   H::initialize_partial_functions();
 }
 
--- a/query-indexed.cpp	Sat Jan 24 09:39:39 2009 +0000
+++ b/query-indexed.cpp	Wed Feb 04 10:45:57 2009 +0000
@@ -44,9 +44,8 @@
   adb_qcallback_t *data = (adb_qcallback_t *) user_data;
   adb_t *adb = data->adb;
   adb_qstate_internal_t *qstate = data->qstate;
-  uint32_t nbits = audiodb_lsh_n_point_bits(adb);
-  uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
-  uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
+  uint32_t trackID = audiodb_index_to_track_id(adb, pointID);
+  uint32_t spos = audiodb_index_to_track_pos(adb, trackID, pointID);
   std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
   if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
     adb_result_t r;
@@ -64,9 +63,8 @@
   adb_qcallback_t *data = (adb_qcallback_t *) user_data;
   adb_t *adb = data->adb;
   adb_qstate_internal_t *qstate = data->qstate;
-  uint32_t nbits = audiodb_lsh_n_point_bits(adb);
-  uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
-  uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
+  uint32_t trackID = audiodb_index_to_track_id(adb, pointID);
+  uint32_t spos = audiodb_index_to_track_pos(adb, trackID, pointID);
   std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
   if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
     PointPair p(trackID, qpos, spos);
@@ -78,7 +76,9 @@
 // return 0: if index does not exist
 // return nqv: if index exists
 int audiodb_index_query_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
-  
+  if(adb->header->flags>>28)
+    cerr << "WARNING: Database created using deprecated LSH_N_POINT_BITS coding: REBUILD INDEXES..." << endl;
+
   double *query = 0, *query_data = 0;
   adb_qpointers_internal_t qpointers = {0};
   
@@ -120,7 +120,7 @@
     return -1;
   }
 
-  uint32_t Nq = (qpointers.nvectors > ADB_LSH_MAXTRACKLEN ? ADB_LSH_MAXTRACKLEN : qpointers.nvectors) - sequence_length + 1;
+  uint32_t Nq = qpointers.nvectors - sequence_length + 1;
   std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequence_length);
 
   // Construct shingles from query features