changeset 752:1731a6c457d7 fewerQueryDatumReads

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents bf89c80ec4cc
children
files Makefile audioDB-internals.h audioDB.cpp insert.cpp lshlib.cpp lshlib.h query.cpp
diffstat 7 files changed, 191 insertions(+), 54 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sun Feb 08 15:53:57 2009 +0000
+++ b/Makefile	Thu Nov 25 13:42:40 2010 +0000
@@ -17,7 +17,7 @@
 MINORVERSION=0
 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION)
 
-override CFLAGS+=-g -O3 -fPIC 
+override CFLAGS+=-g -ggdb -fPIC 
 
 # set to generate profile (gprof) and coverage (gcov) info
 #override CFLAGS+=-fprofile-arcs -ftest-coverage -pg
--- a/audioDB-internals.h	Sun Feb 08 15:53:57 2009 +0000
+++ b/audioDB-internals.h	Thu Nov 25 13:42:40 2010 +0000
@@ -284,7 +284,7 @@
 }
 
 static inline uint32_t audiodb_index_to_track_id(adb_t *adb, uint32_t lshid){
-  off_t offset = lshid*adb->header->dim*sizeof(double);
+  off_t offset = (off_t)lshid*adb->header->dim*sizeof(double);
   std::vector<off_t>::iterator b = (*adb->track_offsets).begin();
   std::vector<off_t>::iterator e = (*adb->track_offsets).end();  
   std::vector<off_t>::iterator p = std::upper_bound(b, e, offset);
@@ -302,8 +302,8 @@
 }
 
 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
-int audiodb_insert_create_datum_offset(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size, adb_fd_cache_t * cache=NULL);
-int audiodb_track_id_datum_offset(adb_t *, uint32_t , adb_datum_t *, off_t vector_offset, size_t vector_size, adb_fd_cache_t * cache=NULL);
+int audiodb_insert_create_datum_offset(adb_insert_t *, adb_datum_t *, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache=NULL);
+int audiodb_track_id_datum_offset(adb_t *, uint32_t , adb_datum_t *, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache=NULL);
 int audiodb_insert_create_datum(adb_insert_t * insert, adb_datum_t *datum);
 int audiodb_track_id_datum(adb_t * adb, uint32_t track_id, adb_datum_t *datum);
 int audiodb_free_datum(adb_datum_t *);
--- a/audioDB.cpp	Sun Feb 08 15:53:57 2009 +0000
+++ b/audioDB.cpp	Thu Nov 25 13:42:40 2010 +0000
@@ -873,10 +873,7 @@
   qspec.qid.flags = 0;
   qspec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE;
   qspec.qid.flags |= lsh_exact ? 0 : ADB_QID_FLAG_ALLOW_FALSE_POSITIVES;
-  if(!query_from_key && usingQueryPoint) // FIXME: query_from_key uses qspec.qid.sequence_start to locate data
-    qspec.qid.sequence_start = 0;
-  else
-    qspec.qid.sequence_start = queryPoint;
+  qspec.qid.sequence_start = queryPoint;
 
   switch(queryType) {
   case O2_POINT_QUERY:
--- a/insert.cpp	Sun Feb 08 15:53:57 2009 +0000
+++ b/insert.cpp	Thu Nov 25 13:42:40 2010 +0000
@@ -337,7 +337,7 @@
   return audiodb_insert_create_datum_offset(insert, datum, 0, 0, 0);
 }
 
-int audiodb_insert_create_datum_offset(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset, size_t data_size, adb_fd_cache_t *cache) {
+int audiodb_insert_create_datum_offset(adb_insert_t *insert, adb_datum_t *datum, off_t vector_offset, size_t num_vectors, adb_fd_cache_t *cache) {
   int fd = 0;
   FILE *file = NULL;
   struct stat st;
@@ -351,8 +351,9 @@
   }
 
   // STEP 1 check if we need to clear the cache
-  if(cache && (cache->fname && strncmp(cache->fname, insert->features, strlen(insert->features))!=0))
+  if(cache && (cache->fname && strncmp(cache->fname, insert->features, strlen(insert->features))!=0)){
     clear_cache = true;
+  }
 
   // STEP 2. Clear the cache if necessary
   if(cache && clear_cache){
@@ -388,10 +389,12 @@
   }
 
   // STEP 5. Allocate data memory if necessary, read the requested amount of data
-  if(data_size)
-    size = data_size;
-  else
+  if(num_vectors){
+    size = num_vectors*datum->dim*sizeof(double);
+  }
+  else{
     size = st.st_size - sizeof(uint32_t);
+  }
 
   datum->nvectors = size / (sizeof(double) * datum->dim);
 
@@ -403,13 +406,15 @@
     goto error;
   }
   
-  if(data_offset)
-    lseek(fd, sizeof(uint32_t) + data_offset, SEEK_SET);
+  if(vector_offset){
+    lseek(fd, sizeof(uint32_t) + vector_offset*datum->dim*sizeof(double), SEEK_SET);
+  }
   read_or_goto_error(fd, datum->data, size);
 
   // STEP 6. Close the file descriptor, unless we are caching it
-  if(!cache)
+  if(!cache){
     close(fd);
+  }
   fd = 0; // we're done with the data
 
   if(insert->power) {
@@ -422,8 +427,9 @@
     }
 
     // Use the cached file descriptor or open a new file descriptor
-    if (cache && cache->power_fd)
+    if (cache && cache->power_fd){
       fd = cache->power_fd;
+    }
     else if((fd = open(insert->power, O_RDONLY)) == -1) {
       goto error;
     }
@@ -459,7 +465,7 @@
      * I hate C.
      */
 
-    if( (!data_size) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) {
+    if( (!num_vectors) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) {
       goto error;
     }
 
@@ -469,8 +475,9 @@
       if(dim != 1) {
 	goto error;
       }
-      if(cache)
+      if(cache){
 	cache->power_fd = fd;
+      }
     }
 
     // Allocate data memory if necessary, read the requested amount of data
@@ -480,13 +487,15 @@
       goto error;
     }
 
-    if(data_offset)
-      lseek(fd, sizeof(uint32_t) + data_offset/datum->dim, SEEK_SET);
+    if(vector_offset){
+      lseek(fd, sizeof(uint32_t) + vector_offset*sizeof(double), SEEK_SET);
+    }
 
     read_or_goto_error(fd, datum->power, size / datum->dim);
 
-    if(!cache)
+    if(!cache){
       close(fd);
+    }
     fd = 0;
   }
 
@@ -500,31 +509,38 @@
     }
 
     // Use the cached file descriptor or open a new file descriptor and maybe cache
-    if (cache && cache->times_file)
+    if (cache && cache->times_file){
       file = cache->times_file;
+    }
     else{
       if(!(file = fopen(insert->times, "r"))) {
 	goto error;
       }
-      if(cache)
+      if(cache){
         cache->times_file = file;
+      }
     }
     
     // Allocate data memory if necessary, read the requested amount of data
-    if(!datum->times)
+    if(!datum->times){
       datum->times = (double *) malloc(2 * size / datum->dim);
+    }
     if(!datum->times) {
       goto error;
     }
 
     rewind(file);
+
     if(fscanf(file, " %lf", &t) != 1) {
       goto error;
     }
-    if(data_offset)
-      while(data_offset-- != 1 )
-	if(fscanf(file, " %lf", &t) != 1)
+    if(vector_offset){
+      while(vector_offset-- != 1 ){
+	if(fscanf(file, " %lf", &t) != 1){ 
 	  goto error;
+	}
+      }
+    }
     tp = datum->times;
     *tp++ = t;
     for(unsigned int n = 0; n < datum->nvectors - 1; n++) {
--- a/lshlib.cpp	Sun Feb 08 15:53:57 2009 +0000
+++ b/lshlib.cpp	Thu Nov 25 13:42:40 2010 +0000
@@ -1507,7 +1507,7 @@
     if(!pointFound)
       error("State machine error: point", "unserialize_hashtable_row_format2()");    
     H::t2 = H::p; // Copy last found token to t2
-  }  
+  }
   return H::t2; // holds current token
 }
 
@@ -1657,7 +1657,7 @@
  // Retrieval is performed by generating a hash key query_t2 for query point q
  // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry
  // point for retrieve_from_core_hashtable_array
-#define SKIP_BITS (0xC0000000)
+#define SKIP_BITS (0xC0000000U)
 void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){
   Uns32T skip;
   Uns32T t2;
@@ -1674,9 +1674,9 @@
     p2 = *p++;
     skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
     if( t2 == H::t2 ){
-      add_point_callback(calling_instance, p1 ^ (p1 & SKIP_BITS), qpos, radius);
+      add_point_callback(calling_instance, p1 & ~SKIP_BITS, qpos, radius);
       if(skip--){
-	add_point_callback(calling_instance, p2 ^ (p2 & SKIP_BITS), qpos, radius);
+	add_point_callback(calling_instance, p2 & ~SKIP_BITS, qpos, radius);
 	while(skip-- )
 	  add_point_callback(calling_instance, *p++, qpos, radius);
       }
@@ -1710,6 +1710,124 @@
     }
 }
 
+ void G::dump_core_row(Uns32T n){
+   if(!(n<H::N)){
+     printf("ROW OUT OF RANGE:%d (MAX:%d)\n", n, H::N-1);
+     return;
+   }
+   for(Uns32T j = 0 ; j < H::L ; j++ ){
+     bucket* bPtr = h[j][n];
+     if(bPtr){
+       printf("C[%d,%d]", j, n);
+#ifdef LSH_LIST_HEAD_COUNTERS
+       printf("[numBuckets=%d]",bPtr->snext.numBuckets);
+       if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
+	 dump_core_hashtable_array((Uns32T*)(bPtr->next));
+       } 
+       else {
+	 dump_hashtable_row(bPtr->next);
+       }
+#else
+       dump_hashtable_row(bPtr);
+#endif
+       printf("\n");
+     }
+   }
+ }
+
+ void G::dump_disk_row(char* filename, Uns32T n){
+  int dbfid = unserialize_lsh_header(filename);
+  if(dbfid<0){
+    cerr << "Could not read header from " << filename << endl;
+    return;
+  }
+  FILE* dbFile = 0;
+  dbFile = fdopen(dbfid, "rb");
+  if(!dbFile){
+    cerr << "Could not create FILE pointer from file:" << filename << " with fid:" << dbfid << endl;
+    close(dbfid);
+    return;
+  }
+
+  Uns32T x=0,y=0;
+
+  // Seek to hashtable base offset
+  if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
+    fclose(dbFile);
+    error("fSeek error in unserialize_lsh_hashtables_format2");
+  }
+  Uns32T token = 0;
+  Uns32T pointID;
+
+  // Read the hash tables into core (structure is given in header) 
+  while( x < H::L){
+    y=0;
+    if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
+      fclose(dbFile);
+      error("Read error T1","unserialize_lsh_hashtables_format2()");
+    }
+    while(token != O2_SERIAL_TOKEN_ENDTABLE){
+      if(token == O2_SERIAL_TOKEN_T1){
+	if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
+	  fclose(dbFile);
+	  error("Read error t1","unserialize_lsh_hashtables_format2()");
+	    }
+	y=token;
+	if(y==n){
+	  printf("D[%d,%d]", x, y);
+	  if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
+	    fclose(dbFile);
+	    error("Read error 2","unserialize_lsh_hashtables_format2()");
+	  }
+	  printf("[numElements=%d]", token);
+	  if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
+	    fclose(dbFile);
+	    error("Read error 3","unserialize_lsh_hashtables_format2()");
+	  }
+	  while(!(token==O2_SERIAL_TOKEN_ENDTABLE || token==O2_SERIAL_TOKEN_T1)){
+	    // Check for T2 token
+	    if(token!=O2_SERIAL_TOKEN_T2){
+	      printf("t2=%d",token);
+	      fclose(dbFile);
+	      error("State machine error T2 token", "unserialize_hashtable_row_format2()");
+	    }
+	    // Read t2 value
+	    if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
+	      fclose(dbFile);
+	      error("Read error t2","unserialize_hashtable_row_format2");
+	    }
+	    if(fread(&pointID, sizeof(Uns32T), 1, dbFile) != 1){
+	      fclose(dbFile);
+	      error("Read error pointID","unserialize_hashtable_row_format2");
+	    }
+	    while(!(pointID==O2_SERIAL_TOKEN_ENDTABLE || pointID==O2_SERIAL_TOKEN_T1 || pointID==O2_SERIAL_TOKEN_T2 )){
+	      printf("(%0X,%u)", token, pointID);
+	      if(fread(&pointID, sizeof(Uns32T), 1, dbFile) != 1){
+		fclose(dbFile);
+		error("Read error H::p","unserialize_hashtable_row_format2");
+	      }
+	    }
+	    token = pointID; // Copy last found token
+	  }
+	  printf("\n");
+	}
+	else{ // gobble up rest of row
+	  while(!(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE)){
+	    if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
+	      fclose(dbFile);
+	      error("Read error 4","unserialize_lsh_hashtables_format2()");
+	    }
+	  }
+	}
+      }
+    }
+    if(token==O2_SERIAL_TOKEN_ENDTABLE){
+      x++;
+    }
+  }
+  close(dbfid);
+ }
+
  void G::dump_core_hashtable_array(Uns32T* p){
    Uns32T skip;
    Uns32T t2;
@@ -1721,11 +1839,11 @@
     p1 = *p++;
     p2 = *p++;
     skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
-    printf("(%0x, %0x)", t2, p1 ^ (p1 & SKIP_BITS));
+    printf("(%0X, %u)", t2, p1 & ~SKIP_BITS);
     if(skip--){
-      printf("(%0x, %0x)", t2, p2 ^ (p2 & SKIP_BITS));
+      printf("(%0X, %u)", t2, p2 & ~SKIP_BITS);
       while(skip-- )
-	printf("(%0x, %0x)", t2, *p++);
+	printf("(%0X, %u)", t2, *p++);
     }
   }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
  }
--- a/lshlib.h	Sun Feb 08 15:53:57 2009 +0000
+++ b/lshlib.h	Thu Nov 25 13:42:40 2010 +0000
@@ -393,7 +393,8 @@
   float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; }
   char* get_indexName(){return indexName;}
   void dump_hashtables();
-
+  void dump_core_row(Uns32T n);
+  void dump_disk_row(char*, Uns32T n);
 };
 
 typedef class G LSH;
--- a/query.cpp	Sun Feb 08 15:53:57 2009 +0000
+++ b/query.cpp	Thu Nov 25 13:42:40 2010 +0000
@@ -203,31 +203,35 @@
 }
 
 int audiodb_track_id_datum_offset(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset, size_t num_vectors, adb_fd_cache_t* cache){
-  off_t track_offset = (*adb->track_offsets)[track_id];
-  
   if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
     /* create a reference/insert, then use adb_insert_create_datum() */
     adb_reference_t *reference = NULL;
     if(! (cache && cache->reference) ){
       reference = (adb_reference_t *) malloc(sizeof(adb_reference_t));
       reference->features = (char*) malloc(ADB_MAXSTR*sizeof(char));
-      if(adb->header->flags & ADB_HEADER_FLAG_POWER) 
+      if(adb->header->flags & ADB_HEADER_FLAG_POWER) {
 	reference->power = (char*) malloc(ADB_MAXSTR*sizeof(char));
-      else
+      }
+      else{
 	reference->power = NULL;
-      if(adb->header->flags & ADB_HEADER_FLAG_TIMES) 
+      }
+      if(adb->header->flags & ADB_HEADER_FLAG_TIMES){
 	reference->times = (char*)malloc(ADB_MAXSTR*sizeof(char));
-      else
+      }
+      else{
 	reference->times = NULL;
-      if(cache)
+      }
+      if(cache){
 	cache->reference = reference;
+      }
     }
-    else
+    else{
       reference = cache->reference;
-
+    }
     if(! (cache && cache->track_id==track_id) ){
-      if(cache)
+      if(cache){
 	cache->track_id = track_id;
+      }
       lseek(adb->fd, adb->header->dataOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
       read_or_goto_error(adb->fd, (void *)reference->features, ADB_MAXSTR);
       if(adb->header->flags & ADB_HEADER_FLAG_POWER) {
@@ -239,14 +243,15 @@
 	read_or_goto_error(adb->fd, (void *)reference->times, ADB_MAXSTR);
       }
     }
-
-    int retval = audiodb_insert_create_datum_offset(reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double), cache);
+    int retval = audiodb_insert_create_datum_offset(reference, d, vector_offset, num_vectors, cache);
     if(!cache){
       audiodb_free_datum_reference(reference);
       free(reference);      
     }
     return retval;
-  } else {
+  } 
+  else {
+    off_t track_offset = (*adb->track_offsets)[track_id];
     /* initialize from sources of data that we already have */
     if(num_vectors)
       d->nvectors = num_vectors;
@@ -353,7 +358,7 @@
 
   datum = spec->qid.datum;
   sequence_length = spec->qid.sequence_length;
-  sequence_start = spec->qid.sequence_start;
+  sequence_start = (spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) ? spec->qid.sequence_start : 0;
     
   if(datum->data) {
     if(datum->dim != adb->header->dim) {
@@ -386,11 +391,11 @@
   if(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) {
     /* the qpointers are already at the start, and so correct. */
   } else {
-    /* adjust the qpointers to point to the correct place in the sequence */
-    *vector = *vector_data + spec->qid.sequence_start * d.dim;
-    qpointers->l2norm = qpointers->l2norm_data + spec->qid.sequence_start;
+    /* qpointers are already at the start of the extracted sequence */
+    *vector = *vector_data; 
+    qpointers->l2norm = qpointers->l2norm_data;
     if(d.power) {
-      qpointers->power = qpointers->power_data + spec->qid.sequence_start;
+      qpointers->power = qpointers->power_data;
     }
     qpointers->nvectors = sequence_length;
   }