changeset 541:52d82badc544 multiprobeLSH

Added file-descriptor caching for sparse datum reads. This required adding a new type, adb_fd_cache_t, and modifying the read functions audiodb_track_id_datum() and audiodb_insert_create_datum() to use the cache struct when the caller provides one.
author mas01mc
date Sat, 07 Feb 2009 16:59:31 +0000
parents 1bf090279174
children 79ffab663ace
files audioDB-internals.h insert.cpp query.cpp
diffstat 3 files changed, 229 insertions(+), 46 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB-internals.h	Sat Feb 07 12:12:46 2009 +0000
+++ b/audioDB-internals.h	Sat Feb 07 16:59:31 2009 +0000
@@ -56,6 +56,18 @@
   LSH *lsh;
 } adb_qstate_internal_t;
 
+/* This struct caches open file descriptors so that multiple reads from a
+ * data file can reuse them instead of reopening the file each time. */
+
+typedef struct adb_fd_cache {
+  uint32_t track_id;
+  adb_reference_t* reference;
+  char* fname;
+  int data_fd;
+  int power_fd;
+  FILE* times_file;
+} adb_fd_cache_t;
+
 /* this struct is the in-memory representation of the binary
  * information stored at the head of each adb file */
 typedef struct adbheader {
@@ -290,9 +302,11 @@
 }
 
 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
-int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size);
-int audiodb_track_id_datum(adb_t *, uint32_t , adb_datum_t , off_t vector_offset, size_t vector_size);
+int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size, adb_fd_cache_t * cache);
+int audiodb_track_id_datum(adb_t *, uint32_t , adb_datum_t , off_t vector_offset, size_t vector_size, adb_fd_cache_t * cache);
 int audiodb_free_datum(adb_datum_t *);
+int audiodb_free_datum_cache(adb_fd_cache_t *);
+int audiodb_free_datum_reference(adb_reference_t * reference);
 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
 int audiodb_datum_qpointers_partial(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *, adb_qstate_internal_t *);
 int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
--- a/insert.cpp	Sat Feb 07 12:12:46 2009 +0000
+++ b/insert.cpp	Sat Feb 07 16:59:31 2009 +0000
@@ -288,44 +288,145 @@
   return 0;
 }
 
-int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset=0, size_t data_size=0) {
+int audiodb_free_datum_cache(adb_fd_cache_t *cache){
+  if(cache){
+    if(cache->fname){
+      free(cache->fname);
+      cache->fname = NULL;
+    }
+    if(cache->data_fd){
+      close(cache->data_fd);
+      cache->data_fd = 0;
+    }
+    if(cache->power_fd){
+      close(cache->power_fd);
+      cache->power_fd = 0;
+    }
+    if(cache->times_file){
+      fclose(cache->times_file);
+      cache->times_file = NULL;
+    }    
+    if(cache->reference){
+      audiodb_free_datum_reference(cache->reference);
+      cache->reference = NULL;
+    }    
+  }
+  return 0;
+}
+
+int audiodb_free_datum_reference(adb_reference_t * reference){
+  if(reference){
+    if(reference->features){
+      free((char *)reference->features);
+      reference->features = 0;
+    }
+    if(reference->power){
+      free((char *)reference->power);
+      reference->power = 0;
+    }
+    if(reference->times){
+      free((char *)reference->times);
+      reference->times = 0;
+    }
+  }
+  return 0;
+}
+
+int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset=0, size_t data_size=0, adb_fd_cache_t *cache=0) {
   int fd = 0;
   FILE *file = NULL;
   struct stat st;
   off_t size;
+  bool clear_cache = false;
 
-  datum->data = NULL;
-  datum->power = NULL;
-  datum->times = NULL;
-  if((fd = open(insert->features, O_RDONLY)) == -1) {
-    goto error;
+  if(!cache){
+    datum->data = NULL;
+    datum->power = NULL;
+    datum->times = NULL;
   }
+
+  // STEP 1 check if we need to clear the cache
+  if(cache && (cache->fname && strncmp(cache->fname, insert->features, strlen(insert->features))!=0))
+    clear_cache = true;
+
+  // STEP 2. Clear the cache if necessary
+  if(cache && clear_cache){
+    close(cache->data_fd);
+    cache->data_fd = 0;
+    free(cache->fname);
+    cache->fname = 0;
+  }
+  
+  // STEP 3. Use the cached file descriptor or open a new file descriptor
+  if (cache && cache->data_fd ){
+    fd = cache->data_fd;
+  }
+  else{
+    if ((fd = open(insert->features, O_RDONLY)) == -1) {
+      goto error;
+    }
+    if(cache){
+      cache->fname = (char*) malloc(strlen(insert->features));
+      strncpy(cache->fname, insert->features, strlen(insert->features));
+    }
+  }
+
   if(fstat(fd, &st)) {
     goto error;
   }
-  read_or_goto_error(fd, &(datum->dim), sizeof(uint32_t));
+
+  // STEP 4. If file descriptor is new, read the dimensionality, maybe cache the file descriptor
+  if( !( cache && cache->data_fd ) ){
+    read_or_goto_error(fd, &(datum->dim), sizeof(uint32_t));
+    if(cache)
+      cache->data_fd = fd;
+  }
+
+  // STEP 5. Allocate data memory if necessary, read the requested amount of data
   if(data_size)
     size = data_size;
   else
     size = st.st_size - sizeof(uint32_t);
+
   datum->nvectors = size / (sizeof(double) * datum->dim);
-  datum->data = (double *) malloc(size);
+
+  if(!datum->data){
+    datum->data = (double *) malloc(size);
+  }
+    
   if(!datum->data) {
     goto error;
   }
+  
   if(data_offset)
-    lseek(fd, data_offset, SEEK_CUR);
+    lseek(fd, sizeof(uint32_t) + data_offset, SEEK_SET);
   read_or_goto_error(fd, datum->data, size);
-  close(fd);
-  fd = 0;
+
+  // STEP 6. Close the file descriptor, unless we are caching it
+  if(!cache)
+    close(fd);
+  fd = 0; // we're done with the data
+
   if(insert->power) {
     int dim;
-    if((fd = open(insert->power, O_RDONLY)) == -1) {
+
+    // Clear the cache if necessary 
+    if(clear_cache){
+      close(cache->power_fd);
+      cache->power_fd = 0;
+    }
+
+    // Use the cached file descriptor or open a new file descriptor
+    if (cache && cache->power_fd)
+      fd = cache->power_fd;
+    else if((fd = open(insert->power, O_RDONLY)) == -1) {
       goto error;
     }
+
     if(fstat(fd, &st)) {
       goto error;
     }
+
     /* This cast is so non-trivial that it deserves a comment.
      *
      * The data types in this expression, left to right, are: off_t,
@@ -352,36 +453,71 @@
      *
      * I hate C.
      */
+
     if( (!data_size) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) {
       goto error;
     }
-    read_or_goto_error(fd, &dim, sizeof(uint32_t));
-    if(dim != 1) {
-      goto error;
+
+    // If file descriptor is new, read the dimensionality, maybe cache the file descriptor
+    if( !( cache && cache->power_fd ) ){
+      read_or_goto_error(fd, &dim, sizeof(uint32_t));
+      if(dim != 1) {
+	goto error;
+      }
+      if(cache)
+	cache->power_fd = fd;
     }
-    datum->power = (double *) malloc(size / datum->dim);
+
+    // Allocate data memory if necessary, read the requested amount of data
+    if(!datum->power)
+      datum->power = (double *) malloc(size / datum->dim);
     if(!datum->power) {
       goto error;
     }
+
     if(data_offset)
-      lseek(fd, data_offset/datum->dim, SEEK_CUR);
+      lseek(fd, sizeof(uint32_t) + data_offset/datum->dim, SEEK_SET);
+
     read_or_goto_error(fd, datum->power, size / datum->dim);
-    close(fd);
+
+    if(!cache)
+      close(fd);
+    fd = 0;
   }
+
   if(insert->times) {
     double t, *tp;
-    if(!(file = fopen(insert->times, "r"))) {
-      goto error;
+
+    // Clear the cache if necessary
+    if(clear_cache){
+      fclose(cache->times_file);
+      cache->times_file = 0;
     }
-    datum->times = (double *) malloc(2 * size / datum->dim);
+
+    // Use the cached file descriptor or open a new file descriptor and maybe cache
+    if (cache && cache->times_file)
+      file = cache->times_file;
+    else{
+      if(!(file = fopen(insert->times, "r"))) {
+	goto error;
+      }
+      if(cache)
+        cache->times_file = file;
+    }
+    
+    // Allocate data memory if necessary, read the requested amount of data
+    if(!datum->times)
+      datum->times = (double *) malloc(2 * size / datum->dim);
     if(!datum->times) {
       goto error;
     }
+
+    rewind(file);
     if(fscanf(file, " %lf", &t) != 1) {
       goto error;
     }
     if(data_offset)
-      while(data_offset--)
+      while(data_offset-- != 1 )
 	if(fscanf(file, " %lf", &t) != 1)
 	  goto error;
     tp = datum->times;
@@ -397,7 +533,10 @@
       goto error;
     }
     *tp = t;
-    fclose(file);
+    if(!cache){
+      fclose(file);
+      file=0;
+    }
   }
   datum->key = insert->key ? insert->key : insert->features;
   return 0;
@@ -410,8 +549,10 @@
     fclose(file);
   }
   audiodb_free_datum(datum);
+  if(cache)
+    audiodb_free_datum_cache(cache);
   return 1;
-}
+  }
 
 int audiodb_insert(adb_t *adb, adb_insert_t *insert) {
   if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
--- a/query.cpp	Sat Feb 07 12:12:46 2009 +0000
+++ b/query.cpp	Sat Feb 07 16:59:31 2009 +0000
@@ -198,26 +198,46 @@
   return 1;
 }
 
-int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0) {
+int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0, adb_fd_cache_t* cache=0){
   off_t track_offset = (*adb->track_offsets)[track_id];
+  
   if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
     /* create a reference/insert, then use adb_insert_create_datum() */
-    adb_reference_t reference = {0};
-    char features[ADB_MAXSTR], power[ADB_MAXSTR], times[ADB_MAXSTR];
-    lseek(adb->fd, adb->header->dataOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
-    read_or_goto_error(adb->fd, features, ADB_MAXSTR);
-    reference.features = features;
-    if(adb->header->flags & ADB_HEADER_FLAG_POWER) {
-      lseek(adb->fd, adb->header->powerTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
-      read_or_goto_error(adb->fd, power, ADB_MAXSTR);
-      reference.power = power;
+    adb_reference_t *reference = NULL;
+    if(! (cache && cache->reference) ){
+      reference = (adb_reference_t *) malloc(sizeof(adb_reference_t));
+      reference->features = (char*) malloc(ADB_MAXSTR*sizeof(char));
+      if(adb->header->flags & ADB_HEADER_FLAG_POWER) 
+	reference->power = (char*) malloc(ADB_MAXSTR*sizeof(char));
+      if(adb->header->flags & ADB_HEADER_FLAG_TIMES) 
+	reference->times = (char*)malloc(ADB_MAXSTR*sizeof(char));
+      if(cache)
+	cache->reference = reference;
     }
-    if(adb->header->flags & ADB_HEADER_FLAG_TIMES) {
-      lseek(adb->fd, adb->header->timesTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
-      read_or_goto_error(adb->fd, times, ADB_MAXSTR);
-      reference.times = times;
+    else
+      reference = cache->reference;
+
+    if(! (cache && cache->track_id==track_id) ){
+      if(cache)
+	cache->track_id = track_id;
+      lseek(adb->fd, adb->header->dataOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
+      read_or_goto_error(adb->fd, (void *)reference->features, ADB_MAXSTR);
+      if(adb->header->flags & ADB_HEADER_FLAG_POWER) {
+	lseek(adb->fd, adb->header->powerTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
+	read_or_goto_error(adb->fd, (void *)reference->power, ADB_MAXSTR);
+      }
+      if(adb->header->flags & ADB_HEADER_FLAG_TIMES) {
+	lseek(adb->fd, adb->header->timesTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET);
+	read_or_goto_error(adb->fd, (void *)reference->times, ADB_MAXSTR);
+      }
     }
-    return audiodb_insert_create_datum(&reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double));
+
+    int retval = audiodb_insert_create_datum(reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double), cache);
+    if(!cache){
+      audiodb_free_datum_reference(reference);
+      free(reference);      
+    }
+    return retval;
   } else {
     /* initialize from sources of data that we already have */
     if(num_vectors)
@@ -485,17 +505,26 @@
   double dist;
   double *dbdata = 0, *dbdata_pointer;
   Uns32T npairs = qstate->exact_evaluation_queue->size();
+  Uns32T currentTrack = qstate->exact_evaluation_queue->top().trackID+1; // i.e. not first track
 #ifdef _LSH_DEBUG_
   cout << "Num vector pairs to evaluate: " << npairs << "..." << endl;
   cout.flush();
 #endif  
   adb_datum_t d = {0};
+  adb_fd_cache_t c = {0};
+  c.track_id = currentTrack;
   while(npairs--) {
     PointPair pp = qstate->exact_evaluation_queue->top();
+    if(pp.trackID != currentTrack){
+          audiodb_free_datum(&d);
+	  currentTrack = pp.trackID;
+    }
     maybe_delete_array(dbpointers.mean_duration);
-    if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length)) {
+    if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length, &c)) {
       delete qstate->exact_evaluation_queue;
       delete qstate->set;
+      audiodb_free_datum(&d);
+      audiodb_free_datum_cache(&c);
       return 1;
     }
       
@@ -503,6 +532,7 @@
       delete qstate->exact_evaluation_queue;
       delete qstate->set;
       audiodb_free_datum(&d);
+      audiodb_free_datum_cache(&c);
       return 1;
     }
       
@@ -535,13 +565,11 @@
       }
     }
     qstate->exact_evaluation_queue->pop();
-    audiodb_free_datum(&d);
   }
 
   // Cleanup
-  //  maybe_delete_array(dbdata);
-  //maybe_delete_array(dbpointers.l2norm_data);
-  //maybe_delete_array(dbpointers.power_data);
+  audiodb_free_datum(&d);
+  audiodb_free_datum_cache(&c);
   maybe_delete_array(dbpointers.mean_duration);
   delete qstate->exact_evaluation_queue;
   delete qstate->set;