changeset 539:06ed85832c3b multiprobeLSH

Optimized the query_loop_points inner loop for memcpy and I/O efficiency. It now uses sparse seeks and reads to perform scattered reads across the data set. The current version does not cache the fid between open calls to the same trackID.
author mas01mc
date Sat, 07 Feb 2009 01:20:05 +0000
parents 02e0a9ecfd0f
children 1bf090279174
files audioDB-internals.h insert.cpp query.cpp
diffstat 3 files changed, 54 insertions(+), 61 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB-internals.h	Fri Feb 06 21:08:35 2009 +0000
+++ b/audioDB-internals.h	Sat Feb 07 01:20:05 2009 +0000
@@ -290,8 +290,8 @@
 }
 
 int audiodb_read_data(adb_t *, int, int, double **, size_t *);
-int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *);
-int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *);
+int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size);
+int audiodb_track_id_datum(adb_t *, uint32_t , adb_datum_t , off_t vector_offset, size_t vector_size);
 int audiodb_free_datum(adb_datum_t *);
 int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *);
 int audiodb_datum_qpointers_partial(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *, adb_qstate_internal_t *);
--- a/insert.cpp	Fri Feb 06 21:08:35 2009 +0000
+++ b/insert.cpp	Sat Feb 07 01:20:05 2009 +0000
@@ -288,7 +288,7 @@
   return 0;
 }
 
-int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum) {
+int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset=0, size_t data_size=0) {
   int fd = 0;
   FILE *file = NULL;
   struct stat st;
@@ -304,12 +304,17 @@
     goto error;
   }
   read_or_goto_error(fd, &(datum->dim), sizeof(uint32_t));
-  size = st.st_size - sizeof(uint32_t);
+  if(data_size)
+    size = data_size;
+  else
+    size = st.st_size - sizeof(uint32_t);
   datum->nvectors = size / (sizeof(double) * datum->dim);
   datum->data = (double *) malloc(size);
   if(!datum->data) {
     goto error;
   }
+  if(data_offset)
+    lseek(fd, data_offset, SEEK_CUR);
   read_or_goto_error(fd, datum->data, size);
   close(fd);
   fd = 0;
@@ -347,7 +352,7 @@
      *
      * I hate C.
      */
-    if(((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) {
+    if( (!data_size) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) {
       goto error;
     }
     read_or_goto_error(fd, &dim, sizeof(uint32_t));
@@ -358,6 +363,8 @@
     if(!datum->power) {
       goto error;
     }
+    if(data_offset)
+      lseek(fd, data_offset/datum->dim, SEEK_CUR);
     read_or_goto_error(fd, datum->power, size / datum->dim);
     close(fd);
   }
@@ -373,6 +380,10 @@
     if(fscanf(file, " %lf", &t) != 1) {
       goto error;
     }
+    if(data_offset)
+      while(data_offset-- > 1)
+	if(fscanf(file, " %lf", &t) != 1)
+	  goto error;
     tp = datum->times;
     *tp++ = t;
     for(unsigned int n = 0; n < datum->nvectors - 1; n++) {
--- a/query.cpp	Fri Feb 06 21:08:35 2009 +0000
+++ b/query.cpp	Sat Feb 07 01:20:05 2009 +0000
@@ -198,7 +198,7 @@
   return 1;
 }
 
-int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d) {
+int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0) {
   off_t track_offset = (*adb->track_offsets)[track_id];
   if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
     /* create a reference/insert, then use adb_insert_create_datum() */
@@ -217,24 +217,27 @@
       read_or_goto_error(adb->fd, times, ADB_MAXSTR);
       reference.times = times;
     }
-    return audiodb_insert_create_datum(&reference, d);
+    return audiodb_insert_create_datum(&reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double));
   } else {
     /* initialize from sources of data that we already have */
-    d->nvectors = (*adb->track_lengths)[track_id];
+    if(num_vectors)
+      d->nvectors = num_vectors;
+    else
+      d->nvectors = (*adb->track_lengths)[track_id];
     d->dim = adb->header->dim;
     d->key = (*adb->keys)[track_id].c_str();
     /* read out stuff from the database tables */
     d->data = (double *) malloc(d->nvectors * d->dim * sizeof(double));
-    lseek(adb->fd, adb->header->dataOffset + track_offset, SEEK_SET);
+    lseek(adb->fd, adb->header->dataOffset + track_offset + vector_offset*d->dim*sizeof(double), SEEK_SET);
     read_or_goto_error(adb->fd, d->data, d->nvectors * d->dim * sizeof(double));
     if(adb->header->flags & ADB_HEADER_FLAG_POWER) {
       d->power = (double *) malloc(d->nvectors * sizeof(double));
-      lseek(adb->fd, adb->header->powerTableOffset + track_offset / d->dim, SEEK_SET);
+      lseek(adb->fd, adb->header->powerTableOffset + track_offset / d->dim + vector_offset*sizeof(double), SEEK_SET);
       read_or_goto_error(adb->fd, d->power, d->nvectors * sizeof(double));
     }
     if(adb->header->flags & ADB_HEADER_FLAG_TIMES) {
       d->times = (double *) malloc(2 * d->nvectors * sizeof(double));
-      lseek(adb->fd, adb->header->timesTableOffset + track_offset / d->dim, SEEK_SET);
+      lseek(adb->fd, adb->header->timesTableOffset + track_offset / d->dim + 2 * vector_offset*sizeof(double), SEEK_SET);
       read_or_goto_error(adb->fd, d->times, 2 * d->nvectors * sizeof(double));
     }
     return 0;
@@ -285,34 +288,19 @@
 				    adb_qstate_internal_t *qstate){
   uint32_t nvectors = d->nvectors;
   qpointers->nvectors = nvectors;
-  std::priority_queue<PointPair, std::vector<PointPair>, greater<PointPair> > ppairs(*qstate->exact_evaluation_queue);
   
-  size_t vector_size = nvectors * sizeof(double) * d->dim;
-
-  if(d->power)
-    qpointers->power_data = new double[vector_size / d->dim];
-
-  uint32_t seq_len_dbl = sequence_length*sizeof(double);
-  PointPair pp = ppairs.top();
-  uint32_t tid = pp.trackID;
-
-  while( !ppairs.empty() && pp.trackID==tid){
-    uint32_t spos = pp.spos;
+  PointPair pp = (*qstate->exact_evaluation_queue).top();
 #ifdef _LSH_DEBUG_
-    cout << "tid=" << pp.trackID << " qpos=" << pp.qpos << " spos=" << pp.spos << endl;
-    cout.flush();
+  cout << "tid=" << pp.trackID << " qpos=" << pp.qpos << " spos=" << pp.spos << endl;
+  cout.flush();
 #endif
-
-    if(d->power) {
-      memcpy(qpointers->power_data+spos, d->power+spos, seq_len_dbl);
-      audiodb_sequence_sum(qpointers->power_data+spos, sequence_length, sequence_length);
-      audiodb_sequence_average(qpointers->power_data+spos, sequence_length, sequence_length);
-    }
-    ppairs.pop();
-    if(!ppairs.empty())
-      pp = ppairs.top();
+  
+  if(d->power) {
+    //memcpy(qpointers->power_data, d->power, seq_len_dbl);
+    audiodb_sequence_sum(d->power, sequence_length, sequence_length);
+    audiodb_sequence_average(d->power, sequence_length, sequence_length);
   }
-
+  
   if(d->times) {
     qpointers->mean_duration = new double[1];
     *qpointers->mean_duration = 0;
@@ -321,11 +309,11 @@
     }
     *qpointers->mean_duration /= nvectors;
   }  
-
+  
   *vector = d->data;
   *vector_data = d->data;
   qpointers->l2norm = 0 ;
-  qpointers->power = qpointers->power_data;
+  qpointers->power = d->power;
   return 0;
 }
 
@@ -496,7 +484,6 @@
    */
   double dist;
   double *dbdata = 0, *dbdata_pointer;
-  Uns32T currentTrack = 0x80000000; // KLUDGE: Initialize with a value outside of track index range
   Uns32T npairs = qstate->exact_evaluation_queue->size();
 #ifdef _LSH_DEBUG_
   cout << "Num vector pairs to evaluate: " << npairs << "..." << endl;
@@ -505,33 +492,28 @@
   adb_datum_t d = {0};
   while(npairs--) {
     PointPair pp = qstate->exact_evaluation_queue->top();
-    if(currentTrack != pp.trackID) {
-      maybe_delete_array(dbpointers.power_data);
-      maybe_delete_array(dbpointers.mean_duration);
-      currentTrack = pp.trackID;
+    maybe_delete_array(dbpointers.mean_duration);
+    if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length)) {
+      delete qstate->exact_evaluation_queue;
+      delete qstate->set;
+      return 1;
+    }
+      
+    if(audiodb_datum_qpointers_partial(&d, sequence_length, &dbdata, &dbdata_pointer, &dbpointers, qstate)) {
+      delete qstate->exact_evaluation_queue;
+      delete qstate->set;
       audiodb_free_datum(&d);
-      if(audiodb_track_id_datum(adb, pp.trackID, &d)) {
-        delete qstate->exact_evaluation_queue;
-        delete qstate->set;
-        return 1;
-      }
+      return 1;
+    }
       
-      if(audiodb_datum_qpointers_partial(&d, sequence_length, &dbdata, &dbdata_pointer, &dbpointers, qstate)) {
-        delete qstate->exact_evaluation_queue;
-        delete qstate->set;
-        audiodb_free_datum(&d);
-        return 1;
-      }      
-    }
     Uns32T qPos = (spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) ? pp.qpos : 0;
-    Uns32T sPos = pp.spos; // index into l2norm table
     // Test power thresholds before computing distance
-    if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[sPos])) &&
-	( qPos<qpointers->nvectors-sequence_length+1 && sPos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){
+    if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[0])) &&
+	( qPos<qpointers->nvectors-sequence_length+1 && pp.spos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){
       // Compute distance    
-      dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
+      dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata, adb->header->dim*sequence_length);
       double qn = audiodb_dot_product(query + qPos*adb->header->dim, query + qPos*adb->header->dim, adb->header->dim*sequence_length);
-      double sn = audiodb_dot_product(dbdata + sPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
+      double sn = audiodb_dot_product(dbdata, dbdata, adb->header->dim*sequence_length);
       qn = sqrt(qn);
       sn = sqrt(sn);
       switch(spec->params.distance) {
@@ -553,13 +535,13 @@
       }
     }
     qstate->exact_evaluation_queue->pop();
+    audiodb_free_datum(&d);
   }
 
   // Cleanup
-  audiodb_free_datum(&d);
   //  maybe_delete_array(dbdata);
   //maybe_delete_array(dbpointers.l2norm_data);
-  maybe_delete_array(dbpointers.power_data);
+  //maybe_delete_array(dbpointers.power_data);
   maybe_delete_array(dbpointers.mean_duration);
   delete qstate->exact_evaluation_queue;
   delete qstate->set;