diff query.cpp @ 539:06ed85832c3b multiprobeLSH

Optimized the query_loop_points inner loop for memcpy and I/O efficiency. Uses sparse seeks and reads to perform scattered reads across the data set. The current version does not cache the fid between open calls to the same trackID.
author mas01mc
date Sat, 07 Feb 2009 01:20:05 +0000
parents ddf763553175
children 52d82badc544
line wrap: on
line diff
--- a/query.cpp	Fri Feb 06 21:08:35 2009 +0000
+++ b/query.cpp	Sat Feb 07 01:20:05 2009 +0000
@@ -198,7 +198,7 @@
   return 1;
 }
 
-int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d) {
+int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0) {
   off_t track_offset = (*adb->track_offsets)[track_id];
   if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
     /* create a reference/insert, then use adb_insert_create_datum() */
@@ -217,24 +217,27 @@
       read_or_goto_error(adb->fd, times, ADB_MAXSTR);
       reference.times = times;
     }
-    return audiodb_insert_create_datum(&reference, d);
+    return audiodb_insert_create_datum(&reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double));
   } else {
     /* initialize from sources of data that we already have */
-    d->nvectors = (*adb->track_lengths)[track_id];
+    if(num_vectors)
+      d->nvectors = num_vectors;
+    else
+      d->nvectors = (*adb->track_lengths)[track_id];
     d->dim = adb->header->dim;
     d->key = (*adb->keys)[track_id].c_str();
     /* read out stuff from the database tables */
     d->data = (double *) malloc(d->nvectors * d->dim * sizeof(double));
-    lseek(adb->fd, adb->header->dataOffset + track_offset, SEEK_SET);
+    lseek(adb->fd, adb->header->dataOffset + track_offset + vector_offset*d->dim*sizeof(double), SEEK_SET);
     read_or_goto_error(adb->fd, d->data, d->nvectors * d->dim * sizeof(double));
     if(adb->header->flags & ADB_HEADER_FLAG_POWER) {
       d->power = (double *) malloc(d->nvectors * sizeof(double));
-      lseek(adb->fd, adb->header->powerTableOffset + track_offset / d->dim, SEEK_SET);
+      lseek(adb->fd, adb->header->powerTableOffset + track_offset / d->dim + vector_offset*sizeof(double), SEEK_SET);
       read_or_goto_error(adb->fd, d->power, d->nvectors * sizeof(double));
     }
     if(adb->header->flags & ADB_HEADER_FLAG_TIMES) {
       d->times = (double *) malloc(2 * d->nvectors * sizeof(double));
-      lseek(adb->fd, adb->header->timesTableOffset + track_offset / d->dim, SEEK_SET);
+      lseek(adb->fd, adb->header->timesTableOffset + track_offset / d->dim + 2 * vector_offset*sizeof(double), SEEK_SET);
       read_or_goto_error(adb->fd, d->times, 2 * d->nvectors * sizeof(double));
     }
     return 0;
@@ -285,34 +288,19 @@
 				    adb_qstate_internal_t *qstate){
   uint32_t nvectors = d->nvectors;
   qpointers->nvectors = nvectors;
-  std::priority_queue<PointPair, std::vector<PointPair>, greater<PointPair> > ppairs(*qstate->exact_evaluation_queue);
   
-  size_t vector_size = nvectors * sizeof(double) * d->dim;
-
-  if(d->power)
-    qpointers->power_data = new double[vector_size / d->dim];
-
-  uint32_t seq_len_dbl = sequence_length*sizeof(double);
-  PointPair pp = ppairs.top();
-  uint32_t tid = pp.trackID;
-
-  while( !ppairs.empty() && pp.trackID==tid){
-    uint32_t spos = pp.spos;
+  PointPair pp = (*qstate->exact_evaluation_queue).top();
 #ifdef _LSH_DEBUG_
-    cout << "tid=" << pp.trackID << " qpos=" << pp.qpos << " spos=" << pp.spos << endl;
-    cout.flush();
+  cout << "tid=" << pp.trackID << " qpos=" << pp.qpos << " spos=" << pp.spos << endl;
+  cout.flush();
 #endif
-
-    if(d->power) {
-      memcpy(qpointers->power_data+spos, d->power+spos, seq_len_dbl);
-      audiodb_sequence_sum(qpointers->power_data+spos, sequence_length, sequence_length);
-      audiodb_sequence_average(qpointers->power_data+spos, sequence_length, sequence_length);
-    }
-    ppairs.pop();
-    if(!ppairs.empty())
-      pp = ppairs.top();
+  
+  if(d->power) {
+    //memcpy(qpointers->power_data, d->power, seq_len_dbl);
+    audiodb_sequence_sum(d->power, sequence_length, sequence_length);
+    audiodb_sequence_average(d->power, sequence_length, sequence_length);
   }
-
+  
   if(d->times) {
     qpointers->mean_duration = new double[1];
     *qpointers->mean_duration = 0;
@@ -321,11 +309,11 @@
     }
     *qpointers->mean_duration /= nvectors;
   }  
-
+  
   *vector = d->data;
   *vector_data = d->data;
   qpointers->l2norm = 0 ;
-  qpointers->power = qpointers->power_data;
+  qpointers->power = d->power;
   return 0;
 }
 
@@ -496,7 +484,6 @@
    */
   double dist;
   double *dbdata = 0, *dbdata_pointer;
-  Uns32T currentTrack = 0x80000000; // KLUDGE: Initialize with a value outside of track index range
   Uns32T npairs = qstate->exact_evaluation_queue->size();
 #ifdef _LSH_DEBUG_
   cout << "Num vector pairs to evaluate: " << npairs << "..." << endl;
@@ -505,33 +492,28 @@
   adb_datum_t d = {0};
   while(npairs--) {
     PointPair pp = qstate->exact_evaluation_queue->top();
-    if(currentTrack != pp.trackID) {
-      maybe_delete_array(dbpointers.power_data);
-      maybe_delete_array(dbpointers.mean_duration);
-      currentTrack = pp.trackID;
+    maybe_delete_array(dbpointers.mean_duration);
+    if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length)) {
+      delete qstate->exact_evaluation_queue;
+      delete qstate->set;
+      return 1;
+    }
+      
+    if(audiodb_datum_qpointers_partial(&d, sequence_length, &dbdata, &dbdata_pointer, &dbpointers, qstate)) {
+      delete qstate->exact_evaluation_queue;
+      delete qstate->set;
       audiodb_free_datum(&d);
-      if(audiodb_track_id_datum(adb, pp.trackID, &d)) {
-        delete qstate->exact_evaluation_queue;
-        delete qstate->set;
-        return 1;
-      }
+      return 1;
+    }
       
-      if(audiodb_datum_qpointers_partial(&d, sequence_length, &dbdata, &dbdata_pointer, &dbpointers, qstate)) {
-        delete qstate->exact_evaluation_queue;
-        delete qstate->set;
-        audiodb_free_datum(&d);
-        return 1;
-      }      
-    }
     Uns32T qPos = (spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) ? pp.qpos : 0;
-    Uns32T sPos = pp.spos; // index into l2norm table
     // Test power thresholds before computing distance
-    if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[sPos])) &&
-	( qPos<qpointers->nvectors-sequence_length+1 && sPos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){
+    if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[0])) &&
+	( qPos<qpointers->nvectors-sequence_length+1 && pp.spos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){
       // Compute distance    
-      dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
+      dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata, adb->header->dim*sequence_length);
       double qn = audiodb_dot_product(query + qPos*adb->header->dim, query + qPos*adb->header->dim, adb->header->dim*sequence_length);
-      double sn = audiodb_dot_product(dbdata + sPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
+      double sn = audiodb_dot_product(dbdata, dbdata, adb->header->dim*sequence_length);
       qn = sqrt(qn);
       sn = sqrt(sn);
       switch(spec->params.distance) {
@@ -553,13 +535,13 @@
       }
     }
     qstate->exact_evaluation_queue->pop();
+    audiodb_free_datum(&d);
   }
 
   // Cleanup
-  audiodb_free_datum(&d);
   //  maybe_delete_array(dbdata);
   //maybe_delete_array(dbpointers.l2norm_data);
-  maybe_delete_array(dbpointers.power_data);
+  //maybe_delete_array(dbpointers.power_data);
   maybe_delete_array(dbpointers.mean_duration);
   delete qstate->exact_evaluation_queue;
   delete qstate->set;