changeset 519:fdcd436d7cbd multiprobeLSH

lshlib MultiProbe glue added. Compiles and links against audioDB. Fails LSH tests at the moment.
author mas01mc
date Mon, 26 Jan 2009 05:42:15 +0000
parents ca1ee92c359c
children 0767731a6cce
files Makefile l2norm.cpp lshlib.cpp lshlib.h multiprobe.cpp multiprobe.h
diffstat 6 files changed, 169 insertions(+), 59 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Mon Jan 26 02:50:44 2009 +0000
+++ b/Makefile	Mon Jan 26 05:42:15 2009 +0000
@@ -8,7 +8,7 @@
 
 SHARED_LIB_FLAGS=-shared -Wl,-soname,
 
-LIBOBJS=lock.o pointpair.o create.o open.o power.o l2norm.o insert.o status.o query.o dump.o close.o lshlib.o index-utils.o query-indexed.o
+LIBOBJS=lock.o pointpair.o create.o open.o power.o l2norm.o insert.o status.o query.o dump.o close.o lshlib.o multiprobe.o index-utils.o query-indexed.o
 OBJS=$(LIBOBJS) index.o soap.o liszt.o sample.o cmdline.o audioDB.o common.o
 
 EXECUTABLE=audioDB
--- a/l2norm.cpp	Mon Jan 26 02:50:44 2009 +0000
+++ b/l2norm.cpp	Mon Jan 26 05:42:15 2009 +0000
@@ -4,7 +4,7 @@
 #include "audioDB-internals.h"
 
 static int audiodb_l2norm_existing(adb_t *adb) {
-  double *data_buffer, *l2norm_buffer;
+  double *data_buffer = 0, *l2norm_buffer = 0;
   adb_header_t *header = adb->header;
   size_t data_buffer_size = align_page_up(header->length);
   size_t nvectors = header->length / (sizeof(double) * header->dim);
--- a/lshlib.cpp	Mon Jan 26 02:50:44 2009 +0000
+++ b/lshlib.cpp	Mon Jan 26 05:42:15 2009 +0000
@@ -18,7 +18,10 @@
   exit(1);
 }
 
-H::H(){
+H::H():
+  multiProbePtr(new MultiProbe()),
+  boundaryDistances(0)
+{
   // Delay initialization of lsh functions until we know the parameters
 }
 
@@ -38,7 +41,9 @@
   L((mm*(mm-1))/2),
   d(dd),
   w(ww),
-  radius(rr)
+  radius(rr),
+  multiProbePtr(new MultiProbe()),
+  boundaryDistances(0)
 {
 
   if(m<2){
@@ -140,6 +145,16 @@
 
   // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS
   H::initialize_partial_functions();
+
+  // MultiProbe distance functions, there are 2*k per hashtable
+  H::boundaryDistances = new float*[ H::L ];  // L x 2k boundary distances
+  assert( H::boundaryDistances ); // failure
+  for( j = 0; j < H::L ; j++ ){ // 2*k functions x_i(q)  
+    H::boundaryDistances[j] = new float[ 2*H::k ]; 
+    assert( H::boundaryDistances[j] ); // failure
+    for( kk = 0; kk < 2*H::k ; kk++ )
+      H::boundaryDistances[j][kk] = 0.0f; // initialize with zeros
+  }
 }
 
 void H::initialize_partial_functions(){
@@ -237,6 +252,12 @@
     delete[] H::r1;
     delete[] H::r2;
     delete[] H::h;
+
+    // MultiProbe cleanup
+    for( j = 0 ; j < H::L ; j++ )
+      delete[] H::boundaryDistances[j];
+    delete[] H::boundaryDistances;
+    delete multiProbePtr;
 }
 
 
@@ -249,7 +270,7 @@
   if( v.size() != H::d ) 
     error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality
   double tmp = 0;
-  float *pA, *pb;
+  float *pA, *pb, *bd;
   Uns32T *pg;
   int dd;
   vector<float>::iterator vi;
@@ -292,6 +313,7 @@
 #else
   for( aa=0; aa < H::L ; aa++ ){
     pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k      
+    bd= *( H::boundaryDistances + aa);
     for( kk = 0 ; kk != H::k ; kk++ ){
       pb = *( H::b + aa ) + kk;
       pA = * ( * ( H::A + aa ) + kk );
@@ -301,8 +323,11 @@
       while( dd-- )
 	tmp += *pA++ * *vi++;  // project
       tmp += *pb;              // translate
-      tmp *= iw;               // scale      
-      *pg++ = (Uns32T) (floor(tmp));      // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z
+      tmp *= iw;               // scale
+      *pg = (Uns32T) (floor(tmp));      // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z
+      *bd = (tmp - *pg++);//*w;       // boundary distance -1
+      *(bd+1) = (1.0f - *bd); //*w;    // boundary distance +1
+      bd+=2;
     }
   }
 #endif
@@ -314,6 +339,34 @@
   H::t2 = computeProductModDefaultPrime( g, r2, H::k );
 }
 
+// make hash value by purturbating the given hash functions
+// according the the boundary distances of the current query
+void H::generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){
+  assert(!multiProbePtr->empty()); // Test this for now, until all is stable
+  Uns32T* mpg = new Uns32T(H::k); // temporary array storage
+
+  // Copy the hash bucket identifiers
+  Uns32T* mpgPtr = mpg;
+  Uns32T kk = H::k;
+  while(kk--)
+    *mpgPtr++ = *g++;
+
+  // Retrieve the next purturbation set
+  perturbation_set ps = multiProbePtr->getNextPerturbationSet();
+  perturbation_set::iterator it = ps.begin();
+
+  // Perturbate the hash functions g
+  while( it != ps.end() ){
+    *(mpg + multiProbePtr->getIndex(it)) +=  multiProbePtr->getBoundary(it);  
+    it++;
+  }
+
+  H::t1 = computeProductModDefaultPrime( mpg, r1, H::k ) % H::N;
+  H::t2 = computeProductModDefaultPrime( mpg, r2, H::k );
+    
+  delete[] mpg; // free up temporary storage
+}
+
 #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}}
 
 // Computes (a.b) mod UH_PRIME_DEFAULT
@@ -447,6 +500,7 @@
 {
   FILE* dbFile = 0;
   int dbfid = unserialize_lsh_header(filename);
+
   indexName = new char[O2_INDEX_MAXSTR];
   strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER
   H::initialize_lsh_functions(); // Base-class data-structure initialization
@@ -501,21 +555,29 @@
 
 // point retrieval routine
 void G::retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
+  assert(LSH_MULTI_PROBE_COUNT);
   calling_instance = caller;
   add_point_callback = add_point;
   H::compute_hash_functions( v );
   for(Uns32T j = 0 ; j < H::L ; j++ ){
-    H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 
-    if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) {
+    // MultiProbe loop    
+    multiProbePtr->generatePerturbationSets( *( H::boundaryDistances + j ) , 2*H::k, (unsigned)LSH_MULTI_PROBE_COUNT);
+    for(Uns32T multiProbeIdx = 0 ; multiProbeIdx < LSH_MULTI_PROBE_COUNT+1 ; multiProbeIdx++ ){
+      if(!multiProbeIdx)	
+	H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 
+      else
+	H::generate_multiprobe_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 
+      if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) {
 #ifdef LSH_LIST_HEAD_COUNTERS
-      if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
-	retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos);
-      } else {
-	bucket_chain_point( bPtr->next, qpos);
+	if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
+	  retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos);
+	} else {
+	  bucket_chain_point( bPtr->next, qpos);
+	}
+#else
+	bucket_chain_point( bPtr , qpos);
+#endif
       }
-#else
-      bucket_chain_point( bPtr , qpos);
-#endif
     }
   }
 }
--- a/lshlib.h	Mon Jan 26 02:50:44 2009 +0000
+++ b/lshlib.h	Mon Jan 26 05:42:15 2009 +0000
@@ -33,6 +33,7 @@
 #ifdef MT19937
 #include "mt19937/mt19937ar.h"
 #endif
+#include "multiprobe.h"
 
 #define IntT int
 #define LongUns64T long long unsigned
@@ -107,10 +108,14 @@
 
 #define LSH_CORE_ARRAY_BIT (0x80000000) //  LSH_CORE_ARRAY test bit for list head
 
-using namespace std;
+#ifndef LSH_MULTI_PROBE_COUNT
+#define LSH_MULTI_PROBE_COUNT 1 // How many adjacent hash-buckets to probe in LSH retrieval
+#endif
 
 Uns32T get_page_logn();
 
+using namespace std;
+
 // Disk table entry
 typedef class SerialElement SerialElementT;
 class SerialElement {
@@ -232,6 +237,7 @@
   Uns32T t2;       // second hash table key
   Uns32T P;        // hash table prime number
 
+  
   Uns32T N; // num rows per table
   Uns32T C; // num collision per row
   Uns32T k; // num projections per hash function
@@ -242,6 +248,9 @@
   float w;     // width of hash slots (relative to normalized feature space)
   float radius;// scaling coefficient for data (1./radius)
 
+  MultiProbe* multiProbePtr; // Utility class for handling multi-probe queries
+  float ** boundaryDistances; // Array of query bucket-boundary-distances per hashtable
+
   void initialize_data_structures();
   void initialize_lsh_functions();
   void initialize_partial_functions();
@@ -275,6 +284,7 @@
   // Interface to hash functions
   void compute_hash_functions(vector<float>& v);
   void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2);
+  void generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2);
   Uns32T get_t1(){return t1;} // hash-key t1
   Uns32T get_t2(){return t2;} // hash-key t2
 };
--- a/multiprobe.cpp	Mon Jan 26 02:50:44 2009 +0000
+++ b/multiprobe.cpp	Mon Jan 26 05:42:15 2009 +0000
@@ -19,9 +19,25 @@
 
 #include "multiprobe.h"
 
-#define _TEST_MP_LSH
+//#define _TEST_MP_LSH
 
-MinHeapElement::MinHeapElement(perturbation_set a, double s):
+bool operator> (const min_heap_element& a, const min_heap_element& b){
+  return a.score > b.score;
+}
+
+bool operator< (const min_heap_element& a, const min_heap_element& b){
+  return a.score < b.score;
+}
+
+bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
+  return a.first > b.first;
+}
+
+bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
+  return a.first < b.first;
+}
+
+MinHeapElement::MinHeapElement(perturbation_set a, float s):
   perturbs(a),
   score(s)
 {
@@ -57,15 +73,30 @@
   distFuns = 0;
 }
 
-inline size_t MultiProbe::size(){
+size_t MultiProbe::size(){
   return outSets->size();
 }
 
-inline bool MultiProbe::empty(){
+bool MultiProbe::empty(){
   return !outSets->size();
 }
 
 
+void MultiProbe::generatePerturbationSets(vector<float>& x, unsigned T){
+  cleanup(); // Make re-entrant
+  initialize();
+  makeSortedDistFuns(x);
+  algorithm1(T);
+}
+
+// overloading to support efficient array use without initial copy
+void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){
+  cleanup(); // Make re-entrant
+  initialize();
+  makeSortedDistFuns(x, N);
+  algorithm1(T);
+}
+
 // Generate the optimal T perturbation sets for current query
 // pre-conditions:
 //   an LSH structure was initialized and passed to constructor
@@ -77,20 +108,15 @@
 //  these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
 //  data structures are cleared and reset to zeros thereby making them re-entrant
 //
-void MultiProbe::generatePerturbationSets(vector<double>& x, int T){
+void MultiProbe::algorithm1(unsigned T){
   perturbation_set ai,as,ae;
-  double ai_score;
-
-  cleanup(); // Make re-entrant
-  initialize();
-  makeSortedDistFuns(x);
-
+  float ai_score;
   ai.insert(0); // Initialize for this query
   minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
 
   min_heap_element mhe = minHeap->top();
 
-  for(int i = 0 ; i != T ; i++ ){
+  for(unsigned i = 0 ; i != T ; i++ ){
     do{
       if(minHeap->empty())
 	break;
@@ -126,31 +152,44 @@
   int val = *(--it) + 1;
   a.erase(it);
   a.insert(it,val);
+  return a;
 }
 
 // Given the set a, add a new element one greater than the max
 inline perturbation_set& MultiProbe::expand(perturbation_set& a){
   perturbation_set::reverse_iterator ri = a.rbegin();
   a.insert(*ri+1);
+  return a;
 }
 
 // Take the list of distances (x) assuming list len is 2M and
 // delta = (-1)^i, i = { 0 .. 2M-1 }
-void MultiProbe::makeSortedDistFuns(vector<double>& x){
+void MultiProbe::makeSortedDistFuns(vector<float>& x){
   numHashBoundaries = x.size(); // x.size() == 2M
   delete distFuns;
   distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
-  for(int i = 0; i != numHashBoundaries ; i++ )
-    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
+  for(unsigned i = 0; i != numHashBoundaries ; i++ )
+    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?-1:1));
+  // SORT
+  sort( distFuns->begin(), distFuns->end() );
+}
+
+// Float array version of above
+void MultiProbe::makeSortedDistFuns(float* x, unsigned N){
+  numHashBoundaries = N; // x.size() == 2M
+  delete distFuns;
+  distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
+  for(unsigned i = 0; i != numHashBoundaries ; i++ )
+    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?-1:1));
   // SORT
   sort( distFuns->begin(), distFuns->end() );
 }
 
 // For a given perturbation set, the score is the 
 // sum of squares of corresponding distances in x
-double MultiProbe::score(perturbation_set& a){
+float MultiProbe::score(perturbation_set& a){
   //assert(!a.empty());
-  double score = 0.0, tmp;
+  float score = 0.0, tmp;
   perturbation_set::iterator it;
   it = a.begin();
   do{
@@ -170,12 +209,20 @@
   while( it != a.end() ){
     j = *it;
     it++;
-    if( ( j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) )
+    if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) )
       return false;    
   }
   return true;
 }
 
+int MultiProbe::getIndex(perturbation_set::iterator it){
+  return (*distFuns)[*it].second.first;
+}
+
+int MultiProbe::getBoundary(perturbation_set::iterator it){
+  return (*distFuns)[*it].second.second;
+}
+
 // copy return next perturbation_set
 perturbation_set MultiProbe::getNextPerturbationSet(){
   perturbation_set s = outSets->top().perturbs; 
@@ -191,10 +238,10 @@
   int N_SAMPS = 100; // Number of random samples
   int W = 4;         // simulated hash-bucket size
   int N_ITER = 100;  // How many re-entrant iterations
-  int T = 10; // Number of multi-probe sets to generate
+  unsigned T = 10; // Number of multi-probe sets to generate
 
   MultiProbe mp= MultiProbe();
-  vector<double> x(N_SAMPS);
+  vector<float> x(N_SAMPS);
 
   srand((unsigned)time(0)); 
 
--- a/multiprobe.h	Mon Jan 26 02:50:44 2009 +0000
+++ b/multiprobe.h	Mon Jan 26 05:42:15 2009 +0000
@@ -34,8 +34,8 @@
 typedef class MinHeapElement{
 public:
   perturbation_set perturbs;
-  double score;
-  MinHeapElement(perturbation_set a, double s);
+  float score;
+  MinHeapElement(perturbation_set a, float s);
   virtual ~MinHeapElement();
 } min_heap_element;
 
@@ -44,23 +44,7 @@
 		       greater<min_heap_element> 
 		       > min_heap_of_perturbation_set ;
 
-typedef pair<double, pair<int, int> > sorted_distance_functions ;
-
-bool operator> (const min_heap_element& a, const min_heap_element& b){
-  return a.score > b.score;
-}
-
-bool operator< (const min_heap_element& a, const min_heap_element& b){
-  return a.score < b.score;
-}
-
-bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
-  return a.first > b.first;
-}
-
-bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
-  return a.first < b.first;
-}
+typedef pair<float, pair<int, int> > sorted_distance_functions ;
 
 class MultiProbe{
 protected:
@@ -76,20 +60,27 @@
   // perturbation set operations
   perturbation_set& shift(perturbation_set&);
   perturbation_set& expand(perturbation_set&);
-  double score(perturbation_set&);
+  float score(perturbation_set&);
   bool valid(perturbation_set&);
-  void makeSortedDistFuns(vector<double> &);
+  void makeSortedDistFuns(vector<float> &);
+  void makeSortedDistFuns(float* x, unsigned N);
+
+  // perturbation set generation algorithm
+  void algorithm1(unsigned T);
 
 public: 
   MultiProbe();
   ~MultiProbe();
 
   // generation of perturbation sets
-  void generatePerturbationSets(vector<double>& vectorOfBounaryDistances, int numSetsToGenerate);  
+  void generatePerturbationSets(vector<float>& vectorOfBounaryDistances, unsigned numSetsToGenerate);  
+  void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate);
   perturbation_set getNextPerturbationSet();
   void dump(perturbation_set);
   size_t size(); // Number of perturbation sets are in the output queue
   bool empty(); // predicate for empty MultiProbe set
+  int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry
+  int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry
 };