changeset 518:ca1ee92c359c multiprobeLSH

Completed initial implementation of LSH MultiProbe class. Now needs to be joined to lshlib.cpp retrieve() method to perform multi-probe queries.
author mas01mc
date Mon, 26 Jan 2009 02:50:44 +0000
parents 807c8be7dd45
children fdcd436d7cbd
files multiprobe.cpp multiprobe.h
diffstat 2 files changed, 145 insertions(+), 71 deletions(-) [+]
line wrap: on
line diff
--- a/multiprobe.cpp	Sun Jan 25 06:10:38 2009 +0000
+++ b/multiprobe.cpp	Mon Jan 26 02:50:44 2009 +0000
@@ -1,7 +1,8 @@
 /*
  * MultiProbe C++ class
  *
- * Given a LSH structure, perform lookup by probing nearby hash-function locations
+ * Given a vector of LSH boundary distances for a query, 
+ * perform lookup by probing nearby hash-function locations
  *
  * Implementation using C++ STL
  *
@@ -18,8 +19,16 @@
 
 #include "multiprobe.h"
 
+#define _TEST_MP_LSH
 
-#define _TEST_MP_LSH
+MinHeapElement::MinHeapElement(perturbation_set a, double s):
+  perturbs(a),
+  score(s)
+{
+
+}
+
+MinHeapElement::~MinHeapElement(){;}
 
 MultiProbe::MultiProbe():
   minHeap(0),
@@ -27,17 +36,36 @@
   distFuns(0),
   numHashBoundaries(0)
 {
-  minHeap = new min_heap_of_perturbation_set();
-  outSets = new vector_of_perturbation_set();
+
 }
 
 MultiProbe::~MultiProbe(){  
-  // FIXME: Are these arrays ?  
+  cleanup();
+}
+
+void MultiProbe::initialize(){
+  minHeap = new min_heap_of_perturbation_set();
+  outSets = new min_heap_of_perturbation_set();
+}
+
+void MultiProbe::cleanup(){
   delete minHeap;
+  minHeap = 0;
   delete outSets;
+  outSets = 0;
   delete distFuns;
+  distFuns = 0;
 }
 
+inline size_t MultiProbe::size(){
+  return outSets->size();
+}
+
+inline bool MultiProbe::empty(){
+  return !outSets->size();
+}
+
+
 // Generate the optimal T perturbation sets for current query
 // pre-conditions:
 //   an LSH structure was initialized and passed to constructor
@@ -45,50 +73,55 @@
 //   the query-to-boundary distances are stored in x[hashFunIndex]
 //
 // post-conditions:
-//
 //  generates an ordered list of perturbation sets (stored as an array of sets)
 //  these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
 //  data structures are cleared and reset to zeros thereby making them re-entrant
 //
-void MultiProbe::generatePerturbationSets(int T, vector<double>& x){
+void MultiProbe::generatePerturbationSets(vector<double>& x, int T){
   perturbation_set ai,as,ae;
+  double ai_score;
 
+  cleanup(); // Make re-entrant
+  initialize();
   makeSortedDistFuns(x);
 
-  ai.insert(1); // Initialize for this query
-  minHeap->push(make_pair(perturbation_set(ai), score(ai))); // unique instance stored in mhe
+  ai.insert(0); // Initialize for this query
+  minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
 
-  for(int i = 0 ; i < T ; i++ ){
+  min_heap_element mhe = minHeap->top();
+
+  for(int i = 0 ; i != T ; i++ ){
     do{
-      min_heap_element mhe = minHeap->top();
+      if(minHeap->empty())
+	break;
+      mhe = minHeap->top();
+      ai = mhe.perturbs;
+      ai_score = mhe.score;
       minHeap->pop();
-      ai = mhe.first;
-      cout << "ai: ";
-      dump(ai);
-      as = perturbation_set(ai);
+      as=ai;
       shift(as);
-      cout << "as: ";
-      dump(as);
-      minHeap->push(make_pair(perturbation_set(as), score(as)));
-      ae = perturbation_set(ai);
+      minHeap->push(min_heap_element(as, score(as)));
+      ae=ai;
       expand(ae);
-      cout << "ae: ";
-      dump(ae);
-      minHeap->push(make_pair(perturbation_set(ae), score(ae)));
+      minHeap->push(min_heap_element(ae, score(ae)));
     }while(!valid(ai));
-    outSets->push_back(ai); // Ordered list of perturbation sets
+    outSets->push(mhe); // Ordered list of perturbation sets
   }
 }
 
-void MultiProbe::dump(perturbation_set& a){
+void MultiProbe::dump(perturbation_set a){
   perturbation_set::iterator it = a.begin();
-  while(it != a.end())
-    cout << *it++ << " ";
-  cout << "\n";
+  while(it != a.end()){
+    cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " " 
+	 << (*distFuns)[*it].first << *it << ", ";
+    it++;    
+  }
+  cout << "(" << score(a) << ")";
+  cout << endl;
 }
 
 // Given the set a, add 1 to last element of the set
-perturbation_set& MultiProbe::shift(perturbation_set& a){  
+inline perturbation_set& MultiProbe::shift(perturbation_set& a){  
   perturbation_set::iterator it = a.end();
   int val = *(--it) + 1;
   a.erase(it);
@@ -96,7 +129,7 @@
 }
 
 // Given the set a, add a new element one greater than the max
-perturbation_set& MultiProbe::expand(perturbation_set& a){
+inline perturbation_set& MultiProbe::expand(perturbation_set& a){
   perturbation_set::reverse_iterator ri = a.rbegin();
   a.insert(*ri+1);
 }
@@ -107,9 +140,8 @@
   numHashBoundaries = x.size(); // x.size() == 2M
   delete distFuns;
   distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
-  for(int i = 0; i != numHashBoundaries ; i++ ){  
-    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?-1:+1));
-  }
+  for(int i = 0; i != numHashBoundaries ; i++ )
+    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
   // SORT
   sort( distFuns->begin(), distFuns->end() );
 }
@@ -122,9 +154,9 @@
   perturbation_set::iterator it;
   it = a.begin();
   do{
-    tmp = (*distFuns)[*it++].first;
+    tmp = (*distFuns)[*it].first;
     score += tmp*tmp;
-  }while( it != a.end() );
+  }while( ++it != a.end() );
   return score;
 }
 
@@ -136,27 +168,49 @@
   int j;  
   perturbation_set::iterator it = a.begin();  
   while( it != a.end() ){
-    j = *it++;
+    j = *it;
+    it++;
     if( ( j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) )
       return false;    
   }
   return true;
 }
 
+// copy return next perturbation_set
+perturbation_set MultiProbe::getNextPerturbationSet(){
+  perturbation_set s = outSets->top().perturbs; 
+  outSets->pop();
+  return s;
+}
+
+// Test routine: generate 100 random boundary distance pairs
+// call generatePerturbationSets on these distances
+// dump output for inspection
 #ifdef _TEST_MP_LSH
 int main(const int argc, const char* argv[]){
-  MultiProbe mp = MultiProbe();  
-  vector<double> x(4);
-  x[0]=0.1;
-  x[1]=0.9;
-  x[2]=0.2;
-  x[3]=0.8;
-  mp.generatePerturbationSets(2, x);
+  int N_SAMPS = 100; // Number of random samples
+  int W = 4;         // simulated hash-bucket size
+  int N_ITER = 100;  // How many re-entrant iterations
+  int T = 10; // Number of multi-probe sets to generate
+
+  MultiProbe mp= MultiProbe();
+  vector<double> x(N_SAMPS);
+
+  srand((unsigned)time(0)); 
+
+  // Test re-entrance on single instance
+  for(int j = 0; j< N_ITER ; j++){
+    cout << "********** ITERATION " << j << " **********" << endl;
+    cout.flush();
+    for (int i = 0 ; i != x.size()/2 ; i++ ){
+      x[2*i] = W*(rand()/(RAND_MAX+1.0));
+      x[2*i+1] = W - x[2*i];
+    }
+    // Generate multi-probe sets
+    mp.generatePerturbationSets(x, T);
+    // Output contents of multi-probe sets
+    while(!mp.empty())
+      mp.dump(mp.getNextPerturbationSet());
+  }
 }
 #endif
-
-
-
-
-
-
--- a/multiprobe.h	Sun Jan 25 06:10:38 2009 +0000
+++ b/multiprobe.h	Mon Jan 26 02:50:44 2009 +0000
@@ -1,7 +1,8 @@
 /*
  * MultiProbe C++ class
  *
- * Given a LSH structure, perform lookup by probing nearby hash-function locations
+ * Given a vector of LSH boundary distances for a query, 
+ * perform lookup by probing nearby hash-function locations
  *
  * Implementation using C++ STL
  *
@@ -16,6 +17,9 @@
  *
  */
 
+#ifndef __LSH_MULTIPROBE_
+#define __LSH_MULTIPROBE_
+
 #include <functional>
 #include <queue>
 #include <vector>
@@ -23,26 +27,31 @@
 #include <algorithm>
 #include <iostream>
 
-using namespace std ;
+using namespace std;
 
 typedef set<int > perturbation_set ;
-typedef vector<perturbation_set > vector_of_perturbation_set ;
-typedef pair<perturbation_set, float > min_heap_element ;
-typedef vector<min_heap_element > vector_of_min_heap_element ;
-typedef priority_queue<
-  min_heap_element, 
-  vector_of_min_heap_element, 
-  greater<min_heap_element> 
-  > min_heap_of_perturbation_set ;
+
+typedef class MinHeapElement{
+public:
+  perturbation_set perturbs;
+  double score;
+  MinHeapElement(perturbation_set a, double s);
+  virtual ~MinHeapElement();
+} min_heap_element;
+
+typedef priority_queue<min_heap_element, 
+		       vector<min_heap_element>, 
+		       greater<min_heap_element> 
+		       > min_heap_of_perturbation_set ;
+
 typedef pair<double, pair<int, int> > sorted_distance_functions ;
-typedef vector<sorted_distance_functions> vector_of_sorted_distance_functions;
 
-bool operator>(const min_heap_element& a, const min_heap_element& b){
-  return a.second > b.second;
+bool operator> (const min_heap_element& a, const min_heap_element& b){
+  return a.score > b.score;
 }
 
-bool operator<(const min_heap_element& a, const min_heap_element& b){
-  return a.second < b.second;
+bool operator< (const min_heap_element& a, const min_heap_element& b){
+  return a.score < b.score;
 }
 
 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
@@ -56,25 +65,34 @@
 class MultiProbe{
 protected:
   min_heap_of_perturbation_set* minHeap;
-  vector_of_perturbation_set* outSets;
-  vector_of_sorted_distance_functions* distFuns;
+  min_heap_of_perturbation_set* outSets;
+  vector<sorted_distance_functions>* distFuns;
   unsigned numHashBoundaries;
-public: 
-  MultiProbe();
-  ~MultiProbe();
 
+  // data structure initialization and cleanup
+  void initialize();
+  void cleanup();
+  
   // perturbation set operations
   perturbation_set& shift(perturbation_set&);
   perturbation_set& expand(perturbation_set&);
   double score(perturbation_set&);
   bool valid(perturbation_set&);
+  void makeSortedDistFuns(vector<double> &);
+
+public: 
+  MultiProbe();
+  ~MultiProbe();
 
   // generation of perturbation sets
-  void generatePerturbationSets(int, vector<double>&);
-  void makeSortedDistFuns(vector<double> &);
-  void dump(perturbation_set& );
+  void generatePerturbationSets(vector<double>& vectorOfBounaryDistances, int numSetsToGenerate);  
+  perturbation_set getNextPerturbationSet();
+  void dump(perturbation_set);
+  size_t size(); // Number of perturbation sets are in the output queue
+  bool empty(); // predicate for empty MultiProbe set
 };
 
+
 /* NOTES:
 
 Reference:
@@ -113,3 +131,5 @@
   end for
 
  */
+
+#endif // __LSH_MULTIPROBE_