changeset 517:807c8be7dd45 multiprobeLSH

Completed multiprobe framework for LSH. Requires testing.
author mas01mc
date Sun, 25 Jan 2009 06:10:38 +0000
parents 2a7bad47a4a7
children ca1ee92c359c
files multiprobe.cpp multiprobe.h
diffstat 2 files changed, 90 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/multiprobe.cpp	Sat Jan 24 19:51:46 2009 +0000
+++ b/multiprobe.cpp	Sun Jan 25 06:10:38 2009 +0000
@@ -21,7 +21,12 @@
 
 #define _TEST_MP_LSH
 
-MultiProbe::MultiProbe(){
+MultiProbe::MultiProbe():
+  minHeap(0),
+  outSets(0),
+  distFuns(0),
+  numHashBoundaries(0)
+{
   minHeap = new min_heap_of_perturbation_set();
   outSets = new vector_of_perturbation_set();
 }
@@ -30,14 +35,7 @@
   // FIXME: Are these arrays ?  
   delete minHeap;
   delete outSets;
-}
-
-perturbation_set& MultiProbe::shift(perturbation_set a){  
-  // a[a.length]++
-}
-
-perturbation_set& MultiProbe::expand(perturbation_set a){
-  // a[a.length+1]=a[a.length]+1;
+  delete distFuns;
 }
 
 // Generate the optimal T perturbation sets for current query
@@ -52,12 +50,12 @@
 //  these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
 //  data structures are cleared and reset to zeros thereby making them re-entrant
 //
-void MultiProbe::generatePerturbationSets(int T, vector<int>& x){
+void MultiProbe::generatePerturbationSets(int T, vector<double>& x){
   perturbation_set ai,as,ae;
 
-  distFuns = makeSortedDistFuns(x); // NEED TO IMPLEMENT, distances from query to boundaries
+  makeSortedDistFuns(x);
 
-  ai.insert(1);
+  ai.insert(1); // Initialize for this query
   minHeap->push(make_pair(perturbation_set(ai), score(ai))); // unique instance stored in mhe
 
   for(int i = 0 ; i < T ; i++ ){
@@ -65,47 +63,95 @@
       min_heap_element mhe = minHeap->top();
       minHeap->pop();
       ai = mhe.first;
-      as = shift(perturbation_set(ai));
+      cout << "ai: ";
+      dump(ai);
+      as = perturbation_set(ai);
+      shift(as);
+      cout << "as: ";
+      dump(as);
       minHeap->push(make_pair(perturbation_set(as), score(as)));
-      ae = expand(perturbation_set(ai));
+      ae = perturbation_set(ai);
+      expand(ae);
+      cout << "ae: ";
+      dump(ae);
       minHeap->push(make_pair(perturbation_set(ae), score(ae)));
     }while(!valid(ai));
     outSets->push_back(ai); // Ordered list of perturbation sets
   }
 }
 
+void MultiProbe::dump(perturbation_set& a){
+  perturbation_set::iterator it = a.begin();
+  while(it != a.end())
+    cout << *it++ << " ";
+  cout << "\n";
+}
+
+// Given the set a, add 1 to last element of the set
+perturbation_set& MultiProbe::shift(perturbation_set& a){  
+  perturbation_set::iterator it = a.end();
+  int val = *(--it) + 1;
+  a.erase(it);
+  a.insert(it,val);
+}
+
+// Given the set a, add a new element one greater than the max
+perturbation_set& MultiProbe::expand(perturbation_set& a){
+  perturbation_set::reverse_iterator ri = a.rbegin();
+  a.insert(*ri+1);
+}
 
 // Take the list of distances (x) assuming list len is 2M and
 // delta = (-1)^i, i = { 0 .. 2M-1 }
-sorted_distance_functions& MultiProbe::makeSortedDistFuns(vector<int>& x){
-  // Change these to vector<pair<double, pair<int,int> > >
-  sorted_distance_functions* distFuns = new sorted_distance_functions[x.size()];
-  for(int i = 0; i < x.size() ; i++){  
-    distFuns[i] = make_pair(x[i], make_pair(i, i%2?-1:+1));
+void MultiProbe::makeSortedDistFuns(vector<double>& x){
+  numHashBoundaries = x.size(); // x.size() == 2M
+  delete distFuns;
+  distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
+  for(int i = 0; i != numHashBoundaries ; i++ ){  
+    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?-1:+1));
   }
+  // SORT
+  sort( distFuns->begin(), distFuns->end() );
 }
 
+// For a given perturbation set, the score is the 
+// sum of squares of corresponding distances in x
 double MultiProbe::score(perturbation_set& a){
-  // Needs implementing
+  //assert(!a.empty());
+  double score = 0.0, tmp;
+  perturbation_set::iterator it;
+  it = a.begin();
+  do{
+    tmp = (*distFuns)[*it++].first;
+    score += tmp*tmp;
+  }while( it != a.end() );
+  return score;
 }
 
+// A valid set must have at most one
+// of the two elements {j, 2M + 1 - j} for every j
+//
+// A perturbation set containing an element > 2M is invalid
 bool MultiProbe::valid(perturbation_set& a){
-
-
-  // A valid set must have at most one
-  // of the two elements {j, 2M + 1 - j} for every j
-  //
-  // A perturbation set containing an element > 2M is invalid
-
-  
-
+  int j;  
+  perturbation_set::iterator it = a.begin();  
+  while( it != a.end() ){
+    j = *it++;
+    if( ( j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) )
+      return false;    
+  }
+  return true;
 }
 
 #ifdef _TEST_MP_LSH
 int main(const int argc, const char* argv[]){
-
-  MultiProbe mp();
-
+  MultiProbe mp = MultiProbe();  
+  vector<double> x(4);
+  x[0]=0.1;
+  x[1]=0.9;
+  x[2]=0.2;
+  x[3]=0.8;
+  mp.generatePerturbationSets(2, x);
 }
 #endif
 
--- a/multiprobe.h	Sat Jan 24 19:51:46 2009 +0000
+++ b/multiprobe.h	Sun Jan 25 06:10:38 2009 +0000
@@ -20,6 +20,8 @@
 #include <queue>
 #include <vector>
 #include <set>
+#include <algorithm>
+#include <iostream>
 
 using namespace std ;
 
@@ -33,6 +35,7 @@
   greater<min_heap_element> 
   > min_heap_of_perturbation_set ;
 typedef pair<double, pair<int, int> > sorted_distance_functions ;
+typedef vector<sorted_distance_functions> vector_of_sorted_distance_functions;
 
 bool operator>(const min_heap_element& a, const min_heap_element& b){
   return a.second > b.second;
@@ -43,31 +46,33 @@
 }
 
 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
-  return a.second > b.second;
+  return a.first > b.first;
 }
 
 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
-  return a.second < b.second;
+  return a.first < b.first;
 }
 
 class MultiProbe{
 protected:
   min_heap_of_perturbation_set* minHeap;
   vector_of_perturbation_set* outSets;
-  sorted_distance_functions distFuns;
+  vector_of_sorted_distance_functions* distFuns;
+  unsigned numHashBoundaries;
 public: 
   MultiProbe();
   ~MultiProbe();
 
   // perturbation set operations
-  perturbation_set& shift(perturbation_set);
-  perturbation_set& expand(perturbation_set);
+  perturbation_set& shift(perturbation_set&);
+  perturbation_set& expand(perturbation_set&);
   double score(perturbation_set&);
   bool valid(perturbation_set&);
 
   // generation of perturbation sets
-  void generatePerturbationSets(int, vector<int>&);
-  sorted_distance_functions& makeSortedDistFuns(vector<int> &);
+  void generatePerturbationSets(int, vector<double>&);
+  void makeSortedDistFuns(vector<double> &);
+  void dump(perturbation_set& );
 };
 
 /* NOTES: