changeset 516:2a7bad47a4a7 multiprobeLSH

New branch to implement multiprobe LSH. Copy of trunk:802. Added multiprobe.{cpp,h} source files.
author mas01mc
date Sat, 24 Jan 2009 19:51:46 +0000
parents c7bdb7913762
children 807c8be7dd45
files lshlib.cpp multiprobe.cpp multiprobe.h
diffstat 3 files changed, 227 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/lshlib.cpp	Sat Jan 24 09:39:39 2009 +0000
+++ b/lshlib.cpp	Sat Jan 24 19:51:46 2009 +0000
@@ -138,7 +138,7 @@
     }
   }
 
-  // Storage for whole or partial function evaluation depdenting on USE_U_FUNCTIONS
+  // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS
   H::initialize_partial_functions();
 }
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multiprobe.cpp	Sat Jan 24 19:51:46 2009 +0000
@@ -0,0 +1,116 @@
+/*
+ * MultiProbe C++ class
+ *
+ * Given a LSH structure, perform lookup by probing nearby hash-function locations
+ *
+ * Implementation using C++ STL
+ *
+ * Reference:
+ * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
+ * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
+ * Search", Proc. Intl. Conf. VLDB, 2007
+ *
+ *
+ * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
+ * License: GNU Public License 2.0
+ *
+ */
+
+#include "multiprobe.h"
+
+
+#define _TEST_MP_LSH
+
+MultiProbe::MultiProbe(){
+  minHeap = new min_heap_of_perturbation_set();
+  outSets = new vector_of_perturbation_set();
+}
+
+MultiProbe::~MultiProbe(){  
+  // FIXME: Are these arrays ?  
+  delete minHeap;
+  delete outSets;
+}
+
+perturbation_set& MultiProbe::shift(perturbation_set a){  
+  // a[a.length]++
+}
+
+perturbation_set& MultiProbe::expand(perturbation_set a){
+  // a[a.length+1]=a[a.length]+1;
+}
+
+// Generate the optimal T perturbation sets for current query
+// pre-conditions:
+//   an LSH structure was initialized and passed to constructor
+//   a query vector was passed to lsh->compute_hash_functions()
+//   the query-to-boundary distances are stored in x[hashFunIndex]
+//
+// post-conditions:
+//
+//  generates an ordered list of perturbation sets (stored as an array of sets)
+//  these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
+//  data structures are cleared and reset to zeros thereby making them re-entrant
+//
+void MultiProbe::generatePerturbationSets(int T, vector<int>& x){
+  perturbation_set ai,as,ae;
+
+  distFuns = makeSortedDistFuns(x); // NEED TO IMPLEMENT, distances from query to boundaries
+
+  ai.insert(1);
+  minHeap->push(make_pair(perturbation_set(ai), score(ai))); // unique instance stored in mhe
+
+  for(int i = 0 ; i < T ; i++ ){
+    do{
+      min_heap_element mhe = minHeap->top();
+      minHeap->pop();
+      ai = mhe.first;
+      as = shift(perturbation_set(ai));
+      minHeap->push(make_pair(perturbation_set(as), score(as)));
+      ae = expand(perturbation_set(ai));
+      minHeap->push(make_pair(perturbation_set(ae), score(ae)));
+    }while(!valid(ai));
+    outSets->push_back(ai); // Ordered list of perturbation sets
+  }
+}
+
+
+// Take the list of distances (x) assuming list len is 2M and
+// delta = (-1)^i, i = { 0 .. 2M-1 }
+sorted_distance_functions& MultiProbe::makeSortedDistFuns(vector<int>& x){
+  // Change these to vector<pair<double, pair<int,int> > >
+  sorted_distance_functions* distFuns = new sorted_distance_functions[x.size()];
+  for(int i = 0; i < x.size() ; i++){  
+    distFuns[i] = make_pair(x[i], make_pair(i, i%2?-1:+1));
+  }
+}
+
+double MultiProbe::score(perturbation_set& a){
+  // Needs implementing
+}
+
+bool MultiProbe::valid(perturbation_set& a){
+
+
+  // A valid set must have at most one
+  // of the two elements {j, 2M + 1 - j} for every j
+  //
+  // A perturbation set containing an element > 2M is invalid
+
+  
+
+}
+
+#ifdef _TEST_MP_LSH
+int main(const int argc, const char* argv[]){
+
+  MultiProbe mp();
+
+}
+#endif
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multiprobe.h	Sat Jan 24 19:51:46 2009 +0000
@@ -0,0 +1,110 @@
+/*
+ * MultiProbe C++ class
+ *
+ * Given a LSH structure, perform lookup by probing nearby hash-function locations
+ *
+ * Implementation using C++ STL
+ *
+ * Reference:
+ * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
+ * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
+ * Search", Proc. Intl. Conf. VLDB, 2007
+ *
+ *
+ * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
+ * License: GNU Public License 2.0
+ *
+ */
+
+#include <functional>
+#include <queue>
+#include <vector>
+#include <set>
+
+using namespace std ;
+
+typedef set<int > perturbation_set ;
+typedef vector<perturbation_set > vector_of_perturbation_set ;
+typedef pair<perturbation_set, float > min_heap_element ;
+typedef vector<min_heap_element > vector_of_min_heap_element ;
+typedef priority_queue<
+  min_heap_element, 
+  vector_of_min_heap_element, 
+  greater<min_heap_element> 
+  > min_heap_of_perturbation_set ;
+typedef pair<double, pair<int, int> > sorted_distance_functions ;
+
+bool operator>(const min_heap_element& a, const min_heap_element& b){
+  return a.second > b.second;
+}
+
+bool operator<(const min_heap_element& a, const min_heap_element& b){
+  return a.second < b.second;
+}
+
+bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
+  return a.second > b.second;
+}
+
+bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
+  return a.second < b.second;
+}
+
+class MultiProbe{
+protected:
+  min_heap_of_perturbation_set* minHeap;
+  vector_of_perturbation_set* outSets;
+  sorted_distance_functions distFuns;
+public: 
+  MultiProbe();
+  ~MultiProbe();
+
+  // perturbation set operations
+  perturbation_set& shift(perturbation_set);
+  perturbation_set& expand(perturbation_set);
+  double score(perturbation_set&);
+  bool valid(perturbation_set&);
+
+  // generation of perturbation sets
+  void generatePerturbationSets(int, vector<int>&);
+  sorted_distance_functions& makeSortedDistFuns(vector<int> &);
+};
+
+/* NOTES:
+
+Reference:
+Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
+"Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
+Search", Proc. Intl. Conf. VLDB, 2007
+
+  i = 1..M (number of hash functions used)
+  f_i(q) = a_i * q + b_i // projection to real line
+  h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot  
+  delta \in {-1, +1}
+  x_i( delta ) = distance of query to left or right boundary
+  Delta = [delta_1 delta_2 ... delta_M]
+  score(Delta) = sum_{i=1}_M x_i(delta_i).^2
+  
+  z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
+  p_j = (i, delta) if z_j == x_i(delta) 
+  
+  A_k is index into p_j
+
+  Multi-probe algorithm (after Lv et al. 2007)
+  ---------------------------------------------
+
+  A0 = {1}
+  minHeap.insert(A0, score(A0))
+  
+  for i = 1 to T do   // T = number of perturbation sets
+     repeat
+        Ai = minHeap.extractMin()
+	As = shift(Ai)
+	minHeap.insert(As, score(As))
+	Ae = expand(Ai)
+	minHeap.insert(Ae, score(Ae))
+     until valid(Ai)
+     output Ai
+  end for
+
+ */