diff multiprobe.cpp @ 754:9bd13c7819ae mkc_lsh_update

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/multiprobe.cpp	Thu Nov 25 13:42:40 2010 +0000
@@ -0,0 +1,263 @@
+/*
+ * MultiProbe C++ class
+ *
+ * Given a vector of LSH boundary distances for a query, 
+ * perform lookup by probing nearby hash-function locations
+ *
+ * Implementation using C++ STL
+ *
+ * Reference:
+ * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
+ * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
+ * Search", Proc. Intl. Conf. VLDB, 2007
+ *
+ *
+ * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
+ * License: GNU Public License 2.0
+ *
+ */
+
+#include "multiprobe.h"
+
+//#define _TEST_MP_LSH
+
+bool operator> (const min_heap_element& a, const min_heap_element& b){
+  return a.score > b.score;
+}
+
+bool operator< (const min_heap_element& a, const min_heap_element& b){
+  return a.score < b.score;
+}
+
+bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
+  return a.first > b.first;
+}
+
+bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
+  return a.first < b.first;
+}
+
+MinHeapElement::MinHeapElement(perturbation_set a, float s):
+  perturbs(a),
+  score(s)
+{
+
+}
+
+MinHeapElement::~MinHeapElement(){;}
+
+MultiProbe::MultiProbe():
+  minHeap(0),
+  outSets(0),
+  distFuns(0),
+  numHashBoundaries(0)
+{
+
+}
+
+MultiProbe::~MultiProbe(){  
+  cleanup();
+}
+
+void MultiProbe::initialize(){
+  minHeap = new min_heap_of_perturbation_set();
+  outSets = new min_heap_of_perturbation_set();
+}
+
+void MultiProbe::cleanup(){
+  delete minHeap;
+  minHeap = 0;
+  delete outSets;
+  outSets = 0;
+  delete distFuns;
+  distFuns = 0;
+}
+
+size_t MultiProbe::size(){
+  return outSets->size();
+}
+
+bool MultiProbe::empty(){
+  return !outSets->size();
+}
+
+
+void MultiProbe::generatePerturbationSets(vector<float>& x, unsigned T){
+  cleanup(); // Make re-entrant
+  initialize();
+  makeSortedDistFuns(x);
+  algorithm1(T);
+}
+
+// overloading to support efficient array use without initial copy
+void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){
+  cleanup(); // Make re-entrant
+  initialize();
+  makeSortedDistFuns(x, N);
+  algorithm1(T);
+}
+
+// Generate the optimal T perturbation sets for current query
+// pre-conditions:
+//   an LSH structure was initialized and passed to constructor
+//   a query vector was passed to lsh->compute_hash_functions()
+//   the query-to-boundary distances are stored in x[hashFunIndex]
+//
+// post-conditions:
+//  generates an ordered list of perturbation sets (stored as an array of sets)
+//  these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
+//  data structures are cleared and reset to zeros thereby making them re-entrant
+//
+void MultiProbe::algorithm1(unsigned T){
+  perturbation_set ai,as,ae;
+  float ai_score;
+  ai.insert(0); // Initialize for this query
+  minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
+
+  min_heap_element mhe = minHeap->top();
+
+  if(T>distFuns->size())
+    T = distFuns->size();
+  for(unsigned i = 0 ; i != T ; i++ ){
+    do{
+      mhe = minHeap->top();
+      ai = mhe.perturbs;
+      ai_score = mhe.score;
+      minHeap->pop();
+      as=ai;
+      shift(as);
+      minHeap->push(min_heap_element(as, score(as)));
+      ae=ai;
+      expand(ae);
+      minHeap->push(min_heap_element(ae, score(ae)));
+    }while(!valid(ai));
+    outSets->push(mhe); // Ordered list of perturbation sets
+  }
+}
+
+void MultiProbe::dump(perturbation_set a){
+  perturbation_set::iterator it = a.begin();
+  while(it != a.end()){
+    cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " " 
+	 << (*distFuns)[*it].first << *it << ", ";
+    it++;    
+  }
+  cout << "(" << score(a) << ")";
+  cout << endl;
+}
+
+// Given the set a, add 1 to last element of the set
+inline perturbation_set& MultiProbe::shift(perturbation_set& a){  
+  perturbation_set::iterator it = a.end();
+  int val = *(--it) + 1;
+  a.erase(it);
+  a.insert(it,val);
+  return a;
+}
+
+// Given the set a, add a new element one greater than the max
+inline perturbation_set& MultiProbe::expand(perturbation_set& a){
+  perturbation_set::reverse_iterator ri = a.rbegin();
+  a.insert(*ri+1);
+  return a;
+}
+
+// Take the list of distances (x) assuming list len is 2M and
+// delta = (-1)^i, i = { 0 .. 2M-1 }
+void MultiProbe::makeSortedDistFuns(vector<float>& x){
+  numHashBoundaries = x.size(); // x.size() == 2M
+  delete distFuns;
+  distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
+  for(unsigned i = 0; i != numHashBoundaries ; i++ )
+    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
+  // SORT
+  sort( distFuns->begin(), distFuns->end() );
+}
+
+// Float array version of above
+void MultiProbe::makeSortedDistFuns(float* x, unsigned N){
+  numHashBoundaries = N; // x.size() == 2M
+  delete distFuns;
+  distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
+  for(unsigned i = 0; i != numHashBoundaries ; i++ )
+    (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
+  // SORT
+  sort( distFuns->begin(), distFuns->end() );
+}
+
+// For a given perturbation set, the score is the 
+// sum of squares of corresponding distances in x
+float MultiProbe::score(perturbation_set& a){
+  //assert(!a.empty());
+  float score = 0.0, tmp;
+  perturbation_set::iterator it;
+  it = a.begin();
+  do{
+    tmp = (*distFuns)[*it].first;
+    score += tmp*tmp;
+  }while( ++it != a.end() );
+  return score;
+}
+
+// A valid set must have at most one
+// of the two elements {j, 2M + 1 - j} for every j
+//
+// A perturbation set containing an element > 2M is invalid
+bool MultiProbe::valid(perturbation_set& a){
+  int j;  
+  perturbation_set::iterator it = a.begin();  
+  while( it != a.end() ){
+    j = *it;
+    it++;
+    if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries - j - 1 ) != a.end() ) )
+      return false;    
+  }
+  return true;
+}
+
+int MultiProbe::getIndex(perturbation_set::iterator it){
+  return (*distFuns)[*it].second.first;
+}
+
+int MultiProbe::getBoundary(perturbation_set::iterator it){
+  return (*distFuns)[*it].second.second;
+}
+
+// copy return next perturbation_set
+perturbation_set MultiProbe::getNextPerturbationSet(){
+  perturbation_set s = outSets->top().perturbs; 
+  outSets->pop();
+  return s;
+}
+
+// Test routine: generate 100 random boundary distance pairs
+// call generatePerturbationSets on these distances
+// dump output for inspection
+#ifdef _TEST_MP_LSH
+int main(const int argc, const char* argv[]){
+  int N_SAMPS = 100; // Number of random samples
+  int W = 4;         // simulated hash-bucket size
+  int N_ITER = 100;  // How many re-entrant iterations
+  unsigned T = 10; // Number of multi-probe sets to generate
+
+  MultiProbe mp= MultiProbe();
+  vector<float> x(N_SAMPS);
+
+  srand((unsigned)time(0)); 
+
+  // Test re-entrance on single instance
+  for(int j = 0; j< N_ITER ; j++){
+    cout << "********** ITERATION " << j << " **********" << endl;
+    cout.flush();
+    for (int i = 0 ; i != x.size()/2 ; i++ ){
+      x[2*i] = W*(rand()/(RAND_MAX+1.0));
+      x[2*i+1] = W - x[2*i];
+    }
+    // Generate multi-probe sets
+    mp.generatePerturbationSets(x, T);
+    // Output contents of multi-probe sets
+    while(!mp.empty())
+      mp.dump(mp.getNextPerturbationSet());
+  }
+}
+#endif