mas01mc@516: /* mas01mc@516: * MultiProbe C++ class mas01mc@516: * mas01mc@518: * Given a vector of LSH boundary distances for a query, mas01mc@518: * perform lookup by probing nearby hash-function locations mas01mc@516: * mas01mc@516: * Implementation using C++ STL mas01mc@516: * mas01mc@516: * Reference: mas01mc@516: * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, mas01mc@516: * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity mas01mc@516: * Search", Proc. Intl. Conf. VLDB, 2007 mas01mc@516: * mas01mc@516: * mas01mc@516: * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved mas01mc@516: * License: GNU Public License 2.0 mas01mc@516: * mas01mc@516: */ mas01mc@516: mas01mc@516: #include "multiprobe.h" mas01mc@516: mas01mc@519: //#define _TEST_MP_LSH mas01mc@516: mas01mc@519: bool operator> (const min_heap_element& a, const min_heap_element& b){ mas01mc@519: return a.score > b.score; mas01mc@519: } mas01mc@519: mas01mc@519: bool operator< (const min_heap_element& a, const min_heap_element& b){ mas01mc@519: return a.score < b.score; mas01mc@519: } mas01mc@519: mas01mc@519: bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){ mas01mc@519: return a.first > b.first; mas01mc@519: } mas01mc@519: mas01mc@519: bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){ mas01mc@519: return a.first < b.first; mas01mc@519: } mas01mc@519: mas01mc@519: MinHeapElement::MinHeapElement(perturbation_set a, float s): mas01mc@518: perturbs(a), mas01mc@518: score(s) mas01mc@518: { mas01mc@518: mas01mc@518: } mas01mc@518: mas01mc@518: MinHeapElement::~MinHeapElement(){;} mas01mc@516: mas01mc@517: MultiProbe::MultiProbe(): mas01mc@517: minHeap(0), mas01mc@517: outSets(0), mas01mc@517: distFuns(0), mas01mc@517: numHashBoundaries(0) mas01mc@517: { mas01mc@518: mas01mc@516: } mas01mc@516: mas01mc@516: MultiProbe::~MultiProbe(){ mas01mc@518: cleanup(); mas01mc@518: } mas01mc@518: mas01mc@518: void MultiProbe::initialize(){ mas01mc@518: minHeap = new min_heap_of_perturbation_set(); mas01mc@518: outSets = new min_heap_of_perturbation_set(); mas01mc@518: } mas01mc@518: mas01mc@518: void MultiProbe::cleanup(){ mas01mc@516: delete minHeap; mas01mc@518: minHeap = 0; mas01mc@516: delete outSets; mas01mc@518: outSets = 0; mas01mc@517: delete distFuns; mas01mc@518: distFuns = 0; mas01mc@516: } mas01mc@516: mas01mc@519: size_t MultiProbe::size(){ mas01mc@518: return outSets->size(); mas01mc@518: } mas01mc@518: mas01mc@519: bool MultiProbe::empty(){ mas01mc@518: return !outSets->size(); mas01mc@518: } mas01mc@518: mas01mc@518: mas01mc@519: void MultiProbe::generatePerturbationSets(vector& x, unsigned T){ mas01mc@519: cleanup(); // Make re-entrant mas01mc@519: initialize(); mas01mc@519: makeSortedDistFuns(x); mas01mc@519: algorithm1(T); mas01mc@519: } mas01mc@519: mas01mc@519: // overloading to support efficient array use without initial copy mas01mc@519: void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){ mas01mc@519: cleanup(); // Make re-entrant mas01mc@519: initialize(); mas01mc@519: makeSortedDistFuns(x, N); mas01mc@519: algorithm1(T); mas01mc@519: } mas01mc@519: mas01mc@516: // Generate the optimal T perturbation sets for current query mas01mc@516: // pre-conditions: mas01mc@516: // an LSH structure was initialized and passed to constructor mas01mc@516: // a query vector was passed to lsh->compute_hash_functions() mas01mc@516: // the query-to-boundary distances are stored in x[hashFunIndex] mas01mc@516: // mas01mc@516: // post-conditions: mas01mc@516: // generates an ordered list of perturbation sets (stored as an array of sets) mas01mc@516: // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j mas01mc@516: // data structures are cleared and reset to zeros thereby making them re-entrant mas01mc@516: // mas01mc@519: void MultiProbe::algorithm1(unsigned T){ mas01mc@516: perturbation_set ai,as,ae; mas01mc@519: float ai_score; mas01mc@518: ai.insert(0); // Initialize for this query mas01mc@518: minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe mas01mc@516: mas01mc@518: min_heap_element mhe = minHeap->top(); mas01mc@518: mas01mc@522: if(T>distFuns->size()) mas01mc@522: T = distFuns->size(); mas01mc@519: for(unsigned i = 0 ; i != T ; i++ ){ mas01mc@516: do{ mas01mc@518: mhe = minHeap->top(); mas01mc@518: ai = mhe.perturbs; mas01mc@518: ai_score = mhe.score; mas01mc@516: minHeap->pop(); mas01mc@518: as=ai; mas01mc@517: shift(as); mas01mc@518: minHeap->push(min_heap_element(as, score(as))); mas01mc@518: ae=ai; mas01mc@517: expand(ae); mas01mc@518: minHeap->push(min_heap_element(ae, score(ae))); mas01mc@516: }while(!valid(ai)); mas01mc@518: outSets->push(mhe); // Ordered list of perturbation sets mas01mc@516: } mas01mc@516: } mas01mc@516: mas01mc@518: void MultiProbe::dump(perturbation_set a){ mas01mc@517: perturbation_set::iterator it = a.begin(); mas01mc@518: while(it != a.end()){ mas01mc@518: cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " " mas01mc@518: << (*distFuns)[*it].first << *it << ", "; mas01mc@518: it++; mas01mc@518: } mas01mc@518: cout << "(" << score(a) << ")"; mas01mc@518: cout << endl; mas01mc@517: } mas01mc@517: mas01mc@517: // Given the set a, add 1 to last element of the set mas01mc@518: inline perturbation_set& MultiProbe::shift(perturbation_set& a){ mas01mc@517: perturbation_set::iterator it = a.end(); mas01mc@517: int val = *(--it) + 1; mas01mc@517: a.erase(it); mas01mc@517: a.insert(it,val); mas01mc@519: return a; mas01mc@517: } mas01mc@517: mas01mc@517: // Given the set a, add a new element one greater than the max mas01mc@518: inline perturbation_set& MultiProbe::expand(perturbation_set& a){ mas01mc@517: perturbation_set::reverse_iterator ri = a.rbegin(); mas01mc@517: a.insert(*ri+1); mas01mc@519: return a; mas01mc@517: } mas01mc@516: mas01mc@516: // Take the list of distances (x) assuming list len is 2M and mas01mc@516: // delta = (-1)^i, i = { 0 .. 2M-1 } mas01mc@519: void MultiProbe::makeSortedDistFuns(vector& x){ mas01mc@517: numHashBoundaries = x.size(); // x.size() == 2M mas01mc@517: delete distFuns; mas01mc@517: distFuns = new std::vector(numHashBoundaries); mas01mc@519: for(unsigned i = 0; i != numHashBoundaries ; i++ ) mas01mc@520: (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); mas01mc@519: // SORT mas01mc@519: sort( distFuns->begin(), distFuns->end() ); mas01mc@519: } mas01mc@519: mas01mc@519: // Float array version of above mas01mc@519: void MultiProbe::makeSortedDistFuns(float* x, unsigned N){ mas01mc@519: numHashBoundaries = N; // x.size() == 2M mas01mc@519: delete distFuns; mas01mc@519: distFuns = new std::vector(numHashBoundaries); mas01mc@519: for(unsigned i = 0; i != numHashBoundaries ; i++ ) mas01mc@520: (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); mas01mc@517: // SORT mas01mc@517: sort( distFuns->begin(), distFuns->end() ); mas01mc@516: } mas01mc@516: mas01mc@517: // For a given perturbation set, the score is the mas01mc@517: // sum of squares of corresponding distances in x mas01mc@519: float MultiProbe::score(perturbation_set& a){ mas01mc@517: //assert(!a.empty()); mas01mc@519: float score = 0.0, tmp; mas01mc@517: perturbation_set::iterator it; mas01mc@517: it = a.begin(); mas01mc@517: do{ mas01mc@518: tmp = (*distFuns)[*it].first; mas01mc@517: score += tmp*tmp; mas01mc@518: }while( ++it != a.end() ); mas01mc@517: return score; mas01mc@516: } mas01mc@516: mas01mc@517: // A valid set must have at most one mas01mc@517: // of the two elements {j, 2M + 1 - j} for every j mas01mc@517: // mas01mc@517: // A perturbation set containing an element > 2M is invalid mas01mc@516: bool MultiProbe::valid(perturbation_set& a){ mas01mc@517: int j; mas01mc@517: perturbation_set::iterator it = a.begin(); mas01mc@517: while( it != a.end() ){ mas01mc@518: j = *it; mas01mc@518: it++; mas01mc@520: if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries - j - 1 ) != a.end() ) ) mas01mc@517: return false; mas01mc@517: } mas01mc@517: return true; mas01mc@516: } mas01mc@516: mas01mc@519: int MultiProbe::getIndex(perturbation_set::iterator it){ mas01mc@519: return (*distFuns)[*it].second.first; mas01mc@519: } mas01mc@519: mas01mc@519: int MultiProbe::getBoundary(perturbation_set::iterator it){ mas01mc@519: return (*distFuns)[*it].second.second; mas01mc@519: } mas01mc@519: mas01mc@518: // copy return next perturbation_set mas01mc@518: perturbation_set MultiProbe::getNextPerturbationSet(){ mas01mc@518: perturbation_set s = outSets->top().perturbs; mas01mc@518: outSets->pop(); mas01mc@518: return s; mas01mc@518: } mas01mc@518: mas01mc@518: // Test routine: generate 100 random boundary distance pairs mas01mc@518: // call generatePerturbationSets on these distances mas01mc@518: // dump output for inspection mas01mc@516: #ifdef _TEST_MP_LSH mas01mc@516: int main(const int argc, const char* argv[]){ mas01mc@518: int N_SAMPS = 100; // Number of random samples mas01mc@518: int W = 4; // simulated hash-bucket size mas01mc@518: int N_ITER = 100; // How many re-entrant iterations mas01mc@519: unsigned T = 10; // Number of multi-probe sets to generate mas01mc@518: mas01mc@518: MultiProbe mp= MultiProbe(); mas01mc@519: vector x(N_SAMPS); mas01mc@518: mas01mc@518: srand((unsigned)time(0)); mas01mc@518: mas01mc@518: // Test re-entrance on single instance mas01mc@518: for(int j = 0; j< N_ITER ; j++){ mas01mc@518: cout << "********** ITERATION " << j << " **********" << endl; mas01mc@518: cout.flush(); mas01mc@518: for (int i = 0 ; i != x.size()/2 ; i++ ){ mas01mc@518: x[2*i] = W*(rand()/(RAND_MAX+1.0)); mas01mc@518: x[2*i+1] = W - x[2*i]; mas01mc@518: } mas01mc@518: // Generate multi-probe sets mas01mc@518: mp.generatePerturbationSets(x, T); mas01mc@518: // Output contents of multi-probe sets mas01mc@518: while(!mp.empty()) mas01mc@518: mp.dump(mp.getNextPerturbationSet()); mas01mc@518: } mas01mc@516: } mas01mc@516: #endif