mas01mc@754: /* mas01mc@754: * MultiProbe C++ class mas01mc@754: * mas01mc@754: * Given a vector of LSH boundary distances for a query, mas01mc@754: * perform lookup by probing nearby hash-function locations mas01mc@754: * mas01mc@754: * Implementation using C++ STL mas01mc@754: * mas01mc@754: * Reference: mas01mc@754: * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, mas01mc@754: * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity mas01mc@754: * Search", Proc. Intl. Conf. VLDB, 2007 mas01mc@754: * mas01mc@754: * mas01mc@754: * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved mas01mc@754: * License: GNU Public License 2.0 mas01mc@754: * mas01mc@754: */ mas01mc@754: mas01mc@754: #include "multiprobe.h" mas01mc@754: mas01mc@754: //#define _TEST_MP_LSH mas01mc@754: mas01mc@754: bool operator> (const min_heap_element& a, const min_heap_element& b){ mas01mc@754: return a.score > b.score; mas01mc@754: } mas01mc@754: mas01mc@754: bool operator< (const min_heap_element& a, const min_heap_element& b){ mas01mc@754: return a.score < b.score; mas01mc@754: } mas01mc@754: mas01mc@754: bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){ mas01mc@754: return a.first > b.first; mas01mc@754: } mas01mc@754: mas01mc@754: bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){ mas01mc@754: return a.first < b.first; mas01mc@754: } mas01mc@754: mas01mc@754: MinHeapElement::MinHeapElement(perturbation_set a, float s): mas01mc@754: perturbs(a), mas01mc@754: score(s) mas01mc@754: { mas01mc@754: mas01mc@754: } mas01mc@754: mas01mc@754: MinHeapElement::~MinHeapElement(){;} mas01mc@754: mas01mc@754: MultiProbe::MultiProbe(): mas01mc@754: minHeap(0), mas01mc@754: outSets(0), mas01mc@754: distFuns(0), mas01mc@754: numHashBoundaries(0) mas01mc@754: { mas01mc@754: mas01mc@754: } mas01mc@754: mas01mc@754: MultiProbe::~MultiProbe(){ mas01mc@754: cleanup(); mas01mc@754: } mas01mc@754: mas01mc@754: void MultiProbe::initialize(){ mas01mc@754: minHeap = new min_heap_of_perturbation_set(); mas01mc@754: outSets = new min_heap_of_perturbation_set(); mas01mc@754: } mas01mc@754: mas01mc@754: void MultiProbe::cleanup(){ mas01mc@754: delete minHeap; mas01mc@754: minHeap = 0; mas01mc@754: delete outSets; mas01mc@754: outSets = 0; mas01mc@754: delete distFuns; mas01mc@754: distFuns = 0; mas01mc@754: } mas01mc@754: mas01mc@754: size_t MultiProbe::size(){ mas01mc@754: return outSets->size(); mas01mc@754: } mas01mc@754: mas01mc@754: bool MultiProbe::empty(){ mas01mc@754: return !outSets->size(); mas01mc@754: } mas01mc@754: mas01mc@754: mas01mc@754: void MultiProbe::generatePerturbationSets(vector& x, unsigned T){ mas01mc@754: cleanup(); // Make re-entrant mas01mc@754: initialize(); mas01mc@754: makeSortedDistFuns(x); mas01mc@754: algorithm1(T); mas01mc@754: } mas01mc@754: mas01mc@754: // overloading to support efficient array use without initial copy mas01mc@754: void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){ mas01mc@754: cleanup(); // Make re-entrant mas01mc@754: initialize(); mas01mc@754: makeSortedDistFuns(x, N); mas01mc@754: algorithm1(T); mas01mc@754: } mas01mc@754: mas01mc@754: // Generate the optimal T perturbation sets for current query mas01mc@754: // pre-conditions: mas01mc@754: // an LSH structure was initialized and passed to constructor mas01mc@754: // a query vector was passed to lsh->compute_hash_functions() mas01mc@754: // the query-to-boundary distances are stored in x[hashFunIndex] mas01mc@754: // mas01mc@754: // post-conditions: mas01mc@754: // generates an ordered list of perturbation sets (stored as an array of sets) mas01mc@754: // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j mas01mc@754: // data structures are cleared and reset to zeros thereby making them re-entrant mas01mc@754: // mas01mc@754: void MultiProbe::algorithm1(unsigned T){ mas01mc@754: perturbation_set ai,as,ae; mas01mc@754: float ai_score; mas01mc@754: ai.insert(0); // Initialize for this query mas01mc@754: minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe mas01mc@754: mas01mc@754: min_heap_element mhe = minHeap->top(); mas01mc@754: mas01mc@754: if(T>distFuns->size()) mas01mc@754: T = distFuns->size(); mas01mc@754: for(unsigned i = 0 ; i != T ; i++ ){ mas01mc@754: do{ mas01mc@754: mhe = minHeap->top(); mas01mc@754: ai = mhe.perturbs; mas01mc@754: ai_score = mhe.score; mas01mc@754: minHeap->pop(); mas01mc@754: as=ai; mas01mc@754: shift(as); mas01mc@754: minHeap->push(min_heap_element(as, score(as))); mas01mc@754: ae=ai; mas01mc@754: expand(ae); mas01mc@754: minHeap->push(min_heap_element(ae, score(ae))); mas01mc@754: }while(!valid(ai)); mas01mc@754: outSets->push(mhe); // Ordered list of perturbation sets mas01mc@754: } mas01mc@754: } mas01mc@754: mas01mc@754: void MultiProbe::dump(perturbation_set a){ mas01mc@754: perturbation_set::iterator it = a.begin(); mas01mc@754: while(it != a.end()){ mas01mc@754: cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " " mas01mc@754: << (*distFuns)[*it].first << *it << ", "; mas01mc@754: it++; mas01mc@754: } mas01mc@754: cout << "(" << score(a) << ")"; mas01mc@754: cout << endl; mas01mc@754: } mas01mc@754: mas01mc@754: // Given the set a, add 1 to last element of the set mas01mc@754: inline perturbation_set& MultiProbe::shift(perturbation_set& a){ mas01mc@754: perturbation_set::iterator it = a.end(); mas01mc@754: int val = *(--it) + 1; mas01mc@754: a.erase(it); mas01mc@754: a.insert(it,val); mas01mc@754: return a; mas01mc@754: } mas01mc@754: mas01mc@754: // Given the set a, add a new element one greater than the max mas01mc@754: inline perturbation_set& MultiProbe::expand(perturbation_set& a){ mas01mc@754: perturbation_set::reverse_iterator ri = a.rbegin(); mas01mc@754: a.insert(*ri+1); mas01mc@754: return a; mas01mc@754: } mas01mc@754: mas01mc@754: // Take the list of distances (x) assuming list len is 2M and mas01mc@754: // delta = (-1)^i, i = { 0 .. 2M-1 } mas01mc@754: void MultiProbe::makeSortedDistFuns(vector& x){ mas01mc@754: numHashBoundaries = x.size(); // x.size() == 2M mas01mc@754: delete distFuns; mas01mc@754: distFuns = new std::vector(numHashBoundaries); mas01mc@754: for(unsigned i = 0; i != numHashBoundaries ; i++ ) mas01mc@754: (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); mas01mc@754: // SORT mas01mc@754: sort( distFuns->begin(), distFuns->end() ); mas01mc@754: } mas01mc@754: mas01mc@754: // Float array version of above mas01mc@754: void MultiProbe::makeSortedDistFuns(float* x, unsigned N){ mas01mc@754: numHashBoundaries = N; // x.size() == 2M mas01mc@754: delete distFuns; mas01mc@754: distFuns = new std::vector(numHashBoundaries); mas01mc@754: for(unsigned i = 0; i != numHashBoundaries ; i++ ) mas01mc@754: (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); mas01mc@754: // SORT mas01mc@754: sort( distFuns->begin(), distFuns->end() ); mas01mc@754: } mas01mc@754: mas01mc@754: // For a given perturbation set, the score is the mas01mc@754: // sum of squares of corresponding distances in x mas01mc@754: float MultiProbe::score(perturbation_set& a){ mas01mc@754: //assert(!a.empty()); mas01mc@754: float score = 0.0, tmp; mas01mc@754: perturbation_set::iterator it; mas01mc@754: it = a.begin(); mas01mc@754: do{ mas01mc@754: tmp = (*distFuns)[*it].first; mas01mc@754: score += tmp*tmp; mas01mc@754: }while( ++it != a.end() ); mas01mc@754: return score; mas01mc@754: } mas01mc@754: mas01mc@754: // A valid set must have at most one mas01mc@754: // of the two elements {j, 2M + 1 - j} for every j mas01mc@754: // mas01mc@754: // A perturbation set containing an element > 2M is invalid mas01mc@754: bool MultiProbe::valid(perturbation_set& a){ mas01mc@754: int j; mas01mc@754: perturbation_set::iterator it = a.begin(); mas01mc@754: while( it != a.end() ){ mas01mc@754: j = *it; mas01mc@754: it++; mas01mc@754: if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries - j - 1 ) != a.end() ) ) mas01mc@754: return false; mas01mc@754: } mas01mc@754: return true; mas01mc@754: } mas01mc@754: mas01mc@754: int MultiProbe::getIndex(perturbation_set::iterator it){ mas01mc@754: return (*distFuns)[*it].second.first; mas01mc@754: } mas01mc@754: mas01mc@754: int MultiProbe::getBoundary(perturbation_set::iterator it){ mas01mc@754: return (*distFuns)[*it].second.second; mas01mc@754: } mas01mc@754: mas01mc@754: // copy return next perturbation_set mas01mc@754: perturbation_set MultiProbe::getNextPerturbationSet(){ mas01mc@754: perturbation_set s = outSets->top().perturbs; mas01mc@754: outSets->pop(); mas01mc@754: return s; mas01mc@754: } mas01mc@754: mas01mc@754: // Test routine: generate 100 random boundary distance pairs mas01mc@754: // call generatePerturbationSets on these distances mas01mc@754: // dump output for inspection mas01mc@754: #ifdef _TEST_MP_LSH mas01mc@754: int main(const int argc, const char* argv[]){ mas01mc@754: int N_SAMPS = 100; // Number of random samples mas01mc@754: int W = 4; // simulated hash-bucket size mas01mc@754: int N_ITER = 100; // How many re-entrant iterations mas01mc@754: unsigned T = 10; // Number of multi-probe sets to generate mas01mc@754: mas01mc@754: MultiProbe mp= MultiProbe(); mas01mc@754: vector x(N_SAMPS); mas01mc@754: mas01mc@754: srand((unsigned)time(0)); mas01mc@754: mas01mc@754: // Test re-entrance on single instance mas01mc@754: for(int j = 0; j< N_ITER ; j++){ mas01mc@754: cout << "********** ITERATION " << j << " **********" << endl; mas01mc@754: cout.flush(); mas01mc@754: for (int i = 0 ; i != x.size()/2 ; i++ ){ mas01mc@754: x[2*i] = W*(rand()/(RAND_MAX+1.0)); mas01mc@754: x[2*i+1] = W - x[2*i]; mas01mc@754: } mas01mc@754: // Generate multi-probe sets mas01mc@754: mp.generatePerturbationSets(x, T); mas01mc@754: // Output contents of multi-probe sets mas01mc@754: while(!mp.empty()) mas01mc@754: mp.dump(mp.getNextPerturbationSet()); mas01mc@754: } mas01mc@754: } mas01mc@754: #endif