Mercurial > hg > audiodb
view multiprobe.cpp @ 770:c54bc2ffbf92 tip
update tags
author | convert-repo |
---|---|
date | Fri, 16 Dec 2011 11:34:01 +0000 |
parents | 77f7bc99dfd6 |
children |
line wrap: on
line source
/* * MultiProbe C++ class * * Given a vector of LSH boundary distances for a query, * perform lookup by probing nearby hash-function locations * * Implementation using C++ STL * * Reference: * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity * Search", Proc. Intl. Conf. VLDB, 2007 * * * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved * License: GNU Public License 2.0 * */ #include "multiprobe.h" //#define _TEST_MP_LSH bool operator> (const min_heap_element& a, const min_heap_element& b){ return a.score > b.score; } bool operator< (const min_heap_element& a, const min_heap_element& b){ return a.score < b.score; } bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){ return a.first > b.first; } bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){ return a.first < b.first; } MinHeapElement::MinHeapElement(perturbation_set a, float s): perturbs(a), score(s) { } MinHeapElement::~MinHeapElement(){;} MultiProbe::MultiProbe(): minHeap(0), outSets(0), distFuns(0), numHashBoundaries(0) { } MultiProbe::~MultiProbe(){ cleanup(); } void MultiProbe::initialize(){ minHeap = new min_heap_of_perturbation_set(); outSets = new min_heap_of_perturbation_set(); } void MultiProbe::cleanup(){ delete minHeap; minHeap = 0; delete outSets; outSets = 0; delete distFuns; distFuns = 0; } size_t MultiProbe::size(){ return outSets->size(); } bool MultiProbe::empty(){ return !outSets->size(); } void MultiProbe::generatePerturbationSets(vector<float>& x, unsigned T){ cleanup(); // Make re-entrant initialize(); makeSortedDistFuns(x); algorithm1(T); } // overloading to support efficient array use without initial copy void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){ cleanup(); // Make re-entrant initialize(); makeSortedDistFuns(x, N); algorithm1(T); } // Generate the optimal T perturbation sets for current query // pre-conditions: // an LSH structure was initialized and passed to constructor // a query vector was passed to lsh->compute_hash_functions() // the query-to-boundary distances are stored in x[hashFunIndex] // // post-conditions: // generates an ordered list of perturbation sets (stored as an array of sets) // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j // data structures are cleared and reset to zeros thereby making them re-entrant // void MultiProbe::algorithm1(unsigned T){ perturbation_set ai,as,ae; float ai_score; ai.insert(0); // Initialize for this query minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe min_heap_element mhe = minHeap->top(); if(T>distFuns->size()) T = distFuns->size(); for(unsigned i = 0 ; i != T ; i++ ){ do{ mhe = minHeap->top(); ai = mhe.perturbs; ai_score = mhe.score; minHeap->pop(); as=ai; shift(as); minHeap->push(min_heap_element(as, score(as))); ae=ai; expand(ae); minHeap->push(min_heap_element(ae, score(ae))); }while(!valid(ai)); outSets->push(mhe); // Ordered list of perturbation sets } } void MultiProbe::dump(perturbation_set a){ perturbation_set::iterator it = a.begin(); while(it != a.end()){ cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " " << (*distFuns)[*it].first << *it << ", "; it++; } cout << "(" << score(a) << ")"; cout << endl; } // Given the set a, add 1 to last element of the set inline perturbation_set& MultiProbe::shift(perturbation_set& a){ perturbation_set::iterator it = a.end(); int val = *(--it) + 1; a.erase(it); a.insert(it,val); return a; } // Given the set a, add a new element one greater than the max inline perturbation_set& MultiProbe::expand(perturbation_set& a){ perturbation_set::reverse_iterator ri = a.rbegin(); a.insert(*ri+1); return a; } // Take the list of distances (x) assuming list len is 2M and // delta = (-1)^i, i = { 0 .. 2M-1 } void MultiProbe::makeSortedDistFuns(vector<float>& x){ numHashBoundaries = x.size(); // x.size() == 2M delete distFuns; distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries); for(unsigned i = 0; i != numHashBoundaries ; i++ ) (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); // SORT sort( distFuns->begin(), distFuns->end() ); } // Float array version of above void MultiProbe::makeSortedDistFuns(float* x, unsigned N){ numHashBoundaries = N; // x.size() == 2M delete distFuns; distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries); for(unsigned i = 0; i != numHashBoundaries ; i++ ) (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); // SORT sort( distFuns->begin(), distFuns->end() ); } // For a given perturbation set, the score is the // sum of squares of corresponding distances in x float MultiProbe::score(perturbation_set& a){ //assert(!a.empty()); float score = 0.0, tmp; perturbation_set::iterator it; it = a.begin(); do{ tmp = (*distFuns)[*it].first; score += tmp*tmp; }while( ++it != a.end() ); return score; } // A valid set must have at most one // of the two elements {j, 2M + 1 - j} for every j // // A perturbation set containing an element > 2M is invalid bool MultiProbe::valid(perturbation_set& a){ int j; perturbation_set::iterator it = a.begin(); while( it != a.end() ){ j = *it; it++; if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries - j - 1 ) != a.end() ) ) return false; } return true; } int MultiProbe::getIndex(perturbation_set::iterator it){ return (*distFuns)[*it].second.first; } int MultiProbe::getBoundary(perturbation_set::iterator it){ return (*distFuns)[*it].second.second; } // copy return next perturbation_set perturbation_set MultiProbe::getNextPerturbationSet(){ perturbation_set s = outSets->top().perturbs; outSets->pop(); return s; } // Test routine: generate 100 random boundary distance pairs // call generatePerturbationSets on these distances // dump output for inspection #ifdef _TEST_MP_LSH int main(const int argc, const char* argv[]){ int N_SAMPS = 100; // Number of random samples int W = 4; // simulated hash-bucket size int N_ITER = 100; // How many re-entrant iterations unsigned T = 10; // Number of multi-probe sets to generate MultiProbe mp= MultiProbe(); vector<float> x(N_SAMPS); srand((unsigned)time(0)); // Test re-entrance on single instance for(int j = 0; j< N_ITER ; j++){ cout << "********** ITERATION " << j << " **********" << endl; cout.flush(); for (int i = 0 ; i != x.size()/2 ; i++ ){ x[2*i] = W*(rand()/(RAND_MAX+1.0)); x[2*i+1] = W - x[2*i]; } // Generate multi-probe sets mp.generatePerturbationSets(x, T); // Output contents of multi-probe sets while(!mp.empty()) mp.dump(mp.getNextPerturbationSet()); } } #endif