Mercurial > hg > audiodb
changeset 519:fdcd436d7cbd multiprobeLSH
lshlib MultiProbe glue added. Compiles and links against audioDB. Fails LSH tests at the moment.
author | mas01mc |
---|---|
date | Mon, 26 Jan 2009 05:42:15 +0000 |
parents | ca1ee92c359c |
children | 0767731a6cce |
files | Makefile l2norm.cpp lshlib.cpp lshlib.h multiprobe.cpp multiprobe.h |
diffstat | 6 files changed, 169 insertions(+), 59 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile Mon Jan 26 02:50:44 2009 +0000 +++ b/Makefile Mon Jan 26 05:42:15 2009 +0000 @@ -8,7 +8,7 @@ SHARED_LIB_FLAGS=-shared -Wl,-soname, -LIBOBJS=lock.o pointpair.o create.o open.o power.o l2norm.o insert.o status.o query.o dump.o close.o lshlib.o index-utils.o query-indexed.o +LIBOBJS=lock.o pointpair.o create.o open.o power.o l2norm.o insert.o status.o query.o dump.o close.o lshlib.o multiprobe.o index-utils.o query-indexed.o OBJS=$(LIBOBJS) index.o soap.o liszt.o sample.o cmdline.o audioDB.o common.o EXECUTABLE=audioDB
--- a/l2norm.cpp Mon Jan 26 02:50:44 2009 +0000 +++ b/l2norm.cpp Mon Jan 26 05:42:15 2009 +0000 @@ -4,7 +4,7 @@ #include "audioDB-internals.h" static int audiodb_l2norm_existing(adb_t *adb) { - double *data_buffer, *l2norm_buffer; + double *data_buffer = 0, *l2norm_buffer = 0; adb_header_t *header = adb->header; size_t data_buffer_size = align_page_up(header->length); size_t nvectors = header->length / (sizeof(double) * header->dim);
--- a/lshlib.cpp Mon Jan 26 02:50:44 2009 +0000 +++ b/lshlib.cpp Mon Jan 26 05:42:15 2009 +0000 @@ -18,7 +18,10 @@ exit(1); } -H::H(){ +H::H(): + multiProbePtr(new MultiProbe()), + boundaryDistances(0) +{ // Delay initialization of lsh functions until we know the parameters } @@ -38,7 +41,9 @@ L((mm*(mm-1))/2), d(dd), w(ww), - radius(rr) + radius(rr), + multiProbePtr(new MultiProbe()), + boundaryDistances(0) { if(m<2){ @@ -140,6 +145,16 @@ // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS H::initialize_partial_functions(); + + // MultiProbe distance functions, there are 2*k per hashtable + H::boundaryDistances = new float*[ H::L ]; // L x 2k boundary distances + assert( H::boundaryDistances ); // failure + for( j = 0; j < H::L ; j++ ){ // 2*k functions x_i(q) + H::boundaryDistances[j] = new float[ 2*H::k ]; + assert( H::boundaryDistances[j] ); // failure + for( kk = 0; kk < 2*H::k ; kk++ ) + H::boundaryDistances[j][kk] = 0.0f; // initialize with zeros + } } void H::initialize_partial_functions(){ @@ -237,6 +252,12 @@ delete[] H::r1; delete[] H::r2; delete[] H::h; + + // MultiProbe cleanup + for( j = 0 ; j < H::L ; j++ ) + delete[] H::boundaryDistances[j]; + delete[] H::boundaryDistances; + delete multiProbePtr; } @@ -249,7 +270,7 @@ if( v.size() != H::d ) error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality double tmp = 0; - float *pA, *pb; + float *pA, *pb, *bd; Uns32T *pg; int dd; vector<float>::iterator vi; @@ -292,6 +313,7 @@ #else for( aa=0; aa < H::L ; aa++ ){ pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k + bd= *( H::boundaryDistances + aa); for( kk = 0 ; kk != H::k ; kk++ ){ pb = *( H::b + aa ) + kk; pA = * ( * ( H::A + aa ) + kk ); @@ -301,8 +323,11 @@ while( dd-- ) tmp += *pA++ * *vi++; // project tmp += *pb; // translate - tmp *= iw; // scale - *pg++ = (Uns32T) (floor(tmp)); // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z + tmp *= iw; // scale + *pg = (Uns32T) (floor(tmp)); // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z + *bd = (tmp - *pg++);//*w; // boundary distance -1 + *(bd+1) = (1.0f - *bd); //*w; // boundary distance +1 + bd+=2; } } #endif @@ -314,6 +339,34 @@ H::t2 = computeProductModDefaultPrime( g, r2, H::k ); } +// make hash value by purturbating the given hash functions +// according the the boundary distances of the current query +void H::generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){ + assert(!multiProbePtr->empty()); // Test this for now, until all is stable + Uns32T* mpg = new Uns32T(H::k); // temporary array storage + + // Copy the hash bucket identifiers + Uns32T* mpgPtr = mpg; + Uns32T kk = H::k; + while(kk--) + *mpgPtr++ = *g++; + + // Retrieve the next purturbation set + perturbation_set ps = multiProbePtr->getNextPerturbationSet(); + perturbation_set::iterator it = ps.begin(); + + // Perturbate the hash functions g + while( it != ps.end() ){ + *(mpg + multiProbePtr->getIndex(it)) += multiProbePtr->getBoundary(it); + it++; + } + + H::t1 = computeProductModDefaultPrime( mpg, r1, H::k ) % H::N; + H::t2 = computeProductModDefaultPrime( mpg, r2, H::k ); + + delete[] mpg; // free up temporary storage +} + #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}} // Computes (a.b) mod UH_PRIME_DEFAULT @@ -447,6 +500,7 @@ { FILE* dbFile = 0; int dbfid = unserialize_lsh_header(filename); + indexName = new char[O2_INDEX_MAXSTR]; strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER H::initialize_lsh_functions(); // Base-class data-structure initialization @@ -501,21 +555,29 @@ // point retrieval routine void G::retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){ + assert(LSH_MULTI_PROBE_COUNT); calling_instance = caller; add_point_callback = add_point; H::compute_hash_functions( v ); for(Uns32T j = 0 ; j < H::L ; j++ ){ - H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); - if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) { + // MultiProbe loop + multiProbePtr->generatePerturbationSets( *( H::boundaryDistances + j ) , 2*H::k, (unsigned)LSH_MULTI_PROBE_COUNT); + for(Uns32T multiProbeIdx = 0 ; multiProbeIdx < LSH_MULTI_PROBE_COUNT+1 ; multiProbeIdx++ ){ + if(!multiProbeIdx) + H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); + else + H::generate_multiprobe_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); + if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) { #ifdef LSH_LIST_HEAD_COUNTERS - if(bPtr->t2&LSH_CORE_ARRAY_BIT) { - retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos); - } else { - bucket_chain_point( bPtr->next, qpos); + if(bPtr->t2&LSH_CORE_ARRAY_BIT) { + retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos); + } else { + bucket_chain_point( bPtr->next, qpos); + } +#else + bucket_chain_point( bPtr , qpos); +#endif } -#else - bucket_chain_point( bPtr , qpos); -#endif } } }
--- a/lshlib.h Mon Jan 26 02:50:44 2009 +0000 +++ b/lshlib.h Mon Jan 26 05:42:15 2009 +0000 @@ -33,6 +33,7 @@ #ifdef MT19937 #include "mt19937/mt19937ar.h" #endif +#include "multiprobe.h" #define IntT int #define LongUns64T long long unsigned @@ -107,10 +108,14 @@ #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head -using namespace std; +#ifndef LSH_MULTI_PROBE_COUNT +#define LSH_MULTI_PROBE_COUNT 1 // How many adjacent hash-buckets to probe in LSH retrieval +#endif Uns32T get_page_logn(); +using namespace std; + // Disk table entry typedef class SerialElement SerialElementT; class SerialElement { @@ -232,6 +237,7 @@ Uns32T t2; // second hash table key Uns32T P; // hash table prime number + Uns32T N; // num rows per table Uns32T C; // num collision per row Uns32T k; // num projections per hash function @@ -242,6 +248,9 @@ float w; // width of hash slots (relative to normalized feature space) float radius;// scaling coefficient for data (1./radius) + MultiProbe* multiProbePtr; // Utility class for handling multi-probe queries + float ** boundaryDistances; // Array of query bucket-boundary-distances per hashtable + void initialize_data_structures(); void initialize_lsh_functions(); void initialize_partial_functions(); @@ -275,6 +284,7 @@ // Interface to hash functions void compute_hash_functions(vector<float>& v); void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2); + void generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2); Uns32T get_t1(){return t1;} // hash-key t1 Uns32T get_t2(){return t2;} // hash-key t2 };
--- a/multiprobe.cpp Mon Jan 26 02:50:44 2009 +0000 +++ b/multiprobe.cpp Mon Jan 26 05:42:15 2009 +0000 @@ -19,9 +19,25 @@ #include "multiprobe.h" -#define _TEST_MP_LSH +//#define _TEST_MP_LSH -MinHeapElement::MinHeapElement(perturbation_set a, double s): +bool operator> (const min_heap_element& a, const min_heap_element& b){ + return a.score > b.score; +} + +bool operator< (const min_heap_element& a, const min_heap_element& b){ + return a.score < b.score; +} + +bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){ + return a.first > b.first; +} + +bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){ + return a.first < b.first; +} + +MinHeapElement::MinHeapElement(perturbation_set a, float s): perturbs(a), score(s) { @@ -57,15 +73,30 @@ distFuns = 0; } -inline size_t MultiProbe::size(){ +size_t MultiProbe::size(){ return outSets->size(); } -inline bool MultiProbe::empty(){ +bool MultiProbe::empty(){ return !outSets->size(); } +void MultiProbe::generatePerturbationSets(vector<float>& x, unsigned T){ + cleanup(); // Make re-entrant + initialize(); + makeSortedDistFuns(x); + algorithm1(T); +} + +// overloading to support efficient array use without initial copy +void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){ + cleanup(); // Make re-entrant + initialize(); + makeSortedDistFuns(x, N); + algorithm1(T); +} + // Generate the optimal T perturbation sets for current query // pre-conditions: // an LSH structure was initialized and passed to constructor @@ -77,20 +108,15 @@ // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j // data structures are cleared and reset to zeros thereby making them re-entrant // -void MultiProbe::generatePerturbationSets(vector<double>& x, int T){ +void MultiProbe::algorithm1(unsigned T){ perturbation_set ai,as,ae; - double ai_score; - - cleanup(); // Make re-entrant - initialize(); - makeSortedDistFuns(x); - + float ai_score; ai.insert(0); // Initialize for this query minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe min_heap_element mhe = minHeap->top(); - for(int i = 0 ; i != T ; i++ ){ + for(unsigned i = 0 ; i != T ; i++ ){ do{ if(minHeap->empty()) break; @@ -126,31 +152,44 @@ int val = *(--it) + 1; a.erase(it); a.insert(it,val); + return a; } // Given the set a, add a new element one greater than the max inline perturbation_set& MultiProbe::expand(perturbation_set& a){ perturbation_set::reverse_iterator ri = a.rbegin(); a.insert(*ri+1); + return a; } // Take the list of distances (x) assuming list len is 2M and // delta = (-1)^i, i = { 0 .. 2M-1 } -void MultiProbe::makeSortedDistFuns(vector<double>& x){ +void MultiProbe::makeSortedDistFuns(vector<float>& x){ numHashBoundaries = x.size(); // x.size() == 2M delete distFuns; distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries); - for(int i = 0; i != numHashBoundaries ; i++ ) - (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1)); + for(unsigned i = 0; i != numHashBoundaries ; i++ ) + (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?-1:1)); + // SORT + sort( distFuns->begin(), distFuns->end() ); +} + +// Float array version of above +void MultiProbe::makeSortedDistFuns(float* x, unsigned N){ + numHashBoundaries = N; // x.size() == 2M + delete distFuns; + distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries); + for(unsigned i = 0; i != numHashBoundaries ; i++ ) + (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?-1:1)); // SORT sort( distFuns->begin(), distFuns->end() ); } // For a given perturbation set, the score is the // sum of squares of corresponding distances in x -double MultiProbe::score(perturbation_set& a){ +float MultiProbe::score(perturbation_set& a){ //assert(!a.empty()); - double score = 0.0, tmp; + float score = 0.0, tmp; perturbation_set::iterator it; it = a.begin(); do{ @@ -170,12 +209,20 @@ while( it != a.end() ){ j = *it; it++; - if( ( j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) ) + if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) ) return false; } return true; } +int MultiProbe::getIndex(perturbation_set::iterator it){ + return (*distFuns)[*it].second.first; +} + +int MultiProbe::getBoundary(perturbation_set::iterator it){ + return (*distFuns)[*it].second.second; +} + // copy return next perturbation_set perturbation_set MultiProbe::getNextPerturbationSet(){ perturbation_set s = outSets->top().perturbs; @@ -191,10 +238,10 @@ int N_SAMPS = 100; // Number of random samples int W = 4; // simulated hash-bucket size int N_ITER = 100; // How many re-entrant iterations - int T = 10; // Number of multi-probe sets to generate + unsigned T = 10; // Number of multi-probe sets to generate MultiProbe mp= MultiProbe(); - vector<double> x(N_SAMPS); + vector<float> x(N_SAMPS); srand((unsigned)time(0));
--- a/multiprobe.h Mon Jan 26 02:50:44 2009 +0000 +++ b/multiprobe.h Mon Jan 26 05:42:15 2009 +0000 @@ -34,8 +34,8 @@ typedef class MinHeapElement{ public: perturbation_set perturbs; - double score; - MinHeapElement(perturbation_set a, double s); + float score; + MinHeapElement(perturbation_set a, float s); virtual ~MinHeapElement(); } min_heap_element; @@ -44,23 +44,7 @@ greater<min_heap_element> > min_heap_of_perturbation_set ; -typedef pair<double, pair<int, int> > sorted_distance_functions ; - -bool operator> (const min_heap_element& a, const min_heap_element& b){ - return a.score > b.score; -} - -bool operator< (const min_heap_element& a, const min_heap_element& b){ - return a.score < b.score; -} - -bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){ - return a.first > b.first; -} - -bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){ - return a.first < b.first; -} +typedef pair<float, pair<int, int> > sorted_distance_functions ; class MultiProbe{ protected: @@ -76,20 +60,27 @@ // perturbation set operations perturbation_set& shift(perturbation_set&); perturbation_set& expand(perturbation_set&); - double score(perturbation_set&); + float score(perturbation_set&); bool valid(perturbation_set&); - void makeSortedDistFuns(vector<double> &); + void makeSortedDistFuns(vector<float> &); + void makeSortedDistFuns(float* x, unsigned N); + + // perturbation set generation algorithm + void algorithm1(unsigned T); public: MultiProbe(); ~MultiProbe(); // generation of perturbation sets - void generatePerturbationSets(vector<double>& vectorOfBounaryDistances, int numSetsToGenerate); + void generatePerturbationSets(vector<float>& vectorOfBounaryDistances, unsigned numSetsToGenerate); + void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate); perturbation_set getNextPerturbationSet(); void dump(perturbation_set); size_t size(); // Number of perturbation sets are in the output queue bool empty(); // predicate for empty MultiProbe set + int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry + int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry };