# HG changeset patch # User mas01mc # Date 1217431342 0 # Node ID 9fd5340faffd2ea613526713249b01f11ddb3b42 # Parent d9a88cfd4ab614800533e501ccfa9b522baba194 Refactored LSH interface to separate hashfunctions and parameters from insertion/retrieval/serialization diff -r d9a88cfd4ab6 -r 9fd5340faffd lshlib.cpp --- a/lshlib.cpp Tue Jul 29 22:01:17 2008 +0000 +++ b/lshlib.cpp Wed Jul 30 15:22:22 2008 +0000 @@ -21,27 +21,28 @@ exit(1); } -H::H(Uns32T kk, Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC): +H::H(){ + // Delay initialization of lsh functions until we know the parameters +} + +H::H(Uns32T kk, Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float ww, float rr): #ifdef USE_U_FUNCTIONS use_u_functions(true), #else use_u_functions(false), #endif + maxp(0), bucketCount(0), pointCount(0), N(NN), C(CC), k(kk), m(mm), - L(mm*(mm-1)/2), - d(dd) + L((mm*(mm-1))/2), + d(dd), + w(ww), + radius(rr) { - Uns32T j; - cout << "file size: ~" << (((unsigned long long)L*N*C*sizeof(SerialElementT))/1000000UL) << "MB" << endl; - if(((unsigned long long)L*N*C*sizeof(SerialElementT))>4000000000UL) - error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB"); - else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL) - cout << "warning: hash tables exceed 1000MB." << endl; if(m<2){ m=2; @@ -52,16 +53,18 @@ k++; // make sure k is even cout << "warning: setting k even" << endl; } - __initialize_data_structures(); - for(j=0; j4000000000UL) + error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB"); + else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL) + cout << "warning: hash tables exceed 1000MB." << endl; + + // We have the necessary parameters, so construct hashfunction datastructures + initialize_lsh_functions(); } -// Post constructor initialization -void H::__initialize_data_structures(){ +void H::initialize_lsh_functions(){ H::P = UH_PRIME_DEFAULT; /* FIXME: don't use time(); instead use /dev/random or similar */ @@ -72,28 +75,155 @@ #else srand(time(NULL)); // seed random number generator #endif - Uns32T i,j; - H::h = new bucket**[ H::L ]; - H::r1 = new Uns32T*[ H::L ]; - H::r2 = new Uns32T*[ H::L ]; - assert( H::h && H::r1 && H::r2 ); // failure - for( j = 0 ; j < H::L ; j++ ){ - H::r1[ j ] = new Uns32T[ H::k ]; - H::r2[ j ] = new Uns32T[ H::k ]; - assert( H::r1[j] && H::r2[j] ); // failure + Uns32T i,j, kk; +#ifdef USE_U_FUNCTIONS + H::A = new float**[ H::m ]; // m x k x d random projectors + H::b = new float*[ H::m ]; // m x k random biases +#else + H::A = new float**[ H::L ]; // m x k x d random projectors + H::b = new float*[ H::L ]; // m x k random biases +#endif + H::g = new Uns32T*[ H::L ]; // L x k random projections + assert( H::g && H::A && H::b ); // failure +#ifdef USE_U_FUNCTIONS + // Use m \times u_i functions \in R^{(k/2) \times (d)} + // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d} + for( j = 0; j < H::m ; j++ ){ // m functions u_i(v) + H::A[j] = new float*[ H::k/2 ]; // k/2 x d 2-stable distribution coefficients + H::b[j] = new float[ H::k/2 ]; // bias + assert( H::A[j] && H::b[j] ); // failure + for( kk = 0; kk < H::k/2 ; kk++ ){ + H::A[j][kk] = new float[ H::d ]; + assert( H::A[j][kk] ); // failure + for(Uns32T i = 0 ; i < H::d ; i++ ) + H::A[j][kk][i] = H::randn(); // Normal + H::b[j][kk] = H::ranf()*H::w; // Uniform + } + } +#else + // Use m \times u_i functions \in R^{k \times (d)} + // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d} + for( j = 0; j < H::L ; j++ ){ // m functions u_i(v) + H::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients + H::b[j] = new float[ H::k ]; // bias + assert( H::A[j] && H::b[j] ); // failure + for( kk = 0; kk < H::k ; kk++ ){ + H::A[j][kk] = new float[ H::d ]; + assert( H::A[j][kk] ); // failure + for(Uns32T i = 0 ; i < H::d ; i++ ) + H::A[j][kk][i] = H::randn(); // Normal + H::b[j][kk] = H::ranf()*H::w; // Uniform + } + } +#endif + + // Storage for LSH hash function output (Uns32T) + for( j = 0 ; j < H::L ; j++ ){ // L functions g_j(u_a, u_b) a,b \in nchoosek(m,2) + H::g[j] = new Uns32T[ H::k ]; // k x 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z + assert( H::g[j] ); } + // LSH Hash tables + H::h = new bucket**[ H::L ]; + assert( H::h ); for( j = 0 ; j < H::L ; j++ ){ H::h[j] = new bucket*[ H::N ]; assert( H::h[j] ); for( i = 0 ; i < H::N ; i++) H::h[j][i] = 0; } + + // Standard hash functions + H::r1 = new Uns32T*[ H::L ]; + H::r2 = new Uns32T*[ H::L ]; + assert( H::r1 && H::r2 ); // failure + for( j = 0 ; j < H::L ; j++ ){ + H::r1[ j ] = new Uns32T[ H::k ]; + H::r2[ j ] = new Uns32T[ H::k ]; + assert( H::r1[j] && H::r2[j] ); // failure + for( i = 0; i >(H::m); + for( Uns32T aa=0 ; aa < H::m ; aa++ ) + H::uu[aa] = vector( H::k/2 ); +#else + H::uu = vector >(H::L); + for( Uns32T aa=0 ; aa < H::L ; aa++ ) + H::uu[aa] = vector( H::k ); +#endif +} + + +// Generate z ~ N(0,1) +float H::randn(){ +// Box-Muller + float x1, x2; + do{ + x1 = ranf(); + } while (x1 == 0); // cannot take log of 0 + x2 = ranf(); + float z; + z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2); + return z; +} + +float H::ranf(){ +#ifdef MT19937 + return (float) genrand_real2(); +#else + return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) ); +#endif +} + +// range is 1..2^29 +/* FIXME: that looks like an ... odd range. Still. */ +Uns32T H::randr(){ +#ifdef MT19937 + return (Uns32T)((genrand_int32() >> 3) + 1); +#else + return (Uns32T) ((rand() >> 2) + 1); +#endif } // Destruct hash tables H::~H(){ - Uns32T i,j; + Uns32T i,j,kk; +#ifdef USE_U_FUNCTIONS + for( j = 0 ; j < H::m ; j++ ){ + for( kk = 0 ; kk < H::k/2 ; kk++ ) + delete[] A[j][kk]; + delete[] A[j]; + } + delete[] A; + for( j = 0 ; j < H::m ; j++ ) + delete[] b[j]; + delete[] b; +#else + for( j = 0 ; j < H::L ; j++ ){ + for( kk = 0 ; kk < H::k ; kk++ ) + delete[] A[j][kk]; + delete[] A[j]; + } + delete[] A; + for( j = 0 ; j < H::L ; j++ ) + delete[] b[j]; + delete[] b; +#endif + + for( j = 0 ; j < H::L ; j++ ) + delete[] g[j]; + delete[] g; for( j=0 ; j < H::L ; j++ ){ delete[] H::r1[ j ]; delete[] H::r2[ j ]; @@ -107,17 +237,93 @@ } +// Compute all hash functions for vector v +// #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d} +// to make L \times g_j functions \in Z^k +void H::compute_hash_functions(vector& v){ // v \in R^d + float iw = 1. / H::w; // hash bucket width + Uns32T aa, kk; + if( v.size() != H::d ) + error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality + double tmp = 0; + float *pA, *pb; + Uns32T *pg; + int dd; + vector::iterator vi; + vector::iterator ui; + +#ifdef USE_U_FUNCTIONS + Uns32T bb; + // Store m dot products to expand + for( aa=0; aa < H::m ; aa++ ){ + ui = H::uu[aa].begin(); + for( kk = 0 ; kk < H::k/2 ; kk++ ){ + pb = *( H::b + aa ) + kk; + pA = * ( * ( H::A + aa ) + kk ); + dd = H::d; + tmp = 0.; + vi = v.begin(); + while( dd-- ) + tmp += *pA++ * *vi++; // project + tmp += *pb; // translate + tmp *= iw; // scale + *ui++ = (Uns32T) floor(tmp); // floor + } + } + // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d} + Uns32T j; + for( aa=0, j=0 ; aa < H::m-1 ; aa++ ) + for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){ + pg= *( H::g + j ); // L \times functions g_j(v) \in Z^k + // u_1 \in Z^{(k/2) \times d} + ui = H::uu[aa].begin(); + kk=H::k/2; + while( kk-- ) + *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z + // u_2 \in Z^{(k/2) \times d} + ui = H::uu[bb].begin(); + kk=H::k/2; + while( kk--) + *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z + } +#else + for( aa=0; aa < H::L ; aa++ ){ + ui = H::uu[aa].begin(); + for( kk = 0 ; kk < H::k ; kk++ ){ + pb = *( H::b + aa ) + kk; + pA = * ( * ( H::A + aa ) + kk ); + dd = H::d; + tmp = 0.; + vi = v.begin(); + while( dd-- ) + tmp += *pA++ * *vi++; // project + tmp += *pb; // translate + tmp *= iw; // scale + *ui++ = (Uns32T) (floor(tmp)); // floor + } + } + // Compute hash functions + for( aa=0 ; aa < H::L ; aa++ ){ + pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k + // u_1 \in Z^{k \times d} + ui = H::uu[aa].begin(); + kk=H::k; + while( kk-- ) + *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z + } +#endif +} + // make hash value \in Z -void H::__generate_hash_keys(Uns32T*g,Uns32T* r1, Uns32T* r2){ - H::t1 = __computeProductModDefaultPrime( g, r1, H::k ) % H::N; - H::t2 = __computeProductModDefaultPrime( g, r2, H::k ); - +void H::generate_hash_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){ + H::t1 = computeProductModDefaultPrime( g, r1, H::k ) % H::N; + H::t2 = computeProductModDefaultPrime( g, r2, H::k ); } #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}} // Computes (a.b) mod UH_PRIME_DEFAULT -inline Uns32T H::__computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){ +inline Uns32T H::computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){ LongUns64T h = 0; for(IntT i = 0; i < size; i++){ @@ -197,68 +403,18 @@ } } -inline bucket** H::__get_bucket(int j){ +inline bucket** H::get_bucket(int j){ return *(h+j); } -// hash functions G -G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float r): - H(kk,mm,dd,NN,CC), - w(ww), - radius(r), - maxp(0), +// Interface to Locality Sensitive Hashing G +G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr): + H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures + lshHeader(0), calling_instance(0), - add_point_callback(0), - lshHeader(0) + add_point_callback(0) { - Uns32T j; -#ifdef USE_U_FUNCTIONS - G::A = new float**[ H::m ]; // m x k x d random projectors - G::b = new float*[ H::m ]; // m x k random biases -#else - G::A = new float**[ H::L ]; // m x k x d random projectors - G::b = new float*[ H::L ]; // m x k random biases -#endif - G::g = new Uns32T*[ H::L ]; // L x k random projections - assert( G::g && G::A && G::b ); // failure -#ifdef USE_U_FUNCTIONS - // Use m \times u_i functions \in R^{(k/2) \times (d)} - // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d} - for( j = 0; j < H::m ; j++ ){ // m functions u_i(v) - G::A[j] = new float*[ H::k/2 ]; // k/2 x d 2-stable distribution coefficients - G::b[j] = new float[ H::k/2 ]; // bias - assert( G::A[j] && G::b[j] ); // failure - for( kk = 0; kk < H::k/2 ; kk++ ){ - G::A[j][kk] = new float[ H::d ]; - assert( G::A[j][kk] ); // failure - for(Uns32T i = 0 ; i < H::d ; i++ ) - G::A[j][kk][i] = randn(); // Normal - G::b[j][kk] = ranf()*G::w; // Uniform - } - } -#else - // Use m \times u_i functions \in R^{k \times (d)} - // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d} - for( j = 0; j < H::L ; j++ ){ // m functions u_i(v) - G::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients - G::b[j] = new float[ H::k ]; // bias - assert( G::A[j] && G::b[j] ); // failure - for( kk = 0; kk < H::k ; kk++ ){ - G::A[j][kk] = new float[ H::d ]; - assert( G::A[j][kk] ); // failure - for(Uns32T i = 0 ; i < H::d ; i++ ) - G::A[j][kk][i] = randn(); // Normal - G::b[j][kk] = ranf()*G::w; // Uniform - } - } -#endif - - for( j = 0 ; j < H::L ; j++ ){ // L functions g_j(u_a, u_b) a,b \in nchoosek(m,2) - G::g[j] = new Uns32T[ H::k ]; // k x 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z - assert( G::g[j] ); - } - - initialize_partial_functions(); // m partially evaluated hash functions + } // Serialize from file LSH constructor @@ -266,12 +422,15 @@ // Load the hash functions, close the database // Optionally load the LSH tables into head-allocated lists in core G::G(char* filename, bool lshInCoreFlag): + H(), // default base-class constructor call delays data-structure initialization + lshHeader(0), calling_instance(0), add_point_callback(0) { int dbfid = unserialize_lsh_header(filename); - unserialize_lsh_functions(dbfid); - initialize_partial_functions(); + + H::initialize_lsh_functions(); // Base-class data-structure initialization + unserialize_lsh_functions(dbfid); // populate with on-disk hashfunction values // Format1 only needs unserializing if specifically requested if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2) && lshInCoreFlag){ @@ -283,172 +442,21 @@ unserialize_lsh_hashtables_format2(dbfid); } - close(dbfid); -} - -void G::initialize_partial_functions(){ - -#ifdef USE_U_FUNCTIONS - uu = vector >(H::m); - for( Uns32T aa=0 ; aa < H::m ; aa++ ) - uu[aa] = vector( H::k/2 ); -#else - uu = vector >(H::L); - for( Uns32T aa=0 ; aa < H::L ; aa++ ) - uu[aa] = vector( H::k ); -#endif -} - - -// Generate z ~ N(0,1) -float G::randn(){ -// Box-Muller - float x1, x2; - do{ - x1 = ranf(); - } while (x1 == 0); // cannot take log of 0 - x2 = ranf(); - float z; - z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2); - return z; -} - -float G::ranf(){ -#ifdef MT19937 - return (float) genrand_real2(); -#else - return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) ); -#endif -} - -// range is 1..2^29 -/* FIXME: that looks like an ... odd range. Still. */ -Uns32T H::__randr(){ -#ifdef MT19937 - return (Uns32T)((genrand_int32() >> 3) + 1); -#else - return (Uns32T) ((rand() >> 2) + 1); -#endif -} + close(dbfid);} G::~G(){ - Uns32T j,kk; -#ifdef USE_U_FUNCTIONS - for( j = 0 ; j < H::m ; j++ ){ - for( kk = 0 ; kk < H::k/2 ; kk++ ) - delete[] A[j][kk]; - delete[] A[j]; - } - delete[] A; - for( j = 0 ; j < H::m ; j++ ) - delete[] b[j]; - delete[] b; -#else - for( j = 0 ; j < H::L ; j++ ){ - for( kk = 0 ; kk < H::k ; kk++ ) - delete[] A[j][kk]; - delete[] A[j]; - } - delete[] A; - for( j = 0 ; j < H::L ; j++ ) - delete[] b[j]; - delete[] b; -#endif - - for( j = 0 ; j < H::L ; j++ ) - delete[] g[j]; - delete[] g; delete lshHeader; } -// Compute all hash functions for vector v -// #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d} -// to make L \times g_j functions \in Z^k -void G::compute_hash_functions(vector& v){ // v \in R^d - float iw = 1. / G::w; // hash bucket width - Uns32T aa, kk; - if( v.size() != H::d ) - error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality - double tmp = 0; - float *pA, *pb; - Uns32T *pg; - int dd; - vector::iterator vi; - vector::iterator ui; - -#ifdef USE_U_FUNCTIONS - Uns32T bb; - // Store m dot products to expand - for( aa=0; aa < H::m ; aa++ ){ - ui = uu[aa].begin(); - for( kk = 0 ; kk < H::k/2 ; kk++ ){ - pb = *( G::b + aa ) + kk; - pA = * ( * ( G::A + aa ) + kk ); - dd = H::d; - tmp = 0.; - vi = v.begin(); - while( dd-- ) - tmp += *pA++ * *vi++; // project - tmp += *pb; // translate - tmp *= iw; // scale - *ui++ = (Uns32T) floor(tmp); // floor - } - } - // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d} - Uns32T j; - for( aa=0, j=0 ; aa < H::m-1 ; aa++ ) - for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){ - pg= *( G::g + j ); // L \times functions g_j(v) \in Z^k - // u_1 \in Z^{(k/2) \times d} - ui = uu[aa].begin(); - kk=H::k/2; - while( kk-- ) - *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z - // u_2 \in Z^{(k/2) \times d} - ui = uu[bb].begin(); - kk=H::k/2; - while( kk--) - *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z - } -#else - for( aa=0; aa < H::L ; aa++ ){ - ui = uu[aa].begin(); - for( kk = 0 ; kk < H::k ; kk++ ){ - pb = *( G::b + aa ) + kk; - pA = * ( * ( G::A + aa ) + kk ); - dd = H::d; - tmp = 0.; - vi = v.begin(); - while( dd-- ) - tmp += *pA++ * *vi++; // project - tmp += *pb; // translate - tmp *= iw; // scale - *ui++ = (Uns32T) (floor(tmp)); // floor - } - } - // Compute hash functions - for( aa=0 ; aa < H::L ; aa++ ){ - pg= *( G::g + aa ); // L \times functions g_j(v) \in Z^k - // u_1 \in Z^{k \times d} - ui = uu[aa].begin(); - kk=H::k; - while( kk-- ) - *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z - } -#endif - -} - - // single point insertion; inserted values are hash value and pointID Uns32T G::insert_point(vector& v, Uns32T pp){ Uns32T collisionCount = 0; H::p = pp; - if(pp>G::maxp) - G::maxp=pp; // Store highest pointID in database - compute_hash_functions( v ); + if(pp>H::maxp) + H::maxp=pp; // Store highest pointID in database + H::compute_hash_functions( v ); for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion - __generate_hash_keys( *( G::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); + H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); collisionCount += bucket_insert_point( *(h + j) + t1 ); } return collisionCount; @@ -466,10 +474,10 @@ void G::retrieve_point(vector& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){ calling_instance = caller; add_point_callback = add_point; - compute_hash_functions( v ); + H::compute_hash_functions( v ); for(Uns32T j = 0 ; j < H::L ; j++ ){ - __generate_hash_keys( *( G::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); - if( bucket* bPtr = *(__get_bucket(j) + get_t1()) ) + H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); + if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) #ifdef LSH_BLOCK_FULL_ROWS bucket_chain_point( bPtr->next, qpos); #else @@ -519,6 +527,7 @@ // ... // r2[L-1][0] r2[L-1][1] ... r2[L-1][k-1] // +// ******* HASHTABLES FORMAT1 (optimized for LSH_ON_DISK retrieval) ******* // ---hash table 0: N x C x 8 --- // [t2 pointID][t2 pointID]...[t2 pointID] // [t2 pointID][t2 pointID]...[t2 pointID] @@ -539,6 +548,29 @@ // ... // [t2 pointID][t2 pointID]...[t2 pointID] // +// ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) ******* +// +// State machine controlled by regular expression. +// legend: +// +// O2_SERIAL_FLAGS_T1_BIT = 0x80000000U +// O2_SERIAL_FLAGS_T2_BIT = 0x40000000U +// O2_SERIAL_FLAGS_END_BIT = 0x20000000U +// +// T1(t1) - T1 hash token containing t1 hash key with O2_SERIAL_FLAGS_T1_BIT set (t1 range 0..2^29-1) +// T2 - T2 hash token with O2_SERIAL_FLAGS_T2_BIT set +// t2 - t2 hash key (range 1..2^32-6) +// p - point identifier (range 0..2^32-1) +// E - end hash table token with O2_SERIAL_FLAGS_END_BIT set +// {...} required arguments +// [...] optional arguments +// * - match zero or more occurences +// + - match one or more occurences +// {...}^L - repeat argument L times +// +// FORMAT2 Regular expression: +// { [T1(t1) T2 t2 p+ [T2 t2 p+]* ]* E. }^L +// // Serial header constructors SerialHeader::SerialHeader(){;} @@ -625,8 +657,8 @@ if(!dbIsNew){ db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer //serial_get_header(db); // read header - cout << "maxp = " << G::maxp << endl; - lshHeader->maxp=G::maxp; + cout << "maxp = " << H::maxp << endl; + lshHeader->maxp=H::maxp; // Default to FILEFORMAT1 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2)) lshHeader->flags|=O2_SERIAL_FILEFORMAT2; @@ -674,7 +706,7 @@ Uns32T *pu; Uns32T x,y,z; - db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer + char* db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer pf = get_serial_hashfunction_base(db); // HASH FUNCTIONS @@ -687,7 +719,7 @@ for( y = 0 ; y < H::k ; y++ ) #endif for( z = 0 ; z < d ; z++ ) - *pf++ = A[x][y][z]; + *pf++ = H::A[x][y][z]; // Write the random biases b[][] #ifdef USE_U_FUNCTIONS @@ -697,19 +729,19 @@ for( x = 0 ; x < H::L ; x++ ) for( y = 0 ; y < H::k ; y++ ) #endif - *pf++=b[x][y]; + *pf++ = H::b[x][y]; pu = (Uns32T*)pf; // Write the Z projectors r1[][] for( x = 0 ; x < H::L ; x++) for( y = 0 ; y < H::k ; y++) - *pu++ = r1[x][y]; + *pu++ = H::r1[x][y]; // Write the Z projectors r2[][] for( x = 0 ; x < H::L ; x++) for( y = 0; y < H::k ; y++) - *pu++ = r2[x][y]; + *pu++ = H::r2[x][y]; serial_munmap(db, get_serial_hashtable_offset()); return 1; @@ -952,7 +984,7 @@ if (write (dbfid, "", 1) != 1) error("write error", "", "write"); - db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); + char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE); @@ -966,6 +998,7 @@ } char* G::serial_mmap(int dbfid, Uns32T memSize, Uns32T forWrite, off_t offset){ + char* db; if(forWrite){ if ((db = (char*) mmap(0, memSize, PROT_READ | PROT_WRITE, MAP_SHARED, dbfid, offset)) == (caddr_t) -1) @@ -1033,9 +1066,9 @@ H::C = lshHeader->numCols; H::k = lshHeader->numFuns; H::d = lshHeader->dataDim; - G::w = lshHeader->binWidth; - G::radius = lshHeader->radius; - G::maxp = lshHeader->maxp; + H::w = lshHeader->binWidth; + H::radius = lshHeader->radius; + H::maxp = lshHeader->maxp; return dbfid; } @@ -1051,35 +1084,17 @@ // Load the hash functions into core char* db = serial_mmap(dbfid, get_serial_hashtable_offset(), 0);// get database pointer again -#ifdef USE_U_FUNCTIONS - G::A = new float**[ H::m ]; // m x k x d random projectors - G::b = new float*[ H::m ]; // m x k random biases -#else - G::A = new float**[ H::L ]; // m x k x d random projectors - G::b = new float*[ H::L ]; // m x k random biases -#endif - G::g = new Uns32T*[ H::L ]; // L x k random projections - assert(g&&A&&b); // failure - pf = get_serial_hashfunction_base(db); #ifdef USE_U_FUNCTIONS for( j = 0 ; j < H::m ; j++ ){ // L functions gj(v) - G::A[j] = new float*[ H::k/2 ]; // k x d 2-stable distribution coefficients - G::b[j] = new float[ H::k/2 ]; // bias - assert( G::A[j] && G::b[j] ); // failure for( kk = 0 ; kk < H::k/2 ; kk++ ){ // Normally distributed hash functions #else for( j = 0 ; j < H::L ; j++ ){ // L functions gj(v) - G::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients - G::b[j] = new float[ H::k ]; // bias - assert( G::A[j] && G::b[j] ); // failure for( kk = 0 ; kk < H::k ; kk++ ){ // Normally distributed hash functions #endif - G::A[j][kk] = new float[ H::d ]; - assert( G::A[j][kk] ); // failure for(Uns32T i = 0 ; i < H::d ; i++ ) - G::A[j][kk][i] = *pf++; // Normally distributed random vectors + H::A[j][kk][i] = *pf++; // Normally distributed random vectors } } #ifdef USE_U_FUNCTIONS @@ -1089,15 +1104,7 @@ for( j = 0 ; j < H::L ; j++ ) // biases b for( kk = 0 ; kk < H::k ; kk++ ) #endif - G::b[j][kk] = *pf++; - - for( j = 0 ; j < H::L ; j++){ // 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z - G::g[j] = new Uns32T[ H::k ]; - assert( G::g[j] ); - } - - - H::__initialize_data_structures(); + H::b[j][kk] = *pf++; pu = (Uns32T*)pf; for( j = 0 ; j < H::L ; j++ ) // Z projectors r1 @@ -1108,7 +1115,7 @@ for( kk = 0 ; kk < H::k ; kk++ ) H::r2[j][kk] = *pu++; - serial_munmap(db, get_serial_hashtable_offset()); + serial_munmap(db, get_serial_hashtable_offset()); } void G::unserialize_lsh_hashtables_format1(int fid){ @@ -1289,8 +1296,8 @@ error("could not advise local hashtable memory","","madvise"); SerialElementT* pe = (SerialElementT*)db ; for(Uns32T qpos=0; qposnumCols, qpos); // Point to correct row } serial_munmap(db, hashTableSize); // drop hashtable mmap @@ -1313,7 +1320,7 @@ Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols; calling_instance = caller; add_point_callback = add_point; - compute_hash_functions(v); + H::compute_hash_functions(v); for(Uns32T j=0; jnumCols, qpos); // Point to correct row serial_munmap(db, hashTableSize); // drop hashtable mmap } diff -r d9a88cfd4ab6 -r 9fd5340faffd lshlib.h --- a/lshlib.h Tue Jul 29 22:01:17 2008 +0000 +++ b/lshlib.h Wed Jul 30 15:22:22 2008 +0000 @@ -184,26 +184,27 @@ class H{ friend class G; private: - bucket*** h; // hash functions + + float *** A; // m x k x d random projectors from R^d->R^k + float ** b; // m x k uniform additive constants + + Uns32T ** g; // L x k random hash projections \in Z^k Uns32T** r1; // random ints for hashing Uns32T** r2; // random ints for hashing + + bucket*** h; // The LSH hash tables + + bool use_u_functions; // flag to optimize computation of hashes + vector > uu; // Storage for m patial hash evaluations ( g_j = [u_a,u_b] ) + + Uns32T maxp; // highest pointID stored in database + Uns32T bucketCount; // count of number of point buckets allocated + Uns32T pointCount; // count of number of points inserted + Uns32T t1; // first hash table key Uns32T t2; // second hash table key Uns32T P; // hash table prime number - bool use_u_functions; // flag to optimize computation of hashes - Uns32T bucketCount; // count of number of point buckets allocated - Uns32T pointCount; // count of number of points inserted - void __initialize_data_structures(); - void __bucket_insert_point(bucket*); - void __sbucket_insert_point(sbucket*); - Uns32T __computeProductModDefaultPrime(Uns32T*,Uns32T*,IntT); - Uns32T __randr(); - bucket** __get_bucket(int j); - void __generate_hash_keys(Uns32T*,Uns32T*,Uns32T*); - void error(const char* a, const char* b = "", const char *sysFunc = 0); - - public: Uns32T N; // num rows per table Uns32T C; // num collision per row Uns32T k; // num projections per hash function @@ -211,37 +212,52 @@ Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0 Uns32T d; // dimensions Uns32T p; // current point + float w; // width of hash slots (relative to normalized feature space) + float radius;// scaling coefficient for data (1./radius) - H(){;} - H(Uns32T k, Uns32T m, Uns32T d, Uns32T N, Uns32T C); + void initialize_data_structures(); + void initialize_lsh_functions(); + void initialize_partial_functions(); + void __bucket_insert_point(bucket*); + void __sbucket_insert_point(sbucket*); + Uns32T computeProductModDefaultPrime(Uns32T*,Uns32T*,IntT); + Uns32T randr(); + float randn(); + float ranf(); + bucket** get_bucket(int j); + void error(const char* a, const char* b = "", const char *sysFunc = 0); + + public: + + H(); + H(Uns32T k, Uns32T m, Uns32T d, Uns32T N, Uns32T C, float w, float r); ~H(); + float get_w(){return w;} + float get_radius(){return radius;} + Uns32T get_numRows(){return N;} + Uns32T get_numCols(){return C;} + Uns32T get_numFuns(){return k;} + Uns32T get_numTables(){return L;} + Uns32T get_dataDim(){return d;} + Uns32T get_maxp(){return maxp;} Uns32T bucket_insert_point(bucket**); - Uns32T get_t1(){return t1;} - Uns32T get_t2(){return t2;} - + // Interface to hash functions + void compute_hash_functions(vector& v); + void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2); + Uns32T get_t1(){return t1;} // hash-key t1 + Uns32T get_t2(){return t2;} // hash-key t2 }; - +// Typedef for point-reporting callback function. Used to collect points during LSH retrieval typedef void (*ReporterCallbackPtr)(void* objPtr, Uns32T pointID, Uns32T queryIndex, float squaredDistance); // Interface for indexing and retrieval class G: public H{ private: - float *** A; // m x k x d random projectors from R^d->R^k - float ** b; // m x k uniform additive constants - Uns32T ** g; // L x k random hash projections \in Z^k - float w; // width of hash slots (relative to normalized feature space) - float radius;// scaling coefficient for data (1./radius) - vector > uu; // Storage for m patial hash evaluations - Uns32T maxp; // highest pointID stored in database - void* calling_instance; // store calling object instance for member-function callback - void (*add_point_callback)(void*, Uns32T, Uns32T, float); - - void initialize_partial_functions(); - + // LSH serial data structure file handling void get_lock(int fd, bool exclusive); void release_lock(int fd); int serial_create(char* filename, Uns32T FMT); @@ -296,12 +312,9 @@ void serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos); void serial_bucket_dump(SerialElementT* pe); - // Hash functions - void compute_hash_functions(vector& v); - float randn(); - float ranf(); - - char* db; // pointer to serialized structure + // Callback Function for point reporting + void* calling_instance; // store calling object instance for member-function callback + void (*add_point_callback)(void*, Uns32T, Uns32T, float); // The callback public: G(char* lshFile, bool lshInCore = false); // unserialize constructor @@ -323,8 +336,6 @@ void serialize(char* filename, Uns32T serialFormat = O2_SERIAL_FILEFORMAT1); // write hashfunctions and hashtables to disk SerialHeaderT* get_lshHeader(){return lshHeader;} - float get_radius(){return radius;} - Uns32T get_maxp(){return maxp;} void serial_dump_tables(char* filename); float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; } };