mas01mc@292: #include "lshlib.h" mas01mc@292: mas01mc@292: //#define __LSH_DUMP_CORE_TABLES__ mas01mc@292: //#define USE_U_FUNCTIONS mas01mc@296: #define LSH_BLOCK_FULL_ROWS mas01mc@292: mas01mc@292: void err(char*s){cout << s << endl;exit(2);} mas01mc@292: mas01mc@292: Uns32T get_page_logn(){ mas01mc@292: int pagesz = (int)sysconf(_SC_PAGESIZE); mas01mc@292: return (Uns32T)log2((double)pagesz); mas01mc@292: } mas01mc@292: mas01mc@292: unsigned align_up(unsigned x, unsigned w){ return ((x) + ((1< >(H::m); mas01mc@293: for( Uns32T aa=0 ; aa < H::m ; aa++ ) mas01mc@293: H::uu[aa] = vector( H::k/2 ); mas01mc@293: #else mas01mc@293: H::uu = vector >(H::L); mas01mc@293: for( Uns32T aa=0 ; aa < H::L ; aa++ ) mas01mc@293: H::uu[aa] = vector( H::k ); mas01mc@293: #endif mas01mc@293: } mas01mc@293: mas01mc@293: mas01mc@293: // Generate z ~ N(0,1) mas01mc@293: float H::randn(){ mas01mc@293: // Box-Muller mas01mc@293: float x1, x2; mas01mc@293: do{ mas01mc@293: x1 = ranf(); mas01mc@293: } while (x1 == 0); // cannot take log of 0 mas01mc@293: x2 = ranf(); mas01mc@293: float z; mas01mc@293: z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2); mas01mc@293: return z; mas01mc@293: } mas01mc@293: mas01mc@293: float H::ranf(){ mas01mc@293: #ifdef MT19937 mas01mc@293: return (float) genrand_real2(); mas01mc@293: #else mas01mc@293: return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) ); mas01mc@293: #endif mas01mc@293: } mas01mc@293: mas01mc@293: // range is 1..2^29 mas01mc@293: /* FIXME: that looks like an ... odd range. Still. */ mas01mc@293: Uns32T H::randr(){ mas01mc@293: #ifdef MT19937 mas01mc@293: return (Uns32T)((genrand_int32() >> 3) + 1); mas01mc@293: #else mas01mc@293: return (Uns32T) ((rand() >> 2) + 1); mas01mc@293: #endif mas01mc@292: } mas01mc@292: mas01mc@292: // Destruct hash tables mas01mc@292: H::~H(){ mas01mc@293: Uns32T i,j,kk; mas01mc@293: #ifdef USE_U_FUNCTIONS mas01mc@293: for( j = 0 ; j < H::m ; j++ ){ mas01mc@293: for( kk = 0 ; kk < H::k/2 ; kk++ ) mas01mc@293: delete[] A[j][kk]; mas01mc@293: delete[] A[j]; mas01mc@293: } mas01mc@293: delete[] A; mas01mc@293: for( j = 0 ; j < H::m ; j++ ) mas01mc@293: delete[] b[j]; mas01mc@293: delete[] b; mas01mc@293: #else mas01mc@293: for( j = 0 ; j < H::L ; j++ ){ mas01mc@293: for( kk = 0 ; kk < H::k ; kk++ ) mas01mc@293: delete[] A[j][kk]; mas01mc@293: delete[] A[j]; mas01mc@293: } mas01mc@293: delete[] A; mas01mc@293: for( j = 0 ; j < H::L ; j++ ) mas01mc@293: delete[] b[j]; mas01mc@293: delete[] b; mas01mc@293: #endif mas01mc@293: mas01mc@293: for( j = 0 ; j < H::L ; j++ ) mas01mc@293: delete[] g[j]; mas01mc@293: delete[] g; mas01mc@292: for( j=0 ; j < H::L ; j++ ){ mas01mc@292: delete[] H::r1[ j ]; mas01mc@292: delete[] H::r2[ j ]; mas01mc@292: for(i = 0; i< H::N ; i++) mas01mc@292: delete H::h[ j ][ i ]; mas01mc@292: delete[] H::h[ j ]; mas01mc@292: } mas01mc@292: delete[] H::r1; mas01mc@292: delete[] H::r2; mas01mc@292: delete[] H::h; mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@293: // Compute all hash functions for vector v mas01mc@293: // #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d} mas01mc@293: // to make L \times g_j functions \in Z^k mas01mc@293: void H::compute_hash_functions(vector& v){ // v \in R^d mas01mc@293: float iw = 1. / H::w; // hash bucket width mas01mc@293: Uns32T aa, kk; mas01mc@293: if( v.size() != H::d ) mas01mc@293: error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality mas01mc@293: double tmp = 0; mas01mc@293: float *pA, *pb; mas01mc@293: Uns32T *pg; mas01mc@293: int dd; mas01mc@293: vector::iterator vi; mas01mc@293: vector::iterator ui; mas01mc@293: mas01mc@293: #ifdef USE_U_FUNCTIONS mas01mc@293: Uns32T bb; mas01mc@293: // Store m dot products to expand mas01mc@293: for( aa=0; aa < H::m ; aa++ ){ mas01mc@293: ui = H::uu[aa].begin(); mas01mc@293: for( kk = 0 ; kk < H::k/2 ; kk++ ){ mas01mc@293: pb = *( H::b + aa ) + kk; mas01mc@293: pA = * ( * ( H::A + aa ) + kk ); mas01mc@293: dd = H::d; mas01mc@293: tmp = 0.; mas01mc@293: vi = v.begin(); mas01mc@293: while( dd-- ) mas01mc@293: tmp += *pA++ * *vi++; // project mas01mc@293: tmp += *pb; // translate mas01mc@293: tmp *= iw; // scale mas01mc@293: *ui++ = (Uns32T) floor(tmp); // floor mas01mc@293: } mas01mc@293: } mas01mc@293: // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d} mas01mc@293: Uns32T j; mas01mc@293: for( aa=0, j=0 ; aa < H::m-1 ; aa++ ) mas01mc@293: for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){ mas01mc@293: pg= *( H::g + j ); // L \times functions g_j(v) \in Z^k mas01mc@293: // u_1 \in Z^{(k/2) \times d} mas01mc@293: ui = H::uu[aa].begin(); mas01mc@293: kk=H::k/2; mas01mc@293: while( kk-- ) mas01mc@293: *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z mas01mc@293: // u_2 \in Z^{(k/2) \times d} mas01mc@293: ui = H::uu[bb].begin(); mas01mc@293: kk=H::k/2; mas01mc@293: while( kk--) mas01mc@293: *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z mas01mc@293: } mas01mc@293: #else mas01mc@293: for( aa=0; aa < H::L ; aa++ ){ mas01mc@293: ui = H::uu[aa].begin(); mas01mc@293: for( kk = 0 ; kk < H::k ; kk++ ){ mas01mc@293: pb = *( H::b + aa ) + kk; mas01mc@293: pA = * ( * ( H::A + aa ) + kk ); mas01mc@293: dd = H::d; mas01mc@293: tmp = 0.; mas01mc@293: vi = v.begin(); mas01mc@293: while( dd-- ) mas01mc@293: tmp += *pA++ * *vi++; // project mas01mc@293: tmp += *pb; // translate mas01mc@293: tmp *= iw; // scale mas01mc@293: *ui++ = (Uns32T) (floor(tmp)); // floor mas01mc@293: } mas01mc@293: } mas01mc@293: // Compute hash functions mas01mc@293: for( aa=0 ; aa < H::L ; aa++ ){ mas01mc@293: pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k mas01mc@293: // u_1 \in Z^{k \times d} mas01mc@293: ui = H::uu[aa].begin(); mas01mc@293: kk=H::k; mas01mc@293: while( kk-- ) mas01mc@293: *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z mas01mc@293: } mas01mc@293: #endif mas01mc@293: } mas01mc@293: mas01mc@292: // make hash value \in Z mas01mc@293: void H::generate_hash_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){ mas01mc@293: H::t1 = computeProductModDefaultPrime( g, r1, H::k ) % H::N; mas01mc@293: H::t2 = computeProductModDefaultPrime( g, r2, H::k ); mas01mc@292: } mas01mc@292: mas01mc@292: #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}} mas01mc@292: mas01mc@292: // Computes (a.b) mod UH_PRIME_DEFAULT mas01mc@293: inline Uns32T H::computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){ mas01mc@292: LongUns64T h = 0; mas01mc@292: mas01mc@292: for(IntT i = 0; i < size; i++){ mas01mc@292: h = h + (LongUns64T)a[i] * (LongUns64T)b[i]; mas01mc@292: h = (h & TWO_TO_32_MINUS_1) + 5 * (h >> 32); mas01mc@292: if (h >= UH_PRIME_DEFAULT) { mas01mc@292: h = h - UH_PRIME_DEFAULT; mas01mc@292: } mas01mc@292: CR_ASSERT(h < UH_PRIME_DEFAULT); mas01mc@292: } mas01mc@292: return h; mas01mc@292: } mas01mc@292: mas01mc@292: Uns32T H::bucket_insert_point(bucket **pp){ mas01mc@296: collisionCount = 0; mas01mc@292: if(!*pp){ mas01mc@292: *pp = new bucket(); mas01mc@292: #ifdef LSH_BLOCK_FULL_ROWS mas01mc@292: (*pp)->t2 = 0; // Use t2 as a collision counter for the row mas01mc@292: (*pp)->next = new bucket(); mas01mc@292: #endif mas01mc@292: } mas01mc@292: #ifdef LSH_BLOCK_FULL_ROWS mas01mc@292: collisionCount = (*pp)->t2; mas01mc@292: if(collisionCount < H::C){ // Block if row is full mas01mc@292: (*pp)->t2++; // Increment collision counter mas01mc@292: pointCount++; mas01mc@292: collisionCount++; mas01mc@292: __bucket_insert_point((*pp)->next); // First bucket holds collision count mas01mc@292: } mas01mc@292: #else mas01mc@296: pointCount++; mas01mc@296: __bucket_insert_point(*pp); // No collision count storage mas01mc@292: #endif mas01mc@292: return collisionCount; mas01mc@292: } mas01mc@292: mas01mc@292: void H::__bucket_insert_point(bucket* p){ mas01mc@292: if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2? mas01mc@292: p->t2 = H::t2; mas01mc@292: bucketCount++; // Record start of new point-locale collision chain mas01mc@292: p->snext = new sbucket(); mas01mc@292: __sbucket_insert_point(p->snext); mas01mc@292: return; mas01mc@292: } mas01mc@292: mas01mc@292: if(p->t2 == H::t2){ mas01mc@292: __sbucket_insert_point(p->snext); mas01mc@292: return; mas01mc@292: } mas01mc@292: mas01mc@292: if(p->next){ mas01mc@292: __bucket_insert_point(p->next); mas01mc@292: } mas01mc@292: mas01mc@292: else{ mas01mc@292: p->next = new bucket(); mas01mc@292: __bucket_insert_point(p->next); mas01mc@292: } mas01mc@292: mas01mc@292: } mas01mc@292: mas01mc@292: void H::__sbucket_insert_point(sbucket* p){ mas01mc@292: if(p->pointID==IFLAG){ mas01mc@292: p->pointID = H::p; mas01mc@292: return; mas01mc@292: } mas01mc@292: mas01mc@292: // Search for pointID mas01mc@292: if(p->snext){ mas01mc@292: __sbucket_insert_point(p->snext); mas01mc@292: } mas01mc@292: else{ mas01mc@292: // Make new point collision bucket at end of list mas01mc@292: p->snext = new sbucket(); mas01mc@292: __sbucket_insert_point(p->snext); mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@293: inline bucket** H::get_bucket(int j){ mas01mc@292: return *(h+j); mas01mc@292: } mas01mc@292: mas01mc@293: // Interface to Locality Sensitive Hashing G mas01mc@293: G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr): mas01mc@293: H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures mas01mc@308: indexName(0), mas01mc@293: lshHeader(0), mas01mc@292: calling_instance(0), mas01mc@293: add_point_callback(0) mas01mc@292: { mas01mc@293: mas01mc@292: } mas01mc@292: mas01mc@292: // Serialize from file LSH constructor mas01mc@292: // Read parameters from database file mas01mc@292: // Load the hash functions, close the database mas01mc@292: // Optionally load the LSH tables into head-allocated lists in core mas01mc@292: G::G(char* filename, bool lshInCoreFlag): mas01mc@293: H(), // default base-class constructor call delays data-structure initialization mas01mc@309: indexName(0), mas01mc@293: lshHeader(0), mas01mc@292: calling_instance(0), mas01mc@292: add_point_callback(0) mas01mc@292: { mas01mc@292: int dbfid = unserialize_lsh_header(filename); mas01mc@309: indexName = new char[O2_INDEX_MAXSTR]; mas01mc@309: strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER mas01mc@293: H::initialize_lsh_functions(); // Base-class data-structure initialization mas01mc@293: unserialize_lsh_functions(dbfid); // populate with on-disk hashfunction values mas01mc@292: mas01mc@292: // Format1 only needs unserializing if specifically requested mas01mc@292: if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2) && lshInCoreFlag){ mas01mc@292: unserialize_lsh_hashtables_format1(dbfid); mas01mc@292: } mas01mc@292: mas01mc@292: // Format2 always needs unserializing mas01mc@292: if(lshHeader->flags&O2_SERIAL_FILEFORMAT2 && lshInCoreFlag){ mas01mc@292: unserialize_lsh_hashtables_format2(dbfid); mas01mc@292: } mas01mc@292: mas01mc@293: close(dbfid);} mas01mc@292: mas01mc@292: G::~G(){ mas01mc@292: delete lshHeader; mas01mc@292: } mas01mc@292: mas01mc@292: // single point insertion; inserted values are hash value and pointID mas01mc@292: Uns32T G::insert_point(vector& v, Uns32T pp){ mas01mc@292: Uns32T collisionCount = 0; mas01mc@292: H::p = pp; mas01mc@299: if(H::maxp && pp<=H::maxp) mas01mc@296: error("points must be indexed in strict ascending order", "LSH::insert_point(vector&, Uns32T pointID)"); mas01mc@296: H::maxp=pp; // Store highest pointID in database mas01mc@293: H::compute_hash_functions( v ); mas01mc@292: for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion mas01mc@293: H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); mas01mc@292: collisionCount += bucket_insert_point( *(h + j) + t1 ); mas01mc@292: } mas01mc@292: return collisionCount; mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@292: // batch insert for a point set mas01mc@292: // inserted values are vector hash value and pointID starting at basePointID mas01mc@292: void G::insert_point_set(vector >& vv, Uns32T basePointID){ mas01mc@292: for(Uns32T point=0; point& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){ mas01mc@292: calling_instance = caller; mas01mc@292: add_point_callback = add_point; mas01mc@293: H::compute_hash_functions( v ); mas01mc@292: for(Uns32T j = 0 ; j < H::L ; j++ ){ mas01mc@293: H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); mas01mc@293: if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) mas01mc@292: #ifdef LSH_BLOCK_FULL_ROWS mas01mc@292: bucket_chain_point( bPtr->next, qpos); mas01mc@292: #else mas01mc@292: bucket_chain_point( bPtr , qpos); mas01mc@292: #endif mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::retrieve_point_set(vector >& vv, ReporterCallbackPtr add_point, void* caller){ mas01mc@292: for(Uns32T qpos = 0 ; qpos < vv.size() ; qpos++ ) mas01mc@292: retrieve_point(vv[qpos], qpos, add_point, caller); mas01mc@292: } mas01mc@292: mas01mc@292: // export lsh tables to table structure on disk mas01mc@292: // mas01mc@292: // LSH TABLE STRUCTURE mas01mc@292: // ---header 64 bytes --- mas01mc@292: // [magic #tables #rows #cols elementSize databaseSize version flags dim #funs 0 0 0 0 0 0] mas01mc@292: // mas01mc@292: // ---random projections L x k x d float --- mas01mc@292: // A[0][0][0] A[0][0][1] ... A[0][0][d-1] mas01mc@292: // A[0][1][0] A[0][1][1] ... A[1][1][d-1] mas01mc@292: // ... mas01mc@292: // A[0][K-1][0] A[0][1][1] ... A[0][k-1][d-1] mas01mc@292: // ... mas01mc@292: // ... mas01mc@292: // A[L-1][0][0] A[M-1][0][1] ... A[L-1][0][d-1] mas01mc@292: // A[L-1][1][0] A[M-1][1][1] ... A[L-1][1][d-1] mas01mc@292: // ... mas01mc@292: // A[L-1][k-1][0] A[M-1][1][1] ... A[L-1][k-1][d-1] mas01mc@292: // mas01mc@292: // ---bias L x k float --- mas01mc@292: // b[0][0] b[0][1] ... b[0][k-1] mas01mc@292: // b[1][0] b[1][1] ... b[1][k-1] mas01mc@292: // ... mas01mc@292: // b[L-1][0] b[L-1][1] ... b[L-1][k-1] mas01mc@292: // mas01mc@292: // ---random r1 L x k float --- mas01mc@292: // r1[0][0] r1[0][1] ... r1[0][k-1] mas01mc@292: // r1[1][0] r1[1][1] ... r1[1][k-1] mas01mc@292: // ... mas01mc@292: // r1[L-1][0] r1[L-1][1] ... r1[L-1][k-1] mas01mc@292: // mas01mc@292: // ---random r2 L x k float --- mas01mc@292: // r2[0][0] r2[0][1] ... r2[0][k-1] mas01mc@292: // r2[1][0] r2[1][1] ... r2[1][k-1] mas01mc@292: // ... mas01mc@292: // r2[L-1][0] r2[L-1][1] ... r2[L-1][k-1] mas01mc@292: // mas01mc@293: // ******* HASHTABLES FORMAT1 (optimized for LSH_ON_DISK retrieval) ******* mas01mc@292: // ---hash table 0: N x C x 8 --- mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // ... mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // mas01mc@292: // ---hash table 1: N x C x 8 --- mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // ... mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // mas01mc@292: // ... mas01mc@292: // mas01mc@292: // ---hash table L-1: N x C x 8 --- mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // ... mas01mc@292: // [t2 pointID][t2 pointID]...[t2 pointID] mas01mc@292: // mas01mc@293: // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) ******* mas01mc@293: // mas01mc@293: // State machine controlled by regular expression. mas01mc@293: // legend: mas01mc@293: // mas01mc@306: // O2_SERIAL_TOKEN_T1 = 0xFFFFFFFCU mas01mc@306: // O2_SERIAL_TOKEN_T2 = 0xFFFFFFFDU mas01mc@306: // O2_SERIAL_TOKEN_ENDTABLE = 0xFFFFFFFEU mas01mc@293: // mas01mc@306: // T1 - T1 hash token mas01mc@306: // t1 - t1 hash key (t1 range 0..2^29-1) mas01mc@306: // T2 - T2 token mas01mc@293: // t2 - t2 hash key (range 1..2^32-6) mas01mc@293: // p - point identifier (range 0..2^32-1) mas01mc@306: // E - end hash table token mas01mc@293: // {...} required arguments mas01mc@293: // [...] optional arguments mas01mc@293: // * - match zero or more occurences mas01mc@293: // + - match one or more occurences mas01mc@293: // {...}^L - repeat argument L times mas01mc@293: // mas01mc@293: // FORMAT2 Regular expression: mas01mc@306: // { [T1 t1 [T2 t2 p+]+ ]* E }^L mas01mc@293: // mas01mc@292: mas01mc@292: // Serial header constructors mas01mc@292: SerialHeader::SerialHeader(){;} mas01mc@296: SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc): mas01mc@292: lshMagic(O2_SERIAL_MAGIC), mas01mc@292: binWidth(W), mas01mc@292: numTables(L), mas01mc@292: numRows(N), mas01mc@292: numCols(C), mas01mc@292: elementSize(O2_SERIAL_ELEMENT_SIZE), mas01mc@296: version(O2_SERIAL_VERSION), mas01mc@296: size(0), // we are deprecating this value mas01mc@292: flags(FMT), mas01mc@292: dataDim(d), mas01mc@292: numFuns(k), mas01mc@292: radius(r), mas01mc@296: maxp(p), mas01mc@296: size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables mas01mc@296: + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions mas01mc@296: (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())), mas01mc@296: pointCount(pc){ mas01mc@296: mas01mc@296: if(FMT==O2_SERIAL_FILEFORMAT2) mas01mc@296: size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE mas01mc@296: + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T) mas01mc@296: +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn()); mas01mc@296: } // header mas01mc@292: mas01mc@292: float* G::get_serial_hashfunction_base(char* db){ mas01mc@292: if(db&&lshHeader) mas01mc@292: return (float*)(db+O2_SERIAL_HEADER_SIZE); mas01mc@292: else return NULL; mas01mc@292: } mas01mc@292: mas01mc@292: SerialElementT* G::get_serial_hashtable_base(char* db){ mas01mc@292: if(db&&lshHeader) mas01mc@292: return (SerialElementT*)(db+get_serial_hashtable_offset()); mas01mc@292: else mas01mc@292: return NULL; mas01mc@292: } mas01mc@292: mas01mc@292: Uns32T G::get_serial_hashtable_offset(){ mas01mc@292: if(lshHeader) mas01mc@292: return align_up(O2_SERIAL_HEADER_SIZE + mas01mc@292: L*lshHeader->numFuns*( sizeof(float)*lshHeader->dataDim+2*sizeof(Uns32T)+sizeof(float)),get_page_logn()); mas01mc@292: else mas01mc@292: return 0; mas01mc@292: } mas01mc@292: mas01mc@292: void G::serialize(char* filename, Uns32T serialFormat){ mas01mc@292: int dbfid; mas01mc@292: char* db; mas01mc@292: int dbIsNew=0; mas01mc@292: mas01mc@292: // Check requested serialFormat mas01mc@292: if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2)) mas01mc@292: error("Unrecognized serial file format request: ", "serialize()"); mas01mc@296: mas01mc@292: // Test to see if file exists mas01mc@292: if((dbfid = open (filename, O_RDONLY)) < 0) mas01mc@292: // If it doesn't, then create the file (CREATE) mas01mc@292: if(errno == ENOENT){ mas01mc@292: // Create the file mas01mc@292: std::cout << "Creating new serialized LSH database:" << filename << "..."; mas01mc@292: std::cout.flush(); mas01mc@292: serial_create(filename, serialFormat); mas01mc@292: dbIsNew=1; mas01mc@292: } mas01mc@292: else mas01mc@292: // The file can't be opened mas01mc@292: error("Can't open the file", filename, "open"); mas01mc@292: mas01mc@292: // Load the on-disk header into core mas01mc@292: dbfid = serial_open(filename, 1); // open for write mas01mc@292: db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer mas01mc@292: serial_get_header(db); // read header mas01mc@292: serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap mas01mc@292: mas01mc@292: // Check compatibility of core and disk data structures mas01mc@292: if( !serial_can_merge(serialFormat) ) mas01mc@292: error("Incompatible core and serial LSH, data structure dimensions mismatch."); mas01mc@292: mas01mc@292: // For new LSH databases write the hashfunctions mas01mc@292: if(dbIsNew) mas01mc@292: serialize_lsh_hashfunctions(dbfid); mas01mc@292: // Write the hashtables in the requested format mas01mc@292: if(serialFormat == O2_SERIAL_FILEFORMAT1) mas01mc@292: serialize_lsh_hashtables_format1(dbfid, !dbIsNew); mas01mc@292: else mas01mc@292: serialize_lsh_hashtables_format2(dbfid, !dbIsNew); mas01mc@292: mas01mc@292: if(!dbIsNew){ mas01mc@292: db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer mas01mc@292: //serial_get_header(db); // read header mas01mc@293: cout << "maxp = " << H::maxp << endl; mas01mc@293: lshHeader->maxp=H::maxp; mas01mc@292: // Default to FILEFORMAT1 mas01mc@292: if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2)) mas01mc@294: lshHeader->flags|=O2_SERIAL_FILEFORMAT1; mas01mc@292: memcpy((char*)db, (char*)lshHeader, sizeof(SerialHeaderT)); mas01mc@292: serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap mas01mc@292: } mas01mc@292: mas01mc@292: serial_close(dbfid); mas01mc@292: } mas01mc@292: mas01mc@292: // Test to see if core structure and requested format is mas01mc@292: // compatible with currently opened database mas01mc@292: int G::serial_can_merge(Uns32T format){ mas01mc@292: SerialHeaderT* that = lshHeader; mas01mc@292: if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) mas01mc@292: || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2) mas01mc@292: || !( this->w == that->binWidth && mas01mc@296: this->L == that->numTables && mas01mc@296: this->N == that->numRows && mas01mc@296: this->k == that->numFuns && mas01mc@296: this->d == that->dataDim && mas01mc@296: sizeof(SerialElementT) == that->elementSize && mas01mc@296: this->radius == that->radius)){ mas01mc@292: serial_print_header(format); mas01mc@292: return 0; mas01mc@292: } mas01mc@292: else mas01mc@292: return 1; mas01mc@292: } mas01mc@292: mas01mc@292: // Used as an error message for serial_can_merge() mas01mc@292: void G::serial_print_header(Uns32T format){ mas01mc@292: std::cout << "Fc:" << format << " Fs:" << lshHeader->flags << endl; mas01mc@292: std::cout << "Wc:" << w << " Ls:" << lshHeader->binWidth << endl; mas01mc@292: std::cout << "Lc:" << L << " Ls:" << lshHeader->numTables << endl; mas01mc@292: std::cout << "Nc:" << N << " Ns:" << lshHeader->numRows << endl; mas01mc@292: std::cout << "kc:" << k << " ks:" << lshHeader->numFuns << endl; mas01mc@292: std::cout << "dc:" << d << " ds:" << lshHeader->dataDim << endl; mas01mc@292: std::cout << "sc:" << sizeof(SerialElementT) << " ss:" << lshHeader->elementSize << endl; mas01mc@292: std::cout << "rc:" << this->radius << " rs:" << lshHeader->radius << endl; mas01mc@292: } mas01mc@292: mas01mc@292: int G::serialize_lsh_hashfunctions(int fid){ mas01mc@292: float* pf; mas01mc@292: Uns32T *pu; mas01mc@292: Uns32T x,y,z; mas01mc@292: mas01mc@293: char* db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer mas01mc@292: pf = get_serial_hashfunction_base(db); mas01mc@292: mas01mc@292: // HASH FUNCTIONS mas01mc@292: // Write the random projectors A[][][] mas01mc@292: #ifdef USE_U_FUNCTIONS mas01mc@292: for( x = 0 ; x < H::m ; x++ ) mas01mc@292: for( y = 0 ; y < H::k/2 ; y++ ) mas01mc@292: #else mas01mc@292: for( x = 0 ; x < H::L ; x++ ) mas01mc@292: for( y = 0 ; y < H::k ; y++ ) mas01mc@292: #endif mas01mc@292: for( z = 0 ; z < d ; z++ ) mas01mc@293: *pf++ = H::A[x][y][z]; mas01mc@292: mas01mc@292: // Write the random biases b[][] mas01mc@292: #ifdef USE_U_FUNCTIONS mas01mc@292: for( x = 0 ; x < H::m ; x++ ) mas01mc@292: for( y = 0 ; y < H::k/2 ; y++ ) mas01mc@292: #else mas01mc@292: for( x = 0 ; x < H::L ; x++ ) mas01mc@292: for( y = 0 ; y < H::k ; y++ ) mas01mc@292: #endif mas01mc@293: *pf++ = H::b[x][y]; mas01mc@292: mas01mc@292: pu = (Uns32T*)pf; mas01mc@292: mas01mc@292: // Write the Z projectors r1[][] mas01mc@292: for( x = 0 ; x < H::L ; x++) mas01mc@292: for( y = 0 ; y < H::k ; y++) mas01mc@293: *pu++ = H::r1[x][y]; mas01mc@292: mas01mc@292: // Write the Z projectors r2[][] mas01mc@292: for( x = 0 ; x < H::L ; x++) mas01mc@292: for( y = 0; y < H::k ; y++) mas01mc@293: *pu++ = H::r2[x][y]; mas01mc@292: mas01mc@292: serial_munmap(db, get_serial_hashtable_offset()); mas01mc@292: return 1; mas01mc@292: } mas01mc@292: mas01mc@292: int G::serialize_lsh_hashtables_format1(int fid, int merge){ mas01mc@292: SerialElementT *pe, *pt; mas01mc@292: Uns32T x,y; mas01mc@292: mas01mc@292: if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT1) ) mas01mc@292: error("Cannot merge core and serial LSH, data structure dimensions mismatch."); mas01mc@292: mas01mc@292: Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols; mas01mc@292: Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount; mas01mc@292: // Write the hash tables mas01mc@292: for( x = 0 ; x < H::L ; x++ ){ mas01mc@292: std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT1..."; mas01mc@292: std::cout.flush(); mas01mc@292: // memory map a single hash table for sequential access mas01mc@292: // Align each hash table to page boundary mas01mc@292: char* dbtable = serial_mmap(fid, hashTableSize, 1, mas01mc@292: align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn())); mas01mc@316: #ifdef __CYGWIN__ mas01mc@316: // No madvise in CYGWIN mas01mc@316: #else mas01mc@292: if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0) mas01mc@292: error("could not advise hashtable memory","","madvise"); mas01mc@316: #endif mas01mc@292: maxColCount=0; mas01mc@292: minColCount=O2_SERIAL_MAX_COLS; mas01mc@292: meanColCount=0; mas01mc@292: colCountN=0; mas01mc@292: pt=(SerialElementT*)dbtable; mas01mc@292: for( y = 0 ; y < H::N ; y++ ){ mas01mc@292: // Move disk pointer to beginning of row mas01mc@292: pe=pt+y*lshHeader->numCols; mas01mc@292: mas01mc@292: colCount=0; mas01mc@292: if(bucket* bPtr = h[x][y]) mas01mc@292: if(merge) mas01mc@292: #ifdef LSH_BLOCK_FULL_ROWS mas01mc@292: serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket mas01mc@292: else mas01mc@292: serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket mas01mc@292: #else mas01mc@292: serial_merge_hashtable_row_format1(pe, bPtr, colCount); mas01mc@292: else mas01mc@292: serial_write_hashtable_row_format1(pe, bPtr, colCount); mas01mc@292: #endif mas01mc@292: if(colCount){ mas01mc@292: if(colCountmaxColCount) mas01mc@292: maxColCount=colCount; mas01mc@292: meanColCount+=colCount; mas01mc@292: colCountN++; mas01mc@292: } mas01mc@292: } mas01mc@292: if(colCountN) mas01mc@292: std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN mas01mc@292: << ", min = " << minColCount << ", max = " << maxColCount mas01mc@292: << endl; mas01mc@292: serial_munmap(dbtable, hashTableSize); mas01mc@292: } mas01mc@292: mas01mc@292: // We're done writing mas01mc@292: return 1; mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* b, Uns32T& colCount){ mas01mc@292: while(b && b->t2!=IFLAG){ mas01mc@292: SerialElementT*pe=pr; // reset disk pointer to beginning of row mas01mc@292: serial_merge_element_format1(pe, b->snext, b->t2, colCount); mas01mc@292: b=b->next; mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount){ mas01mc@292: while(sb){ mas01mc@292: if(colCount==lshHeader->numCols){ mas01mc@292: std::cout << "!point-chain full " << endl; mas01mc@292: return; mas01mc@292: } mas01mc@292: Uns32T c=0; mas01mc@292: // Merge collision chains mas01mc@292: while(cnumCols){ mas01mc@292: if( (pe+c)->hashValue==IFLAG){ mas01mc@292: (pe+c)->hashValue=t2; mas01mc@292: (pe+c)->pointID=sb->pointID; mas01mc@292: colCount=c+1; mas01mc@292: if(c+1numCols) mas01mc@292: (pe+c+1)->hashValue=IFLAG; mas01mc@292: break; mas01mc@292: } mas01mc@292: c++; mas01mc@292: } mas01mc@292: sb=sb->snext; mas01mc@292: } mas01mc@292: return; mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* b, Uns32T& colCount){ mas01mc@292: pe->hashValue=IFLAG; mas01mc@292: while(b && b->t2!=IFLAG){ mas01mc@292: serial_write_element_format1(pe, b->snext, b->t2, colCount); mas01mc@292: b=b->next; mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount){ mas01mc@292: while(sb){ mas01mc@292: if(colCount==lshHeader->numCols){ mas01mc@292: std::cout << "!point-chain full " << endl; mas01mc@292: return; mas01mc@292: } mas01mc@292: pe->hashValue=t2; mas01mc@292: pe->pointID=sb->pointID; mas01mc@292: pe++; mas01mc@292: colCount++; mas01mc@292: sb=sb->snext; mas01mc@292: } mas01mc@292: pe->hashValue=IFLAG; mas01mc@292: return; mas01mc@292: } mas01mc@292: mas01mc@292: int G::serialize_lsh_hashtables_format2(int fid, int merge){ mas01mc@292: Uns32T x,y; mas01mc@292: mas01mc@292: if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) ) mas01mc@292: error("Cannot merge core and serial LSH, data structure dimensions mismatch."); mas01mc@292: mas01mc@306: // We must pereform FORMAT2 merges in core mas01mc@292: if(merge) mas01mc@292: unserialize_lsh_hashtables_format2(fid); mas01mc@292: mas01mc@292: Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1; mas01mc@292: lseek(fid, get_serial_hashtable_offset(), SEEK_SET); mas01mc@292: mas01mc@292: // Write the hash tables mas01mc@292: for( x = 0 ; x < H::L ; x++ ){ mas01mc@292: std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT2..."; mas01mc@292: std::cout.flush(); mas01mc@292: maxColCount=0; mas01mc@292: minColCount=O2_SERIAL_MAX_COLS; mas01mc@292: meanColCount=0; mas01mc@292: colCountN=0; mas01mc@292: for( y = 0 ; y < H::N ; y++ ){ mas01mc@292: colCount=0; mas01mc@292: if(bucket* bPtr = h[x][y]){ mas01mc@306: // Check for empty row (even though row was allocated) mas01mc@306: #ifdef LSH_BLOCK_FULL_ROWS mas01mc@306: if(bPtr->next->t2==IFLAG){ mas01mc@306: close(fid); mas01mc@306: error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()"); mas01mc@306: } mas01mc@306: #else mas01mc@306: if(bPtr->t2==IFLAG){ mas01mc@306: close(fid); mas01mc@306: error("b->t2==IFLAG","serialize_lsh_hashtables_format2()"); mas01mc@306: } mas01mc@306: #endif mas01mc@306: t1 = O2_SERIAL_TOKEN_T1; mas01mc@306: if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){ mas01mc@306: close(fid); mas01mc@306: error("write error in serial_write_hashtable_format2() [T1]"); mas01mc@306: } mas01mc@306: t1 = y; mas01mc@292: if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){ mas01mc@292: close(fid); mas01mc@292: error("write error in serial_write_hashtable_format2() [t1]"); mas01mc@292: } mas01mc@292: #ifdef LSH_BLOCK_FULL_ROWS mas01mc@292: serial_write_hashtable_row_format2(fid, bPtr->next, colCount); // skip collision counter bucket mas01mc@292: #else mas01mc@292: serial_write_hashtable_row_format2(fid, bPtr, colCount); mas01mc@292: #endif mas01mc@292: } mas01mc@292: if(colCount){ mas01mc@292: if(colCountmaxColCount) mas01mc@292: maxColCount=colCount; mas01mc@292: meanColCount+=colCount; mas01mc@292: colCountN++; mas01mc@292: } mas01mc@292: } mas01mc@292: // Write END of table marker mas01mc@306: t1 = O2_SERIAL_TOKEN_ENDTABLE; mas01mc@292: if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){ mas01mc@292: close(fid); mas01mc@292: error("write error in serial_write_hashtable_format2() [end]"); mas01mc@292: } mas01mc@292: mas01mc@292: if(colCountN) mas01mc@292: std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN mas01mc@292: << ", min = " << minColCount << ", max = " << maxColCount mas01mc@292: << endl; mas01mc@292: } mas01mc@292: mas01mc@292: // We're done writing mas01mc@292: return 1; mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_write_hashtable_row_format2(int fid, bucket* b, Uns32T& colCount){ mas01mc@292: while(b && b->t2!=IFLAG){ mas01mc@306: if(!b->snext){ mas01mc@306: close(fid); mas01mc@306: error("Empty collision chain in serial_write_hashtable_row_format2()"); mas01mc@306: } mas01mc@306: t2 = O2_SERIAL_TOKEN_T2; mas01mc@292: if( write(fid, &t2, sizeof(Uns32T)) != sizeof(Uns32T) ){ mas01mc@292: close(fid); mas01mc@292: error("write error in serial_write_hashtable_row_format2()"); mas01mc@292: } mas01mc@292: t2 = b->t2; mas01mc@292: if( write(fid, &t2, sizeof(Uns32T)) != sizeof(Uns32T) ){ mas01mc@292: close(fid); mas01mc@292: error("write error in serial_write_hashtable_row_format2()"); mas01mc@292: } mas01mc@292: serial_write_element_format2(fid, b->snext, colCount); mas01mc@292: b=b->next; mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_write_element_format2(int fid, sbucket* sb, Uns32T& colCount){ mas01mc@292: while(sb){ mas01mc@292: if(write(fid, &sb->pointID, sizeof(Uns32T))!=sizeof(Uns32T)){ mas01mc@292: close(fid); mas01mc@292: error("Write error in serial_write_element_format2()"); mas01mc@292: } mas01mc@292: colCount++; mas01mc@292: sb=sb->snext; mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@292: int G::serial_create(char* filename, Uns32T FMT){ mas01mc@292: return serial_create(filename, w, L, N, C, k, d, FMT); mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@292: int G::serial_create(char* filename, float binWidth, Uns32T numTables, Uns32T numRows, Uns32T numCols, mas01mc@292: Uns32T numFuns, Uns32T dim, Uns32T FMT){ mas01mc@292: mas01mc@292: if(numTables > O2_SERIAL_MAX_TABLES || numRows > O2_SERIAL_MAX_ROWS mas01mc@292: || numCols > O2_SERIAL_MAX_COLS || numFuns > O2_SERIAL_MAX_FUNS mas01mc@292: || dim>O2_SERIAL_MAX_DIM){ mas01mc@292: error("LSH parameters out of bounds for serialization"); mas01mc@292: } mas01mc@292: mas01mc@292: int dbfid; mas01mc@292: if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) mas01mc@292: error("Can't create serial file", filename, "open"); mas01mc@292: get_lock(dbfid, 1); mas01mc@292: mas01mc@292: // Make header first to get size of serialized database mas01mc@296: lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount); mas01mc@296: mas01mc@296: cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl; mas01mc@296: if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE) mas01mc@296: error("Maximum size of LSH file exceded: > 4000MB"); mas01mc@296: mas01mc@292: // go to the location corresponding to the last byte mas01mc@292: if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1) mas01mc@292: error("lseek error in db file", "", "lseek"); mas01mc@292: mas01mc@292: // write a dummy byte at the last location mas01mc@292: if (write (dbfid, "", 1) != 1) mas01mc@292: error("write error", "", "write"); mas01mc@292: mas01mc@293: char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); mas01mc@296: mas01mc@292: memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE); mas01mc@296: mas01mc@292: serial_munmap(db, O2_SERIAL_HEADER_SIZE); mas01mc@296: mas01mc@292: close(dbfid); mas01mc@292: mas01mc@292: std::cout << "done initializing tables." << endl; mas01mc@292: mas01mc@292: return 1; mas01mc@292: } mas01mc@292: mas01mc@292: char* G::serial_mmap(int dbfid, Uns32T memSize, Uns32T forWrite, off_t offset){ mas01mc@293: char* db; mas01mc@292: if(forWrite){ mas01mc@292: if ((db = (char*) mmap(0, memSize, PROT_READ | PROT_WRITE, mas01mc@292: MAP_SHARED, dbfid, offset)) == (caddr_t) -1) mas01mc@292: error("mmap error in request for writable serialized database", "", "mmap"); mas01mc@292: } mas01mc@292: else if ((db = (char*) mmap(0, memSize, PROT_READ, MAP_SHARED, dbfid, offset)) == (caddr_t) -1) mas01mc@292: error("mmap error in read-only serialized database", "", "mmap"); mas01mc@292: mas01mc@292: return db; mas01mc@292: } mas01mc@292: mas01mc@292: SerialHeaderT* G::serial_get_header(char* db){ mas01mc@292: lshHeader = new SerialHeaderT(); mas01mc@292: memcpy((char*)lshHeader, db, sizeof(SerialHeaderT)); mas01mc@292: mas01mc@292: if(lshHeader->lshMagic!=O2_SERIAL_MAGIC) mas01mc@292: error("Not an LSH database file"); mas01mc@292: mas01mc@292: return lshHeader; mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_munmap(char* db, Uns32T N){ mas01mc@292: munmap(db, N); mas01mc@292: } mas01mc@292: mas01mc@292: int G::serial_open(char* filename, int writeFlag){ mas01mc@292: int dbfid; mas01mc@292: if(writeFlag){ mas01mc@292: if ((dbfid = open (filename, O_RDWR)) < 0) mas01mc@292: error("Can't open serial file for read/write", filename, "open"); mas01mc@292: get_lock(dbfid, writeFlag); mas01mc@292: } mas01mc@292: else{ mas01mc@292: if ((dbfid = open (filename, O_RDONLY)) < 0) mas01mc@292: error("Can't open serial file for read", filename, "open"); mas01mc@292: get_lock(dbfid, 0); mas01mc@292: } mas01mc@292: mas01mc@292: return dbfid; mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_close(int dbfid){ mas01mc@292: mas01mc@292: release_lock(dbfid); mas01mc@292: close(dbfid); mas01mc@292: } mas01mc@292: mas01mc@292: int G::unserialize_lsh_header(char* filename){ mas01mc@292: mas01mc@292: int dbfid; mas01mc@292: char* db; mas01mc@292: // Test to see if file exists mas01mc@292: if((dbfid = open (filename, O_RDONLY)) < 0) mas01mc@292: error("Can't open the file", filename, "open"); mas01mc@292: close(dbfid); mas01mc@292: dbfid = serial_open(filename, 0); // open for read mas01mc@292: db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer mas01mc@292: serial_get_header(db); // read header mas01mc@292: serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap mas01mc@292: mas01mc@292: // Unserialize header parameters mas01mc@292: H::L = lshHeader->numTables; mas01mc@292: H::m = (Uns32T)( (1.0 + sqrt(1 + 8.0*(int)H::L)) / 2.0); mas01mc@292: H::N = lshHeader->numRows; mas01mc@292: H::C = lshHeader->numCols; mas01mc@292: H::k = lshHeader->numFuns; mas01mc@292: H::d = lshHeader->dataDim; mas01mc@293: H::w = lshHeader->binWidth; mas01mc@293: H::radius = lshHeader->radius; mas01mc@293: H::maxp = lshHeader->maxp; mas01mc@296: H::pointCount = lshHeader->pointCount; mas01mc@292: mas01mc@292: return dbfid; mas01mc@292: } mas01mc@292: mas01mc@292: // unserialize the LSH parameters mas01mc@292: // we leave the LSH tree on disk as a flat file mas01mc@292: // it is this flat file that we search by memory mapping mas01mc@292: void G::unserialize_lsh_functions(int dbfid){ mas01mc@292: Uns32T j, kk; mas01mc@292: float* pf; mas01mc@292: Uns32T* pu; mas01mc@292: mas01mc@292: // Load the hash functions into core mas01mc@292: char* db = serial_mmap(dbfid, get_serial_hashtable_offset(), 0);// get database pointer again mas01mc@292: mas01mc@292: pf = get_serial_hashfunction_base(db); mas01mc@292: mas01mc@292: #ifdef USE_U_FUNCTIONS mas01mc@292: for( j = 0 ; j < H::m ; j++ ){ // L functions gj(v) mas01mc@292: for( kk = 0 ; kk < H::k/2 ; kk++ ){ // Normally distributed hash functions mas01mc@292: #else mas01mc@292: for( j = 0 ; j < H::L ; j++ ){ // L functions gj(v) mas01mc@292: for( kk = 0 ; kk < H::k ; kk++ ){ // Normally distributed hash functions mas01mc@292: #endif mas01mc@292: for(Uns32T i = 0 ; i < H::d ; i++ ) mas01mc@293: H::A[j][kk][i] = *pf++; // Normally distributed random vectors mas01mc@292: } mas01mc@292: } mas01mc@292: #ifdef USE_U_FUNCTIONS mas01mc@292: for( j = 0 ; j < H::m ; j++ ) // biases b mas01mc@292: for( kk = 0 ; kk < H::k/2 ; kk++ ) mas01mc@292: #else mas01mc@292: for( j = 0 ; j < H::L ; j++ ) // biases b mas01mc@292: for( kk = 0 ; kk < H::k ; kk++ ) mas01mc@292: #endif mas01mc@293: H::b[j][kk] = *pf++; mas01mc@292: mas01mc@292: pu = (Uns32T*)pf; mas01mc@292: for( j = 0 ; j < H::L ; j++ ) // Z projectors r1 mas01mc@292: for( kk = 0 ; kk < H::k ; kk++ ) mas01mc@292: H::r1[j][kk] = *pu++; mas01mc@292: mas01mc@292: for( j = 0 ; j < H::L ; j++ ) // Z projectors r2 mas01mc@292: for( kk = 0 ; kk < H::k ; kk++ ) mas01mc@292: H::r2[j][kk] = *pu++; mas01mc@292: mas01mc@293: serial_munmap(db, get_serial_hashtable_offset()); mas01mc@292: } mas01mc@292: mas01mc@292: void G::unserialize_lsh_hashtables_format1(int fid){ mas01mc@292: SerialElementT *pe, *pt; mas01mc@292: Uns32T x,y; mas01mc@292: Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols; mas01mc@292: // Read the hash tables into core mas01mc@292: for( x = 0 ; x < H::L ; x++ ){ mas01mc@292: // memory map a single hash table mas01mc@292: // Align each hash table to page boundary mas01mc@292: char* dbtable = serial_mmap(fid, hashTableSize, 0, mas01mc@292: align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn())); mas01mc@316: #ifdef __CYGWIN__ mas01mc@316: // No madvise in CYGWIN mas01mc@316: #else mas01mc@292: if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0) mas01mc@292: error("could not advise hashtable memory","","madvise"); mas01mc@316: #endif mas01mc@292: pt=(SerialElementT*)dbtable; mas01mc@292: for( y = 0 ; y < H::N ; y++ ){ mas01mc@292: // Move disk pointer to beginning of row mas01mc@292: pe=pt+y*lshHeader->numCols; mas01mc@292: unserialize_hashtable_row_format1(pe, h[x]+y); mas01mc@292: #ifdef __LSH_DUMP_CORE_TABLES__ mas01mc@292: printf("S[%d,%d]", x, y); mas01mc@292: serial_bucket_dump(pe); mas01mc@292: printf("C[%d,%d]", x, y); mas01mc@292: dump_hashtable_row(h[x][y]); mas01mc@292: #endif mas01mc@292: } mas01mc@292: serial_munmap(dbtable, hashTableSize); mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b){ mas01mc@292: Uns32T colCount = 0; mas01mc@292: while(colCount!=lshHeader->numCols && pe->hashValue !=IFLAG){ mas01mc@292: H::p = pe->pointID; // current point ID mas01mc@292: t2 = pe->hashValue; mas01mc@292: bucket_insert_point(b); mas01mc@292: pe++; mas01mc@292: colCount++; mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::unserialize_lsh_hashtables_format2(int fid){ mas01mc@292: Uns32T x=0,y=0; mas01mc@292: mas01mc@292: // Seek to hashtable base offset mas01mc@292: if(lseek(fid, get_serial_hashtable_offset(), SEEK_SET)!=get_serial_hashtable_offset()){ mas01mc@292: close(fid); mas01mc@292: error("Seek error in unserialize_lsh_hashtables_format2"); mas01mc@292: } mas01mc@292: mas01mc@292: // Read the hash tables into core (structure is given in header) mas01mc@292: while( x < H::L){ mas01mc@292: if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){ mas01mc@292: close(fid); mas01mc@292: error("Read error","unserialize_lsh_hashtables_format2()"); mas01mc@292: } mas01mc@306: if(H::t1==O2_SERIAL_TOKEN_ENDTABLE) mas01mc@292: x++; // End of table mas01mc@292: else mas01mc@292: while(y < H::N){ mas01mc@306: // Read a row and move file pointer to beginning of next row or _bittable mas01mc@306: if(!(H::t1==O2_SERIAL_TOKEN_T1)){ mas01mc@292: close(fid); mas01mc@306: error("State matchine error T1","unserialize_lsh_hashtables_format2()"); mas01mc@292: } mas01mc@306: if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){ mas01mc@306: close(fid); mas01mc@306: error("Read error: t1","unserialize_lsh_hashtables_format2()"); mas01mc@306: } mas01mc@306: y = H::t1; mas01mc@292: if(y>=H::N){ mas01mc@292: close(fid); mas01mc@292: error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()"); mas01mc@292: } mas01mc@292: Uns32T token = unserialize_hashtable_row_format2(fid, h[x]+y); mas01mc@292: mas01mc@292: #ifdef __LSH_DUMP_CORE_TABLES__ mas01mc@292: printf("C[%d,%d]", x, y); mas01mc@292: dump_hashtable_row(h[x][y]); mas01mc@292: #endif mas01mc@292: // Check that token is valid mas01mc@306: if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){ mas01mc@292: close(fid); mas01mc@292: error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()"); mas01mc@292: } mas01mc@292: // Check for end of table flag mas01mc@306: if(token==O2_SERIAL_TOKEN_ENDTABLE){ mas01mc@292: x++; mas01mc@292: break; mas01mc@292: } mas01mc@292: // Check for new row flag mas01mc@306: if(token==O2_SERIAL_TOKEN_T1) mas01mc@292: H::t1 = token; mas01mc@292: } mas01mc@292: } mas01mc@306: } mas01mc@292: mas01mc@292: Uns32T G::unserialize_hashtable_row_format2(int fid, bucket** b){ mas01mc@292: bool pointFound = false; mas01mc@292: if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){ mas01mc@292: close(fid); mas01mc@292: error("Read error T2 token","unserialize_hashtable_row_format2"); mas01mc@292: } mas01mc@306: if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){ mas01mc@292: close(fid); mas01mc@292: error("State machine error: expected E or T2"); mas01mc@292: } mas01mc@306: while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){ mas01mc@292: pointFound=false; mas01mc@292: // Check for T2 token mas01mc@306: if(H::t2!=O2_SERIAL_TOKEN_T2){ mas01mc@306: close(fid); mas01mc@292: error("State machine error T2 token", "unserialize_hashtable_row_format2()"); mas01mc@306: } mas01mc@292: // Read t2 value mas01mc@292: if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){ mas01mc@292: close(fid); mas01mc@292: error("Read error t2","unserialize_hashtable_row_format2"); mas01mc@292: } mas01mc@292: if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){ mas01mc@292: close(fid); mas01mc@292: error("Read error H::p","unserialize_hashtable_row_format2"); mas01mc@292: } mas01mc@306: while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){ mas01mc@292: pointFound=true; mas01mc@292: bucket_insert_point(b); mas01mc@292: if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){ mas01mc@292: close(fid); mas01mc@292: error("Read error H::p","unserialize_hashtable_row_format2"); mas01mc@292: } mas01mc@292: } mas01mc@292: if(!pointFound) mas01mc@292: error("State machine error: point", "unserialize_hashtable_row_format2()"); mas01mc@306: H::t2 = H::p; // Copy last found token to t2 mas01mc@292: } mas01mc@292: return H::t2; // holds current token mas01mc@292: } mas01mc@292: mas01mc@292: void G::dump_hashtable_row(bucket* p){ mas01mc@292: while(p && p->t2!=IFLAG){ mas01mc@292: sbucket* sbp = p->snext; mas01mc@292: while(sbp){ mas01mc@292: printf("(%0X,%u)", p->t2, sbp->pointID); mas01mc@292: fflush(stdout); mas01mc@292: sbp=sbp->snext; mas01mc@292: } mas01mc@292: p=p->next; mas01mc@292: } mas01mc@292: printf("\n"); mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@292: // G::serial_retrieve_point( ... ) mas01mc@292: // retrieves (pointID) from a serialized LSH database mas01mc@292: // mas01mc@292: // inputs: mas01mc@292: // filename - file name of serialized LSH database mas01mc@292: // vv - query point set mas01mc@292: // mas01mc@292: // outputs: mas01mc@292: // inserts retrieved points into add_point() callback method mas01mc@292: void G::serial_retrieve_point_set(char* filename, vector >& vv, ReporterCallbackPtr add_point, void* caller) mas01mc@292: { mas01mc@292: int dbfid = serial_open(filename, 0); // open for read mas01mc@292: char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer mas01mc@292: serial_get_header(dbheader); // read header mas01mc@292: serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap mas01mc@292: mas01mc@292: if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){ mas01mc@292: close(dbfid); mas01mc@292: error("serial_retrieve_point_set is for SERIAL_FILEFORMAT1 only"); mas01mc@292: } mas01mc@292: mas01mc@292: // size of each hash table mas01mc@292: Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols; mas01mc@292: calling_instance = caller; // class instance variable used in ...bucket_chain_point() mas01mc@292: add_point_callback = add_point; mas01mc@292: mas01mc@292: for(Uns32T j=0; jnumCols, qpos); // Point to correct row mas01mc@292: } mas01mc@292: serial_munmap(db, hashTableSize); // drop hashtable mmap mas01mc@292: } mas01mc@292: serial_close(dbfid); mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_retrieve_point(char* filename, vector& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){ mas01mc@292: int dbfid = serial_open(filename, 0); // open for read mas01mc@292: char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer mas01mc@292: serial_get_header(dbheader); // read header mas01mc@292: serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap mas01mc@292: mas01mc@292: if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){ mas01mc@292: close(dbfid); mas01mc@292: error("serial_retrieve_point is for SERIAL_FILEFORMAT1 only"); mas01mc@292: } mas01mc@292: mas01mc@292: // size of each hash table mas01mc@292: Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols; mas01mc@292: calling_instance = caller; mas01mc@292: add_point_callback = add_point; mas01mc@293: H::compute_hash_functions(v); mas01mc@292: for(Uns32T j=0; jnumCols, qpos); // Point to correct row mas01mc@292: serial_munmap(db, hashTableSize); // drop hashtable mmap mas01mc@292: } mas01mc@292: serial_close(dbfid); mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_dump_tables(char* filename){ mas01mc@292: int dbfid = serial_open(filename, 0); // open for read mas01mc@292: char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer mas01mc@292: serial_get_header(dbheader); // read header mas01mc@292: serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap mas01mc@292: Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols; mas01mc@292: for(Uns32T j=0; jnumCols; mas01mc@292: }while(pe<(SerialElementT*)db+lshHeader->numRows*lshHeader->numCols); mas01mc@292: } mas01mc@292: mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_bucket_dump(SerialElementT* pe){ mas01mc@292: SerialElementT* pend = pe+lshHeader->numCols; mas01mc@292: while( !(pe->hashValue==IFLAG || pe==pend ) ){ mas01mc@292: printf("(%0X,%u)",pe->hashValue,pe->pointID); mas01mc@292: pe++; mas01mc@292: } mas01mc@292: printf("\n"); mas01mc@292: fflush(stdout); mas01mc@292: } mas01mc@292: mas01mc@292: void G::serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos){ mas01mc@292: SerialElementT* pend = pe+lshHeader->numCols; mas01mc@292: while( !(pe->hashValue==IFLAG || pe==pend ) ){ mas01mc@292: if(pe->hashValue==t2){ // new match mas01mc@292: add_point_callback(calling_instance, pe->pointID, qpos, radius); mas01mc@292: } mas01mc@292: pe++; mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::bucket_chain_point(bucket* p, Uns32T qpos){ mas01mc@292: if(!p || p->t2==IFLAG) mas01mc@292: return; mas01mc@292: if(p->t2==t2){ // match mas01mc@292: sbucket_chain_point(p->snext, qpos); // add to reporter mas01mc@292: } mas01mc@292: if(p->next){ mas01mc@292: bucket_chain_point(p->next, qpos); // recurse mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::sbucket_chain_point(sbucket* p, Uns32T qpos){ mas01mc@292: add_point_callback(calling_instance, p->pointID, qpos, radius); mas01mc@292: if(p->snext){ mas01mc@292: sbucket_chain_point(p->snext, qpos); mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::get_lock(int fd, bool exclusive) { mas01mc@292: struct flock lock; mas01mc@292: int status; mas01mc@292: lock.l_type = exclusive ? F_WRLCK : F_RDLCK; mas01mc@292: lock.l_whence = SEEK_SET; mas01mc@292: lock.l_start = 0; mas01mc@292: lock.l_len = 0; /* "the whole file" */ mas01mc@292: retry: mas01mc@292: do { mas01mc@292: status = fcntl(fd, F_SETLKW, &lock); mas01mc@292: } while (status != 0 && errno == EINTR); mas01mc@292: if (status) { mas01mc@292: if (errno == EAGAIN) { mas01mc@292: sleep(1); mas01mc@292: goto retry; mas01mc@292: } else { mas01mc@292: error("fcntl lock error", "", "fcntl"); mas01mc@292: } mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: void G::release_lock(int fd) { mas01mc@292: struct flock lock; mas01mc@292: int status; mas01mc@292: mas01mc@292: lock.l_type = F_UNLCK; mas01mc@292: lock.l_whence = SEEK_SET; mas01mc@292: lock.l_start = 0; mas01mc@292: lock.l_len = 0; mas01mc@292: mas01mc@292: status = fcntl(fd, F_SETLKW, &lock); mas01mc@292: mas01mc@292: if (status) mas01mc@292: error("fcntl unlock error", "", "fcntl"); mas01mc@292: }