annotate lshlib.cpp @ 340:a6edbe97fddf

Added LSH_CORE_ARRAY structure for hashtables instead of linked lists. Maintained Backwards Compatibiliity with indexes build for linked list format. Added tests for indexing and merging. Tested backwards compatibility OK.\n\n The purpose of the LSH_CORE_ARRAY data structure is greater space efficiency and L1/2 cache usage. Essential for multiple indexes with multiple hashtables in RAM
author mas01mc
date Wed, 10 Sep 2008 18:55:16 +0000
parents fe4d5b763086
children fdeb8db97678
rev   line source
mas01mc@292 1 #include "lshlib.h"
mas01mc@292 2
mas01mc@340 3 //#define LSH_DUMP_CORE_TABLES
mas01mc@292 4 //#define USE_U_FUNCTIONS
mas01mc@340 5
mas01mc@340 6 // Use backward-compatible CORE ARRAY lsh index
mas01mc@340 7 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists
mas01mc@340 8 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads
mas01mc@340 9
mas01mc@340 10 // Critical path logic
mas01mc@340 11 #if defined LSH_CORE_ARRAY && !defined LSH_LIST_HEAD_COUNTERS
mas01mc@340 12 #define LSH_LIST_HEAD_COUNTERS
mas01mc@340 13 #endif
mas01mc@340 14
mas01mc@340 15 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head
mas01mc@340 16
mas01mc@292 17
mas01mc@292 18 void err(char*s){cout << s << endl;exit(2);}
mas01mc@292 19
mas01mc@292 20 Uns32T get_page_logn(){
mas01mc@292 21 int pagesz = (int)sysconf(_SC_PAGESIZE);
mas01mc@292 22 return (Uns32T)log2((double)pagesz);
mas01mc@292 23 }
mas01mc@292 24
mas01mc@292 25 unsigned align_up(unsigned x, unsigned w){ return ((x) + ((1<<w)-1) & ~((1<<w)-1)); }
mas01mc@292 26
mas01mc@292 27 void H::error(const char* a, const char* b, const char *sysFunc) {
mas01mc@292 28 cerr << a << ": " << b << endl;
mas01mc@292 29 if (sysFunc) {
mas01mc@292 30 perror(sysFunc);
mas01mc@292 31 }
mas01mc@292 32 exit(1);
mas01mc@292 33 }
mas01mc@292 34
mas01mc@293 35 H::H(){
mas01mc@293 36 // Delay initialization of lsh functions until we know the parameters
mas01mc@293 37 }
mas01mc@293 38
mas01mc@293 39 H::H(Uns32T kk, Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float ww, float rr):
mas01mc@292 40 #ifdef USE_U_FUNCTIONS
mas01mc@292 41 use_u_functions(true),
mas01mc@292 42 #else
mas01mc@292 43 use_u_functions(false),
mas01mc@292 44 #endif
mas01mc@293 45 maxp(0),
mas01mc@292 46 bucketCount(0),
mas01mc@292 47 pointCount(0),
mas01mc@292 48 N(NN),
mas01mc@292 49 C(CC),
mas01mc@292 50 k(kk),
mas01mc@292 51 m(mm),
mas01mc@293 52 L((mm*(mm-1))/2),
mas01mc@293 53 d(dd),
mas01mc@293 54 w(ww),
mas01mc@293 55 radius(rr)
mas01mc@292 56 {
mas01mc@292 57
mas01mc@292 58 if(m<2){
mas01mc@292 59 m=2;
mas01mc@292 60 L=1; // check value of L
mas01mc@292 61 cout << "warning: setting m=2, L=1" << endl;
mas01mc@292 62 }
mas01mc@292 63 if(use_u_functions && k%2){
mas01mc@292 64 k++; // make sure k is even
mas01mc@292 65 cout << "warning: setting k even" << endl;
mas01mc@292 66 }
mas01mc@293 67
mas01mc@293 68 // We have the necessary parameters, so construct hashfunction datastructures
mas01mc@293 69 initialize_lsh_functions();
mas01mc@292 70 }
mas01mc@292 71
mas01mc@293 72 void H::initialize_lsh_functions(){
mas01mc@292 73 H::P = UH_PRIME_DEFAULT;
mas01mc@292 74
mas01mc@292 75 /* FIXME: don't use time(); instead use /dev/random or similar */
mas01mc@292 76 /* FIXME: write out the seed somewhere, so that we can get
mas01mc@292 77 repeatability */
mas01mc@292 78 #ifdef MT19937
mas01mc@292 79 init_genrand(time(NULL));
mas01mc@292 80 #else
mas01mc@292 81 srand(time(NULL)); // seed random number generator
mas01mc@292 82 #endif
mas01mc@293 83 Uns32T i,j, kk;
mas01mc@293 84 #ifdef USE_U_FUNCTIONS
mas01mc@293 85 H::A = new float**[ H::m ]; // m x k x d random projectors
mas01mc@293 86 H::b = new float*[ H::m ]; // m x k random biases
mas01mc@293 87 #else
mas01mc@293 88 H::A = new float**[ H::L ]; // m x k x d random projectors
mas01mc@293 89 H::b = new float*[ H::L ]; // m x k random biases
mas01mc@293 90 #endif
mas01mc@293 91 H::g = new Uns32T*[ H::L ]; // L x k random projections
mas01mc@293 92 assert( H::g && H::A && H::b ); // failure
mas01mc@293 93 #ifdef USE_U_FUNCTIONS
mas01mc@293 94 // Use m \times u_i functions \in R^{(k/2) \times (d)}
mas01mc@293 95 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
mas01mc@293 96 for( j = 0; j < H::m ; j++ ){ // m functions u_i(v)
mas01mc@293 97 H::A[j] = new float*[ H::k/2 ]; // k/2 x d 2-stable distribution coefficients
mas01mc@293 98 H::b[j] = new float[ H::k/2 ]; // bias
mas01mc@293 99 assert( H::A[j] && H::b[j] ); // failure
mas01mc@293 100 for( kk = 0; kk < H::k/2 ; kk++ ){
mas01mc@293 101 H::A[j][kk] = new float[ H::d ];
mas01mc@293 102 assert( H::A[j][kk] ); // failure
mas01mc@293 103 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 104 H::A[j][kk][i] = H::randn(); // Normal
mas01mc@293 105 H::b[j][kk] = H::ranf()*H::w; // Uniform
mas01mc@293 106 }
mas01mc@293 107 }
mas01mc@293 108 #else
mas01mc@293 109 // Use m \times u_i functions \in R^{k \times (d)}
mas01mc@293 110 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
mas01mc@293 111 for( j = 0; j < H::L ; j++ ){ // m functions u_i(v)
mas01mc@293 112 H::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients
mas01mc@293 113 H::b[j] = new float[ H::k ]; // bias
mas01mc@293 114 assert( H::A[j] && H::b[j] ); // failure
mas01mc@293 115 for( kk = 0; kk < H::k ; kk++ ){
mas01mc@293 116 H::A[j][kk] = new float[ H::d ];
mas01mc@293 117 assert( H::A[j][kk] ); // failure
mas01mc@293 118 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 119 H::A[j][kk][i] = H::randn(); // Normal
mas01mc@293 120 H::b[j][kk] = H::ranf()*H::w; // Uniform
mas01mc@293 121 }
mas01mc@293 122 }
mas01mc@293 123 #endif
mas01mc@293 124
mas01mc@293 125 // Storage for LSH hash function output (Uns32T)
mas01mc@293 126 for( j = 0 ; j < H::L ; j++ ){ // L functions g_j(u_a, u_b) a,b \in nchoosek(m,2)
mas01mc@293 127 H::g[j] = new Uns32T[ H::k ]; // k x 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z
mas01mc@293 128 assert( H::g[j] );
mas01mc@292 129 }
mas01mc@292 130
mas01mc@293 131 // LSH Hash tables
mas01mc@293 132 H::h = new bucket**[ H::L ];
mas01mc@293 133 assert( H::h );
mas01mc@292 134 for( j = 0 ; j < H::L ; j++ ){
mas01mc@292 135 H::h[j] = new bucket*[ H::N ];
mas01mc@292 136 assert( H::h[j] );
mas01mc@292 137 for( i = 0 ; i < H::N ; i++)
mas01mc@292 138 H::h[j][i] = 0;
mas01mc@292 139 }
mas01mc@293 140
mas01mc@293 141 // Standard hash functions
mas01mc@293 142 H::r1 = new Uns32T*[ H::L ];
mas01mc@293 143 H::r2 = new Uns32T*[ H::L ];
mas01mc@293 144 assert( H::r1 && H::r2 ); // failure
mas01mc@293 145 for( j = 0 ; j < H::L ; j++ ){
mas01mc@293 146 H::r1[ j ] = new Uns32T[ H::k ];
mas01mc@293 147 H::r2[ j ] = new Uns32T[ H::k ];
mas01mc@293 148 assert( H::r1[j] && H::r2[j] ); // failure
mas01mc@293 149 for( i = 0; i<H::k; i++){
mas01mc@293 150 H::r1[j][i] = randr();
mas01mc@293 151 H::r2[j][i] = randr();
mas01mc@293 152 }
mas01mc@293 153 }
mas01mc@293 154
mas01mc@293 155 // Storage for whole or partial function evaluation depdenting on USE_U_FUNCTIONS
mas01mc@293 156 H::initialize_partial_functions();
mas01mc@293 157 }
mas01mc@293 158
mas01mc@293 159 void H::initialize_partial_functions(){
mas01mc@293 160
mas01mc@293 161 #ifdef USE_U_FUNCTIONS
mas01mc@293 162 H::uu = vector<vector<Uns32T> >(H::m);
mas01mc@293 163 for( Uns32T aa=0 ; aa < H::m ; aa++ )
mas01mc@293 164 H::uu[aa] = vector<Uns32T>( H::k/2 );
mas01mc@293 165 #else
mas01mc@293 166 H::uu = vector<vector<Uns32T> >(H::L);
mas01mc@293 167 for( Uns32T aa=0 ; aa < H::L ; aa++ )
mas01mc@293 168 H::uu[aa] = vector<Uns32T>( H::k );
mas01mc@293 169 #endif
mas01mc@293 170 }
mas01mc@293 171
mas01mc@293 172
mas01mc@293 173 // Generate z ~ N(0,1)
mas01mc@293 174 float H::randn(){
mas01mc@293 175 // Box-Muller
mas01mc@293 176 float x1, x2;
mas01mc@293 177 do{
mas01mc@293 178 x1 = ranf();
mas01mc@293 179 } while (x1 == 0); // cannot take log of 0
mas01mc@293 180 x2 = ranf();
mas01mc@293 181 float z;
mas01mc@293 182 z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2);
mas01mc@293 183 return z;
mas01mc@293 184 }
mas01mc@293 185
mas01mc@293 186 float H::ranf(){
mas01mc@293 187 #ifdef MT19937
mas01mc@293 188 return (float) genrand_real2();
mas01mc@293 189 #else
mas01mc@293 190 return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) );
mas01mc@293 191 #endif
mas01mc@293 192 }
mas01mc@293 193
mas01mc@293 194 // range is 1..2^29
mas01mc@293 195 /* FIXME: that looks like an ... odd range. Still. */
mas01mc@293 196 Uns32T H::randr(){
mas01mc@293 197 #ifdef MT19937
mas01mc@293 198 return (Uns32T)((genrand_int32() >> 3) + 1);
mas01mc@293 199 #else
mas01mc@293 200 return (Uns32T) ((rand() >> 2) + 1);
mas01mc@293 201 #endif
mas01mc@292 202 }
mas01mc@292 203
mas01mc@292 204 // Destruct hash tables
mas01mc@292 205 H::~H(){
mas01mc@293 206 Uns32T i,j,kk;
mas01mc@340 207 bucket** pp;
mas01mc@293 208 #ifdef USE_U_FUNCTIONS
mas01mc@293 209 for( j = 0 ; j < H::m ; j++ ){
mas01mc@293 210 for( kk = 0 ; kk < H::k/2 ; kk++ )
mas01mc@293 211 delete[] A[j][kk];
mas01mc@293 212 delete[] A[j];
mas01mc@293 213 }
mas01mc@293 214 delete[] A;
mas01mc@293 215 for( j = 0 ; j < H::m ; j++ )
mas01mc@293 216 delete[] b[j];
mas01mc@293 217 delete[] b;
mas01mc@293 218 #else
mas01mc@293 219 for( j = 0 ; j < H::L ; j++ ){
mas01mc@293 220 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@293 221 delete[] A[j][kk];
mas01mc@293 222 delete[] A[j];
mas01mc@293 223 }
mas01mc@293 224 delete[] A;
mas01mc@293 225 for( j = 0 ; j < H::L ; j++ )
mas01mc@293 226 delete[] b[j];
mas01mc@293 227 delete[] b;
mas01mc@293 228 #endif
mas01mc@293 229
mas01mc@293 230 for( j = 0 ; j < H::L ; j++ )
mas01mc@293 231 delete[] g[j];
mas01mc@293 232 delete[] g;
mas01mc@292 233 for( j=0 ; j < H::L ; j++ ){
mas01mc@292 234 delete[] H::r1[ j ];
mas01mc@292 235 delete[] H::r2[ j ];
mas01mc@340 236 for(i = 0; i< H::N ; i++){
mas01mc@340 237 pp = 0;
mas01mc@340 238 #ifdef LSH_CORE_ARRAY
mas01mc@340 239 if(H::h[ j ][ i ])
mas01mc@340 240 if(H::h[ j ][ i ]->t2 & LSH_CORE_ARRAY_BIT){
mas01mc@340 241 pp = get_pointer_to_bucket_linked_list(H::h[ j ][ i ]);
mas01mc@340 242 if(*pp){
mas01mc@340 243 (*pp)->snext=0; // Head of list uses snext as a non-pointer value
mas01mc@340 244 delete *pp; // Now the destructor can do its work properly
mas01mc@340 245 }
mas01mc@340 246 free(H::h[ j ][ i ]->next);
mas01mc@340 247 H::h[ j ][ i ]->next = 0; // Zero next pointer
mas01mc@340 248 H::h[ j ][ i ]->snext = 0; // Zero head-of-list pointer as above
mas01mc@340 249 }
mas01mc@340 250 #endif
mas01mc@292 251 delete H::h[ j ][ i ];
mas01mc@340 252 }
mas01mc@292 253 delete[] H::h[ j ];
mas01mc@292 254 }
mas01mc@292 255 delete[] H::r1;
mas01mc@292 256 delete[] H::r2;
mas01mc@292 257 delete[] H::h;
mas01mc@292 258 }
mas01mc@292 259
mas01mc@292 260
mas01mc@293 261 // Compute all hash functions for vector v
mas01mc@293 262 // #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d}
mas01mc@293 263 // to make L \times g_j functions \in Z^k
mas01mc@293 264 void H::compute_hash_functions(vector<float>& v){ // v \in R^d
mas01mc@293 265 float iw = 1. / H::w; // hash bucket width
mas01mc@293 266 Uns32T aa, kk;
mas01mc@293 267 if( v.size() != H::d )
mas01mc@293 268 error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality
mas01mc@293 269 double tmp = 0;
mas01mc@293 270 float *pA, *pb;
mas01mc@293 271 Uns32T *pg;
mas01mc@293 272 int dd;
mas01mc@293 273 vector<float>::iterator vi;
mas01mc@293 274 vector<Uns32T>::iterator ui;
mas01mc@293 275
mas01mc@293 276 #ifdef USE_U_FUNCTIONS
mas01mc@293 277 Uns32T bb;
mas01mc@293 278 // Store m dot products to expand
mas01mc@293 279 for( aa=0; aa < H::m ; aa++ ){
mas01mc@293 280 ui = H::uu[aa].begin();
mas01mc@293 281 for( kk = 0 ; kk < H::k/2 ; kk++ ){
mas01mc@293 282 pb = *( H::b + aa ) + kk;
mas01mc@293 283 pA = * ( * ( H::A + aa ) + kk );
mas01mc@293 284 dd = H::d;
mas01mc@293 285 tmp = 0.;
mas01mc@293 286 vi = v.begin();
mas01mc@293 287 while( dd-- )
mas01mc@293 288 tmp += *pA++ * *vi++; // project
mas01mc@293 289 tmp += *pb; // translate
mas01mc@293 290 tmp *= iw; // scale
mas01mc@293 291 *ui++ = (Uns32T) floor(tmp); // floor
mas01mc@293 292 }
mas01mc@293 293 }
mas01mc@293 294 // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d}
mas01mc@293 295 Uns32T j;
mas01mc@293 296 for( aa=0, j=0 ; aa < H::m-1 ; aa++ )
mas01mc@293 297 for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){
mas01mc@293 298 pg= *( H::g + j ); // L \times functions g_j(v) \in Z^k
mas01mc@293 299 // u_1 \in Z^{(k/2) \times d}
mas01mc@293 300 ui = H::uu[aa].begin();
mas01mc@293 301 kk=H::k/2;
mas01mc@293 302 while( kk-- )
mas01mc@293 303 *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z
mas01mc@293 304 // u_2 \in Z^{(k/2) \times d}
mas01mc@293 305 ui = H::uu[bb].begin();
mas01mc@293 306 kk=H::k/2;
mas01mc@293 307 while( kk--)
mas01mc@293 308 *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z
mas01mc@293 309 }
mas01mc@293 310 #else
mas01mc@293 311 for( aa=0; aa < H::L ; aa++ ){
mas01mc@293 312 ui = H::uu[aa].begin();
mas01mc@293 313 for( kk = 0 ; kk < H::k ; kk++ ){
mas01mc@293 314 pb = *( H::b + aa ) + kk;
mas01mc@293 315 pA = * ( * ( H::A + aa ) + kk );
mas01mc@293 316 dd = H::d;
mas01mc@293 317 tmp = 0.;
mas01mc@293 318 vi = v.begin();
mas01mc@293 319 while( dd-- )
mas01mc@293 320 tmp += *pA++ * *vi++; // project
mas01mc@293 321 tmp += *pb; // translate
mas01mc@293 322 tmp *= iw; // scale
mas01mc@293 323 *ui++ = (Uns32T) (floor(tmp)); // floor
mas01mc@293 324 }
mas01mc@293 325 }
mas01mc@293 326 // Compute hash functions
mas01mc@293 327 for( aa=0 ; aa < H::L ; aa++ ){
mas01mc@293 328 pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k
mas01mc@293 329 // u_1 \in Z^{k \times d}
mas01mc@293 330 ui = H::uu[aa].begin();
mas01mc@293 331 kk=H::k;
mas01mc@293 332 while( kk-- )
mas01mc@293 333 *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z
mas01mc@293 334 }
mas01mc@293 335 #endif
mas01mc@293 336 }
mas01mc@293 337
mas01mc@292 338 // make hash value \in Z
mas01mc@293 339 void H::generate_hash_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){
mas01mc@293 340 H::t1 = computeProductModDefaultPrime( g, r1, H::k ) % H::N;
mas01mc@293 341 H::t2 = computeProductModDefaultPrime( g, r2, H::k );
mas01mc@292 342 }
mas01mc@292 343
mas01mc@292 344 #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}}
mas01mc@292 345
mas01mc@292 346 // Computes (a.b) mod UH_PRIME_DEFAULT
mas01mc@293 347 inline Uns32T H::computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){
mas01mc@292 348 LongUns64T h = 0;
mas01mc@292 349
mas01mc@292 350 for(IntT i = 0; i < size; i++){
mas01mc@292 351 h = h + (LongUns64T)a[i] * (LongUns64T)b[i];
mas01mc@292 352 h = (h & TWO_TO_32_MINUS_1) + 5 * (h >> 32);
mas01mc@292 353 if (h >= UH_PRIME_DEFAULT) {
mas01mc@292 354 h = h - UH_PRIME_DEFAULT;
mas01mc@292 355 }
mas01mc@292 356 CR_ASSERT(h < UH_PRIME_DEFAULT);
mas01mc@292 357 }
mas01mc@292 358 return h;
mas01mc@292 359 }
mas01mc@292 360
mas01mc@292 361 Uns32T H::bucket_insert_point(bucket **pp){
mas01mc@296 362 collisionCount = 0;
mas01mc@340 363 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@292 364 if(!*pp){
mas01mc@292 365 *pp = new bucket();
mas01mc@292 366 (*pp)->t2 = 0; // Use t2 as a collision counter for the row
mas01mc@292 367 (*pp)->next = new bucket();
mas01mc@292 368 }
mas01mc@340 369 // The list head holds point collision count
mas01mc@340 370 if( (*pp)->t2 & LSH_CORE_ARRAY_BIT )
mas01mc@340 371 bucket_insert_point(get_pointer_to_bucket_linked_list(*pp)); // recurse
mas01mc@340 372 else{
mas01mc@340 373 collisionCount = (*pp)->t2;
mas01mc@340 374 if(collisionCount < H::C){ // Block if row is full
mas01mc@340 375 (*pp)->t2++; // Increment collision counter (numPoints in row)
mas01mc@340 376 pointCount++;
mas01mc@340 377 collisionCount++;
mas01mc@340 378 // Locate the bucket linked list
mas01mc@340 379 __bucket_insert_point( (*pp)->next );
mas01mc@340 380 }
mas01mc@292 381 }
mas01mc@340 382 #else // NOT USING LSH_LIST_HEAD_COUNTERS
mas01mc@340 383 if(!*pp)
mas01mc@340 384 *pp = new bucket();
mas01mc@296 385 pointCount++;
mas01mc@296 386 __bucket_insert_point(*pp); // No collision count storage
mas01mc@292 387 #endif
mas01mc@292 388 return collisionCount;
mas01mc@292 389 }
mas01mc@292 390
mas01mc@340 391 // insert points into hashtable row collision chain
mas01mc@292 392 void H::__bucket_insert_point(bucket* p){
mas01mc@292 393 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2?
mas01mc@292 394 p->t2 = H::t2;
mas01mc@292 395 bucketCount++; // Record start of new point-locale collision chain
mas01mc@292 396 p->snext = new sbucket();
mas01mc@292 397 __sbucket_insert_point(p->snext);
mas01mc@292 398 return;
mas01mc@292 399 }
mas01mc@292 400
mas01mc@292 401 if(p->t2 == H::t2){
mas01mc@292 402 __sbucket_insert_point(p->snext);
mas01mc@292 403 return;
mas01mc@292 404 }
mas01mc@292 405
mas01mc@292 406 if(p->next){
mas01mc@340 407 // Construct list in t2 order
mas01mc@340 408 if(H::t2 < p->next->t2){
mas01mc@340 409 bucket* tmp = new bucket();
mas01mc@340 410 tmp->next = p->next;
mas01mc@340 411 p->next = tmp;
mas01mc@340 412 __bucket_insert_point(tmp);
mas01mc@340 413 }
mas01mc@340 414 else
mas01mc@340 415 __bucket_insert_point(p->next);
mas01mc@292 416 }
mas01mc@340 417 else {
mas01mc@292 418 p->next = new bucket();
mas01mc@292 419 __bucket_insert_point(p->next);
mas01mc@292 420 }
mas01mc@292 421 }
mas01mc@292 422
mas01mc@340 423 // insert points into point-locale collision chain
mas01mc@292 424 void H::__sbucket_insert_point(sbucket* p){
mas01mc@292 425 if(p->pointID==IFLAG){
mas01mc@292 426 p->pointID = H::p;
mas01mc@292 427 return;
mas01mc@292 428 }
mas01mc@340 429
mas01mc@292 430 // Search for pointID
mas01mc@340 431 if(p->snext){
mas01mc@292 432 __sbucket_insert_point(p->snext);
mas01mc@292 433 }
mas01mc@292 434 else{
mas01mc@340 435 // Make new point collision bucket at end of list
mas01mc@340 436 p->snext = new sbucket();
mas01mc@340 437 __sbucket_insert_point(p->snext);
mas01mc@292 438 }
mas01mc@292 439 }
mas01mc@292 440
mas01mc@293 441 inline bucket** H::get_bucket(int j){
mas01mc@292 442 return *(h+j);
mas01mc@292 443 }
mas01mc@292 444
mas01mc@340 445 // Find the linked-list pointer at the end of the CORE_ARRAY
mas01mc@340 446 bucket** H::get_pointer_to_bucket_linked_list(bucket* rowPtr){
mas01mc@340 447 Uns32T numBuckets = (Uns32T) rowPtr->snext; // Cast pointer to unsigned int
mas01mc@340 448 Uns32T numPoints = rowPtr->t2 & 0x7FFFFFFF; // Value is stored in low 31 bits of t2 field
mas01mc@340 449 bucket** listPtr = reinterpret_cast<bucket**> (reinterpret_cast<unsigned int*>(rowPtr->next)+numPoints+numBuckets+1);
mas01mc@340 450 return listPtr;
mas01mc@340 451 }
mas01mc@340 452
mas01mc@293 453 // Interface to Locality Sensitive Hashing G
mas01mc@293 454 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr):
mas01mc@293 455 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures
mas01mc@308 456 indexName(0),
mas01mc@293 457 lshHeader(0),
mas01mc@292 458 calling_instance(0),
mas01mc@293 459 add_point_callback(0)
mas01mc@292 460 {
mas01mc@293 461
mas01mc@292 462 }
mas01mc@292 463
mas01mc@292 464 // Serialize from file LSH constructor
mas01mc@292 465 // Read parameters from database file
mas01mc@292 466 // Load the hash functions, close the database
mas01mc@292 467 // Optionally load the LSH tables into head-allocated lists in core
mas01mc@292 468 G::G(char* filename, bool lshInCoreFlag):
mas01mc@293 469 H(), // default base-class constructor call delays data-structure initialization
mas01mc@309 470 indexName(0),
mas01mc@293 471 lshHeader(0),
mas01mc@292 472 calling_instance(0),
mas01mc@292 473 add_point_callback(0)
mas01mc@292 474 {
mas01mc@292 475 int dbfid = unserialize_lsh_header(filename);
mas01mc@309 476 indexName = new char[O2_INDEX_MAXSTR];
mas01mc@309 477 strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER
mas01mc@293 478 H::initialize_lsh_functions(); // Base-class data-structure initialization
mas01mc@293 479 unserialize_lsh_functions(dbfid); // populate with on-disk hashfunction values
mas01mc@292 480
mas01mc@292 481 // Format1 only needs unserializing if specifically requested
mas01mc@292 482 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2) && lshInCoreFlag){
mas01mc@292 483 unserialize_lsh_hashtables_format1(dbfid);
mas01mc@292 484 }
mas01mc@292 485
mas01mc@292 486 // Format2 always needs unserializing
mas01mc@292 487 if(lshHeader->flags&O2_SERIAL_FILEFORMAT2 && lshInCoreFlag){
mas01mc@336 488 FILE* dbFile = fdopen(dbfid, "rb");
mas01mc@336 489 if(!dbFile)
mas01mc@336 490 error("Cannot open LSH file for reading", filename);
mas01mc@336 491 unserialize_lsh_hashtables_format2(dbFile);
mas01mc@292 492 }
mas01mc@336 493 serial_close(dbfid);
mas01mc@336 494 }
mas01mc@292 495
mas01mc@292 496 G::~G(){
mas01mc@292 497 delete lshHeader;
mas01mc@292 498 }
mas01mc@292 499
mas01mc@292 500 // single point insertion; inserted values are hash value and pointID
mas01mc@292 501 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
mas01mc@292 502 Uns32T collisionCount = 0;
mas01mc@292 503 H::p = pp;
mas01mc@299 504 if(H::maxp && pp<=H::maxp)
mas01mc@296 505 error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
mas01mc@296 506 H::maxp=pp; // Store highest pointID in database
mas01mc@293 507 H::compute_hash_functions( v );
mas01mc@292 508 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
mas01mc@293 509 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
mas01mc@292 510 collisionCount += bucket_insert_point( *(h + j) + t1 );
mas01mc@292 511 }
mas01mc@292 512 return collisionCount;
mas01mc@292 513 }
mas01mc@292 514
mas01mc@292 515
mas01mc@292 516 // batch insert for a point set
mas01mc@292 517 // inserted values are vector hash value and pointID starting at basePointID
mas01mc@292 518 void G::insert_point_set(vector<vector<float> >& vv, Uns32T basePointID){
mas01mc@292 519 for(Uns32T point=0; point<vv.size(); point++)
mas01mc@292 520 insert_point(vv[point], basePointID+point);
mas01mc@292 521 }
mas01mc@292 522
mas01mc@292 523 // point retrieval routine
mas01mc@292 524 void G::retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 525 calling_instance = caller;
mas01mc@292 526 add_point_callback = add_point;
mas01mc@293 527 H::compute_hash_functions( v );
mas01mc@292 528 for(Uns32T j = 0 ; j < H::L ; j++ ){
mas01mc@293 529 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
mas01mc@293 530 if( bucket* bPtr = *(get_bucket(j) + get_t1()) )
mas01mc@340 531 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@340 532 if(bPtr->t2&LSH_CORE_ARRAY_BIT)
mas01mc@340 533 retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos);
mas01mc@340 534 else
mas01mc@340 535 bucket_chain_point( bPtr->next, qpos);
mas01mc@292 536 #else
mas01mc@292 537 bucket_chain_point( bPtr , qpos);
mas01mc@292 538 #endif
mas01mc@292 539 }
mas01mc@292 540 }
mas01mc@292 541
mas01mc@292 542 void G::retrieve_point_set(vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 543 for(Uns32T qpos = 0 ; qpos < vv.size() ; qpos++ )
mas01mc@292 544 retrieve_point(vv[qpos], qpos, add_point, caller);
mas01mc@292 545 }
mas01mc@292 546
mas01mc@292 547 // export lsh tables to table structure on disk
mas01mc@292 548 //
mas01mc@292 549 // LSH TABLE STRUCTURE
mas01mc@292 550 // ---header 64 bytes ---
mas01mc@292 551 // [magic #tables #rows #cols elementSize databaseSize version flags dim #funs 0 0 0 0 0 0]
mas01mc@292 552 //
mas01mc@292 553 // ---random projections L x k x d float ---
mas01mc@292 554 // A[0][0][0] A[0][0][1] ... A[0][0][d-1]
mas01mc@292 555 // A[0][1][0] A[0][1][1] ... A[1][1][d-1]
mas01mc@292 556 // ...
mas01mc@292 557 // A[0][K-1][0] A[0][1][1] ... A[0][k-1][d-1]
mas01mc@292 558 // ...
mas01mc@292 559 // ...
mas01mc@292 560 // A[L-1][0][0] A[M-1][0][1] ... A[L-1][0][d-1]
mas01mc@292 561 // A[L-1][1][0] A[M-1][1][1] ... A[L-1][1][d-1]
mas01mc@292 562 // ...
mas01mc@292 563 // A[L-1][k-1][0] A[M-1][1][1] ... A[L-1][k-1][d-1]
mas01mc@292 564 //
mas01mc@292 565 // ---bias L x k float ---
mas01mc@292 566 // b[0][0] b[0][1] ... b[0][k-1]
mas01mc@292 567 // b[1][0] b[1][1] ... b[1][k-1]
mas01mc@292 568 // ...
mas01mc@292 569 // b[L-1][0] b[L-1][1] ... b[L-1][k-1]
mas01mc@292 570 //
mas01mc@292 571 // ---random r1 L x k float ---
mas01mc@292 572 // r1[0][0] r1[0][1] ... r1[0][k-1]
mas01mc@292 573 // r1[1][0] r1[1][1] ... r1[1][k-1]
mas01mc@292 574 // ...
mas01mc@292 575 // r1[L-1][0] r1[L-1][1] ... r1[L-1][k-1]
mas01mc@292 576 //
mas01mc@292 577 // ---random r2 L x k float ---
mas01mc@292 578 // r2[0][0] r2[0][1] ... r2[0][k-1]
mas01mc@292 579 // r2[1][0] r2[1][1] ... r2[1][k-1]
mas01mc@292 580 // ...
mas01mc@292 581 // r2[L-1][0] r2[L-1][1] ... r2[L-1][k-1]
mas01mc@292 582 //
mas01mc@293 583 // ******* HASHTABLES FORMAT1 (optimized for LSH_ON_DISK retrieval) *******
mas01mc@292 584 // ---hash table 0: N x C x 8 ---
mas01mc@292 585 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 586 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 587 // ...
mas01mc@292 588 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 589 //
mas01mc@292 590 // ---hash table 1: N x C x 8 ---
mas01mc@292 591 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 592 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 593 // ...
mas01mc@292 594 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 595 //
mas01mc@292 596 // ...
mas01mc@292 597 //
mas01mc@292 598 // ---hash table L-1: N x C x 8 ---
mas01mc@292 599 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 600 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 601 // ...
mas01mc@292 602 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 603 //
mas01mc@293 604 // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) *******
mas01mc@293 605 //
mas01mc@293 606 // State machine controlled by regular expression.
mas01mc@293 607 // legend:
mas01mc@293 608 //
mas01mc@306 609 // O2_SERIAL_TOKEN_T1 = 0xFFFFFFFCU
mas01mc@306 610 // O2_SERIAL_TOKEN_T2 = 0xFFFFFFFDU
mas01mc@306 611 // O2_SERIAL_TOKEN_ENDTABLE = 0xFFFFFFFEU
mas01mc@293 612 //
mas01mc@306 613 // T1 - T1 hash token
mas01mc@306 614 // t1 - t1 hash key (t1 range 0..2^29-1)
mas01mc@306 615 // T2 - T2 token
mas01mc@293 616 // t2 - t2 hash key (range 1..2^32-6)
mas01mc@293 617 // p - point identifier (range 0..2^32-1)
mas01mc@306 618 // E - end hash table token
mas01mc@293 619 // {...} required arguments
mas01mc@293 620 // [...] optional arguments
mas01mc@293 621 // * - match zero or more occurences
mas01mc@293 622 // + - match one or more occurences
mas01mc@293 623 // {...}^L - repeat argument L times
mas01mc@293 624 //
mas01mc@293 625 // FORMAT2 Regular expression:
mas01mc@306 626 // { [T1 t1 [T2 t2 p+]+ ]* E }^L
mas01mc@293 627 //
mas01mc@292 628
mas01mc@292 629 // Serial header constructors
mas01mc@292 630 SerialHeader::SerialHeader(){;}
mas01mc@296 631 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
mas01mc@292 632 lshMagic(O2_SERIAL_MAGIC),
mas01mc@292 633 binWidth(W),
mas01mc@292 634 numTables(L),
mas01mc@292 635 numRows(N),
mas01mc@292 636 numCols(C),
mas01mc@292 637 elementSize(O2_SERIAL_ELEMENT_SIZE),
mas01mc@296 638 version(O2_SERIAL_VERSION),
mas01mc@296 639 size(0), // we are deprecating this value
mas01mc@292 640 flags(FMT),
mas01mc@292 641 dataDim(d),
mas01mc@292 642 numFuns(k),
mas01mc@292 643 radius(r),
mas01mc@296 644 maxp(p),
mas01mc@296 645 size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables
mas01mc@296 646 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
mas01mc@296 647 (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
mas01mc@296 648 pointCount(pc){
mas01mc@296 649
mas01mc@296 650 if(FMT==O2_SERIAL_FILEFORMAT2)
mas01mc@296 651 size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE
mas01mc@296 652 + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T)
mas01mc@296 653 +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
mas01mc@296 654 } // header
mas01mc@292 655
mas01mc@292 656 float* G::get_serial_hashfunction_base(char* db){
mas01mc@292 657 if(db&&lshHeader)
mas01mc@292 658 return (float*)(db+O2_SERIAL_HEADER_SIZE);
mas01mc@292 659 else return NULL;
mas01mc@292 660 }
mas01mc@292 661
mas01mc@292 662 SerialElementT* G::get_serial_hashtable_base(char* db){
mas01mc@292 663 if(db&&lshHeader)
mas01mc@292 664 return (SerialElementT*)(db+get_serial_hashtable_offset());
mas01mc@292 665 else
mas01mc@292 666 return NULL;
mas01mc@292 667 }
mas01mc@292 668
mas01mc@292 669 Uns32T G::get_serial_hashtable_offset(){
mas01mc@292 670 if(lshHeader)
mas01mc@292 671 return align_up(O2_SERIAL_HEADER_SIZE +
mas01mc@292 672 L*lshHeader->numFuns*( sizeof(float)*lshHeader->dataDim+2*sizeof(Uns32T)+sizeof(float)),get_page_logn());
mas01mc@292 673 else
mas01mc@292 674 return 0;
mas01mc@292 675 }
mas01mc@292 676
mas01mc@292 677 void G::serialize(char* filename, Uns32T serialFormat){
mas01mc@292 678 int dbfid;
mas01mc@292 679 char* db;
mas01mc@292 680 int dbIsNew=0;
mas01mc@292 681
mas01mc@292 682 // Check requested serialFormat
mas01mc@292 683 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
mas01mc@292 684 error("Unrecognized serial file format request: ", "serialize()");
mas01mc@296 685
mas01mc@292 686 // Test to see if file exists
mas01mc@292 687 if((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 688 // If it doesn't, then create the file (CREATE)
mas01mc@292 689 if(errno == ENOENT){
mas01mc@292 690 // Create the file
mas01mc@292 691 std::cout << "Creating new serialized LSH database:" << filename << "...";
mas01mc@292 692 std::cout.flush();
mas01mc@292 693 serial_create(filename, serialFormat);
mas01mc@292 694 dbIsNew=1;
mas01mc@292 695 }
mas01mc@292 696 else
mas01mc@292 697 // The file can't be opened
mas01mc@292 698 error("Can't open the file", filename, "open");
mas01mc@292 699
mas01mc@292 700 // Load the on-disk header into core
mas01mc@292 701 dbfid = serial_open(filename, 1); // open for write
mas01mc@292 702 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
mas01mc@292 703 serial_get_header(db); // read header
mas01mc@292 704 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@292 705
mas01mc@292 706 // Check compatibility of core and disk data structures
mas01mc@292 707 if( !serial_can_merge(serialFormat) )
mas01mc@292 708 error("Incompatible core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 709
mas01mc@292 710 // For new LSH databases write the hashfunctions
mas01mc@292 711 if(dbIsNew)
mas01mc@292 712 serialize_lsh_hashfunctions(dbfid);
mas01mc@292 713 // Write the hashtables in the requested format
mas01mc@292 714 if(serialFormat == O2_SERIAL_FILEFORMAT1)
mas01mc@292 715 serialize_lsh_hashtables_format1(dbfid, !dbIsNew);
mas01mc@336 716 else{
mas01mc@336 717 FILE* dbFile = fdopen(dbfid, "r+b");
mas01mc@336 718 if(!dbFile)
mas01mc@336 719 error("Cannot open LSH file for writing",filename);
mas01mc@336 720 serialize_lsh_hashtables_format2(dbFile, !dbIsNew);
mas01mc@336 721 fflush(dbFile);
mas01mc@336 722 }
mas01mc@292 723
mas01mc@336 724 if(!dbIsNew) {
mas01mc@292 725 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
mas01mc@292 726 //serial_get_header(db); // read header
mas01mc@293 727 cout << "maxp = " << H::maxp << endl;
mas01mc@293 728 lshHeader->maxp=H::maxp;
mas01mc@292 729 // Default to FILEFORMAT1
mas01mc@292 730 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2))
mas01mc@294 731 lshHeader->flags|=O2_SERIAL_FILEFORMAT1;
mas01mc@292 732 memcpy((char*)db, (char*)lshHeader, sizeof(SerialHeaderT));
mas01mc@292 733 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@336 734 }
mas01mc@336 735 serial_close(dbfid);
mas01mc@292 736 }
mas01mc@292 737
mas01mc@292 738 // Test to see if core structure and requested format is
mas01mc@292 739 // compatible with currently opened database
mas01mc@292 740 int G::serial_can_merge(Uns32T format){
mas01mc@292 741 SerialHeaderT* that = lshHeader;
mas01mc@292 742 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2)
mas01mc@292 743 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
mas01mc@292 744 || !( this->w == that->binWidth &&
mas01mc@296 745 this->L == that->numTables &&
mas01mc@296 746 this->N == that->numRows &&
mas01mc@296 747 this->k == that->numFuns &&
mas01mc@296 748 this->d == that->dataDim &&
mas01mc@296 749 sizeof(SerialElementT) == that->elementSize &&
mas01mc@296 750 this->radius == that->radius)){
mas01mc@292 751 serial_print_header(format);
mas01mc@292 752 return 0;
mas01mc@292 753 }
mas01mc@292 754 else
mas01mc@292 755 return 1;
mas01mc@292 756 }
mas01mc@292 757
mas01mc@292 758 // Used as an error message for serial_can_merge()
mas01mc@292 759 void G::serial_print_header(Uns32T format){
mas01mc@292 760 std::cout << "Fc:" << format << " Fs:" << lshHeader->flags << endl;
mas01mc@292 761 std::cout << "Wc:" << w << " Ls:" << lshHeader->binWidth << endl;
mas01mc@292 762 std::cout << "Lc:" << L << " Ls:" << lshHeader->numTables << endl;
mas01mc@292 763 std::cout << "Nc:" << N << " Ns:" << lshHeader->numRows << endl;
mas01mc@292 764 std::cout << "kc:" << k << " ks:" << lshHeader->numFuns << endl;
mas01mc@292 765 std::cout << "dc:" << d << " ds:" << lshHeader->dataDim << endl;
mas01mc@292 766 std::cout << "sc:" << sizeof(SerialElementT) << " ss:" << lshHeader->elementSize << endl;
mas01mc@292 767 std::cout << "rc:" << this->radius << " rs:" << lshHeader->radius << endl;
mas01mc@292 768 }
mas01mc@292 769
mas01mc@292 770 int G::serialize_lsh_hashfunctions(int fid){
mas01mc@292 771 float* pf;
mas01mc@292 772 Uns32T *pu;
mas01mc@292 773 Uns32T x,y,z;
mas01mc@292 774
mas01mc@293 775 char* db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer
mas01mc@292 776 pf = get_serial_hashfunction_base(db);
mas01mc@292 777
mas01mc@292 778 // HASH FUNCTIONS
mas01mc@292 779 // Write the random projectors A[][][]
mas01mc@292 780 #ifdef USE_U_FUNCTIONS
mas01mc@292 781 for( x = 0 ; x < H::m ; x++ )
mas01mc@292 782 for( y = 0 ; y < H::k/2 ; y++ )
mas01mc@292 783 #else
mas01mc@292 784 for( x = 0 ; x < H::L ; x++ )
mas01mc@292 785 for( y = 0 ; y < H::k ; y++ )
mas01mc@292 786 #endif
mas01mc@292 787 for( z = 0 ; z < d ; z++ )
mas01mc@293 788 *pf++ = H::A[x][y][z];
mas01mc@292 789
mas01mc@292 790 // Write the random biases b[][]
mas01mc@292 791 #ifdef USE_U_FUNCTIONS
mas01mc@292 792 for( x = 0 ; x < H::m ; x++ )
mas01mc@292 793 for( y = 0 ; y < H::k/2 ; y++ )
mas01mc@292 794 #else
mas01mc@292 795 for( x = 0 ; x < H::L ; x++ )
mas01mc@292 796 for( y = 0 ; y < H::k ; y++ )
mas01mc@292 797 #endif
mas01mc@293 798 *pf++ = H::b[x][y];
mas01mc@292 799
mas01mc@292 800 pu = (Uns32T*)pf;
mas01mc@292 801
mas01mc@292 802 // Write the Z projectors r1[][]
mas01mc@292 803 for( x = 0 ; x < H::L ; x++)
mas01mc@292 804 for( y = 0 ; y < H::k ; y++)
mas01mc@293 805 *pu++ = H::r1[x][y];
mas01mc@292 806
mas01mc@292 807 // Write the Z projectors r2[][]
mas01mc@292 808 for( x = 0 ; x < H::L ; x++)
mas01mc@292 809 for( y = 0; y < H::k ; y++)
mas01mc@293 810 *pu++ = H::r2[x][y];
mas01mc@292 811
mas01mc@292 812 serial_munmap(db, get_serial_hashtable_offset());
mas01mc@292 813 return 1;
mas01mc@292 814 }
mas01mc@292 815
mas01mc@292 816 int G::serialize_lsh_hashtables_format1(int fid, int merge){
mas01mc@292 817 SerialElementT *pe, *pt;
mas01mc@292 818 Uns32T x,y;
mas01mc@292 819
mas01mc@292 820 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT1) )
mas01mc@292 821 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 822
mas01mc@292 823 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 824 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount;
mas01mc@292 825 // Write the hash tables
mas01mc@292 826 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 827 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT1...";
mas01mc@292 828 std::cout.flush();
mas01mc@292 829 // memory map a single hash table for sequential access
mas01mc@292 830 // Align each hash table to page boundary
mas01mc@292 831 char* dbtable = serial_mmap(fid, hashTableSize, 1,
mas01mc@292 832 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
mas01mc@324 833 #ifdef __CYGWIN__
mas01mc@324 834 // No madvise in CYGWIN
mas01mc@324 835 #else
mas01mc@292 836 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 837 error("could not advise hashtable memory","","madvise");
mas01mc@324 838 #endif
mas01mc@292 839 maxColCount=0;
mas01mc@292 840 minColCount=O2_SERIAL_MAX_COLS;
mas01mc@292 841 meanColCount=0;
mas01mc@292 842 colCountN=0;
mas01mc@292 843 pt=(SerialElementT*)dbtable;
mas01mc@292 844 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 845 // Move disk pointer to beginning of row
mas01mc@292 846 pe=pt+y*lshHeader->numCols;
mas01mc@292 847
mas01mc@292 848 colCount=0;
mas01mc@292 849 if(bucket* bPtr = h[x][y])
mas01mc@292 850 if(merge)
mas01mc@340 851 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@292 852 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 853 else
mas01mc@292 854 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 855 #else
mas01mc@292 856 serial_merge_hashtable_row_format1(pe, bPtr, colCount);
mas01mc@292 857 else
mas01mc@292 858 serial_write_hashtable_row_format1(pe, bPtr, colCount);
mas01mc@292 859 #endif
mas01mc@292 860 if(colCount){
mas01mc@292 861 if(colCount<minColCount)
mas01mc@292 862 minColCount=colCount;
mas01mc@292 863 if(colCount>maxColCount)
mas01mc@292 864 maxColCount=colCount;
mas01mc@292 865 meanColCount+=colCount;
mas01mc@292 866 colCountN++;
mas01mc@292 867 }
mas01mc@292 868 }
mas01mc@292 869 if(colCountN)
mas01mc@292 870 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
mas01mc@292 871 << ", min = " << minColCount << ", max = " << maxColCount
mas01mc@292 872 << endl;
mas01mc@292 873 serial_munmap(dbtable, hashTableSize);
mas01mc@292 874 }
mas01mc@292 875
mas01mc@292 876 // We're done writing
mas01mc@292 877 return 1;
mas01mc@292 878 }
mas01mc@292 879
mas01mc@292 880 void G::serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* b, Uns32T& colCount){
mas01mc@292 881 while(b && b->t2!=IFLAG){
mas01mc@292 882 SerialElementT*pe=pr; // reset disk pointer to beginning of row
mas01mc@292 883 serial_merge_element_format1(pe, b->snext, b->t2, colCount);
mas01mc@292 884 b=b->next;
mas01mc@292 885 }
mas01mc@292 886 }
mas01mc@292 887
mas01mc@292 888 void G::serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
mas01mc@292 889 while(sb){
mas01mc@292 890 if(colCount==lshHeader->numCols){
mas01mc@292 891 std::cout << "!point-chain full " << endl;
mas01mc@292 892 return;
mas01mc@292 893 }
mas01mc@292 894 Uns32T c=0;
mas01mc@292 895 // Merge collision chains
mas01mc@292 896 while(c<lshHeader->numCols){
mas01mc@292 897 if( (pe+c)->hashValue==IFLAG){
mas01mc@292 898 (pe+c)->hashValue=t2;
mas01mc@292 899 (pe+c)->pointID=sb->pointID;
mas01mc@292 900 colCount=c+1;
mas01mc@292 901 if(c+1<lshHeader->numCols)
mas01mc@292 902 (pe+c+1)->hashValue=IFLAG;
mas01mc@292 903 break;
mas01mc@292 904 }
mas01mc@292 905 c++;
mas01mc@292 906 }
mas01mc@292 907 sb=sb->snext;
mas01mc@292 908 }
mas01mc@292 909 return;
mas01mc@292 910 }
mas01mc@292 911
mas01mc@292 912 void G::serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* b, Uns32T& colCount){
mas01mc@292 913 pe->hashValue=IFLAG;
mas01mc@292 914 while(b && b->t2!=IFLAG){
mas01mc@292 915 serial_write_element_format1(pe, b->snext, b->t2, colCount);
mas01mc@292 916 b=b->next;
mas01mc@292 917 }
mas01mc@292 918 }
mas01mc@292 919
mas01mc@292 920 void G::serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
mas01mc@292 921 while(sb){
mas01mc@292 922 if(colCount==lshHeader->numCols){
mas01mc@292 923 std::cout << "!point-chain full " << endl;
mas01mc@292 924 return;
mas01mc@292 925 }
mas01mc@292 926 pe->hashValue=t2;
mas01mc@292 927 pe->pointID=sb->pointID;
mas01mc@292 928 pe++;
mas01mc@292 929 colCount++;
mas01mc@292 930 sb=sb->snext;
mas01mc@292 931 }
mas01mc@292 932 pe->hashValue=IFLAG;
mas01mc@292 933 return;
mas01mc@292 934 }
mas01mc@292 935
mas01mc@336 936 int G::serialize_lsh_hashtables_format2(FILE* dbFile, int merge){
mas01mc@292 937 Uns32T x,y;
mas01mc@292 938
mas01mc@292 939 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
mas01mc@292 940 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 941
mas01mc@340 942 // We must pereform FORMAT2 merges in core FORMAT1 (dynamic list structure)
mas01mc@292 943 if(merge)
mas01mc@340 944 unserialize_lsh_hashtables_format2(dbFile, merge);
mas01mc@292 945
mas01mc@292 946 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
mas01mc@336 947 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
mas01mc@336 948 fclose(dbFile);
mas01mc@336 949 error("fSeek error in serialize_lsh_hashtables_format2");
mas01mc@336 950 }
mas01mc@292 951
mas01mc@292 952 // Write the hash tables
mas01mc@292 953 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 954 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT2...";
mas01mc@292 955 std::cout.flush();
mas01mc@292 956 maxColCount=0;
mas01mc@292 957 minColCount=O2_SERIAL_MAX_COLS;
mas01mc@292 958 meanColCount=0;
mas01mc@292 959 colCountN=0;
mas01mc@292 960 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 961 colCount=0;
mas01mc@292 962 if(bucket* bPtr = h[x][y]){
mas01mc@306 963 // Check for empty row (even though row was allocated)
mas01mc@340 964 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@306 965 if(bPtr->next->t2==IFLAG){
mas01mc@336 966 fclose(dbFile);
mas01mc@306 967 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
mas01mc@306 968 }
mas01mc@306 969 #else
mas01mc@306 970 if(bPtr->t2==IFLAG){
mas01mc@336 971 fclose(dbFile);
mas01mc@306 972 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
mas01mc@306 973 }
mas01mc@306 974 #endif
mas01mc@306 975 t1 = O2_SERIAL_TOKEN_T1;
mas01mc@340 976 WRITE_UNS32(&t1, "[T1]");
mas01mc@306 977 t1 = y;
mas01mc@340 978 WRITE_UNS32(&t1, "[t1]");
mas01mc@340 979 #ifdef LSH_CORE_ARRAY
mas01mc@340 980 t1 = count_buckets_and_points_hashtable_row(bPtr);
mas01mc@340 981 WRITE_UNS32(&t1,"[count]"); // write numElements
mas01mc@340 982 #endif
mas01mc@340 983 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@336 984 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 985 #else
mas01mc@340 986 serial_write_hashtable_row_format2(dbFile, bPtr, colCount);
mas01mc@292 987 #endif
mas01mc@292 988 }
mas01mc@292 989 if(colCount){
mas01mc@292 990 if(colCount<minColCount)
mas01mc@292 991 minColCount=colCount;
mas01mc@292 992 if(colCount>maxColCount)
mas01mc@292 993 maxColCount=colCount;
mas01mc@292 994 meanColCount+=colCount;
mas01mc@292 995 colCountN++;
mas01mc@292 996 }
mas01mc@292 997 }
mas01mc@292 998 // Write END of table marker
mas01mc@306 999 t1 = O2_SERIAL_TOKEN_ENDTABLE;
mas01mc@340 1000 WRITE_UNS32(&t1,"[end]");
mas01mc@292 1001 if(colCountN)
mas01mc@292 1002 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
mas01mc@292 1003 << ", min = " << minColCount << ", max = " << maxColCount
mas01mc@292 1004 << endl;
mas01mc@340 1005 }
mas01mc@292 1006 // We're done writing
mas01mc@292 1007 return 1;
mas01mc@292 1008 }
mas01mc@292 1009
mas01mc@340 1010 Uns32T G::count_buckets_and_points_hashtable_row(bucket* bPtr){
mas01mc@340 1011 Uns32T total_count = 0;
mas01mc@340 1012 bucket* p = 0;
mas01mc@340 1013
mas01mc@340 1014 // count points
mas01mc@340 1015 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@340 1016 total_count = bPtr->t2; // points already counted
mas01mc@340 1017 p = bPtr->next;
mas01mc@340 1018 #else
mas01mc@340 1019 total_count = count_points_hashtable_row(bPtr);
mas01mc@340 1020 p = bPtr;
mas01mc@340 1021 #endif
mas01mc@340 1022
mas01mc@340 1023 // count buckets
mas01mc@340 1024 do{
mas01mc@340 1025 total_count++;
mas01mc@340 1026 }while((p=p->next));
mas01mc@340 1027
mas01mc@340 1028 return total_count;
mas01mc@340 1029 }
mas01mc@340 1030
mas01mc@340 1031 Uns32T G::count_points_hashtable_row(bucket* bPtr){
mas01mc@340 1032 Uns32T point_count = 0;
mas01mc@340 1033 bucket* p = bPtr;
mas01mc@340 1034 sbucket* s = 0;
mas01mc@340 1035 while(p){
mas01mc@340 1036 s = p->snext;
mas01mc@340 1037 while(s){
mas01mc@340 1038 point_count++;
mas01mc@340 1039 s=s->snext;
mas01mc@340 1040 }
mas01mc@340 1041 p=p->next;
mas01mc@340 1042 }
mas01mc@340 1043 return point_count;
mas01mc@340 1044 }
mas01mc@340 1045
mas01mc@336 1046 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
mas01mc@292 1047 while(b && b->t2!=IFLAG){
mas01mc@306 1048 if(!b->snext){
mas01mc@336 1049 fclose(dbFile);
mas01mc@306 1050 error("Empty collision chain in serial_write_hashtable_row_format2()");
mas01mc@306 1051 }
mas01mc@306 1052 t2 = O2_SERIAL_TOKEN_T2;
mas01mc@336 1053 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 1054 fclose(dbFile);
mas01mc@292 1055 error("write error in serial_write_hashtable_row_format2()");
mas01mc@292 1056 }
mas01mc@292 1057 t2 = b->t2;
mas01mc@336 1058 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 1059 fclose(dbFile);
mas01mc@292 1060 error("write error in serial_write_hashtable_row_format2()");
mas01mc@292 1061 }
mas01mc@336 1062 serial_write_element_format2(dbFile, b->snext, colCount);
mas01mc@292 1063 b=b->next;
mas01mc@292 1064 }
mas01mc@292 1065 }
mas01mc@292 1066
mas01mc@336 1067 void G::serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount){
mas01mc@292 1068 while(sb){
mas01mc@336 1069 if(fwrite(&sb->pointID, sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1070 fclose(dbFile);
mas01mc@292 1071 error("Write error in serial_write_element_format2()");
mas01mc@292 1072 }
mas01mc@292 1073 colCount++;
mas01mc@292 1074 sb=sb->snext;
mas01mc@292 1075 }
mas01mc@292 1076 }
mas01mc@292 1077
mas01mc@292 1078
mas01mc@292 1079 int G::serial_create(char* filename, Uns32T FMT){
mas01mc@292 1080 return serial_create(filename, w, L, N, C, k, d, FMT);
mas01mc@292 1081 }
mas01mc@292 1082
mas01mc@292 1083
mas01mc@292 1084 int G::serial_create(char* filename, float binWidth, Uns32T numTables, Uns32T numRows, Uns32T numCols,
mas01mc@292 1085 Uns32T numFuns, Uns32T dim, Uns32T FMT){
mas01mc@292 1086
mas01mc@292 1087 if(numTables > O2_SERIAL_MAX_TABLES || numRows > O2_SERIAL_MAX_ROWS
mas01mc@292 1088 || numCols > O2_SERIAL_MAX_COLS || numFuns > O2_SERIAL_MAX_FUNS
mas01mc@292 1089 || dim>O2_SERIAL_MAX_DIM){
mas01mc@292 1090 error("LSH parameters out of bounds for serialization");
mas01mc@292 1091 }
mas01mc@292 1092
mas01mc@292 1093 int dbfid;
mas01mc@292 1094 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0)
mas01mc@292 1095 error("Can't create serial file", filename, "open");
mas01mc@292 1096 get_lock(dbfid, 1);
mas01mc@292 1097
mas01mc@292 1098 // Make header first to get size of serialized database
mas01mc@296 1099 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);
mas01mc@296 1100
mas01mc@296 1101 cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
mas01mc@296 1102 if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
mas01mc@296 1103 error("Maximum size of LSH file exceded: > 4000MB");
mas01mc@296 1104
mas01mc@292 1105 // go to the location corresponding to the last byte
mas01mc@292 1106 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
mas01mc@292 1107 error("lseek error in db file", "", "lseek");
mas01mc@292 1108
mas01mc@292 1109 // write a dummy byte at the last location
mas01mc@292 1110 if (write (dbfid, "", 1) != 1)
mas01mc@292 1111 error("write error", "", "write");
mas01mc@292 1112
mas01mc@293 1113 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
mas01mc@296 1114
mas01mc@292 1115 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
mas01mc@296 1116
mas01mc@292 1117 serial_munmap(db, O2_SERIAL_HEADER_SIZE);
mas01mc@296 1118
mas01mc@292 1119 close(dbfid);
mas01mc@292 1120
mas01mc@292 1121 std::cout << "done initializing tables." << endl;
mas01mc@292 1122
mas01mc@292 1123 return 1;
mas01mc@292 1124 }
mas01mc@292 1125
mas01mc@292 1126 char* G::serial_mmap(int dbfid, Uns32T memSize, Uns32T forWrite, off_t offset){
mas01mc@293 1127 char* db;
mas01mc@292 1128 if(forWrite){
mas01mc@292 1129 if ((db = (char*) mmap(0, memSize, PROT_READ | PROT_WRITE,
mas01mc@292 1130 MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
mas01mc@292 1131 error("mmap error in request for writable serialized database", "", "mmap");
mas01mc@292 1132 }
mas01mc@292 1133 else if ((db = (char*) mmap(0, memSize, PROT_READ, MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
mas01mc@292 1134 error("mmap error in read-only serialized database", "", "mmap");
mas01mc@292 1135
mas01mc@292 1136 return db;
mas01mc@292 1137 }
mas01mc@292 1138
mas01mc@292 1139 SerialHeaderT* G::serial_get_header(char* db){
mas01mc@292 1140 lshHeader = new SerialHeaderT();
mas01mc@292 1141 memcpy((char*)lshHeader, db, sizeof(SerialHeaderT));
mas01mc@292 1142
mas01mc@292 1143 if(lshHeader->lshMagic!=O2_SERIAL_MAGIC)
mas01mc@292 1144 error("Not an LSH database file");
mas01mc@292 1145
mas01mc@292 1146 return lshHeader;
mas01mc@292 1147 }
mas01mc@292 1148
mas01mc@292 1149 void G::serial_munmap(char* db, Uns32T N){
mas01mc@292 1150 munmap(db, N);
mas01mc@292 1151 }
mas01mc@292 1152
mas01mc@292 1153 int G::serial_open(char* filename, int writeFlag){
mas01mc@292 1154 int dbfid;
mas01mc@292 1155 if(writeFlag){
mas01mc@292 1156 if ((dbfid = open (filename, O_RDWR)) < 0)
mas01mc@292 1157 error("Can't open serial file for read/write", filename, "open");
mas01mc@292 1158 get_lock(dbfid, writeFlag);
mas01mc@292 1159 }
mas01mc@292 1160 else{
mas01mc@292 1161 if ((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 1162 error("Can't open serial file for read", filename, "open");
mas01mc@292 1163 get_lock(dbfid, 0);
mas01mc@292 1164 }
mas01mc@292 1165
mas01mc@292 1166 return dbfid;
mas01mc@292 1167 }
mas01mc@292 1168
mas01mc@292 1169 void G::serial_close(int dbfid){
mas01mc@292 1170
mas01mc@292 1171 release_lock(dbfid);
mas01mc@292 1172 close(dbfid);
mas01mc@292 1173 }
mas01mc@292 1174
mas01mc@292 1175 int G::unserialize_lsh_header(char* filename){
mas01mc@292 1176
mas01mc@292 1177 int dbfid;
mas01mc@292 1178 char* db;
mas01mc@292 1179 // Test to see if file exists
mas01mc@292 1180 if((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 1181 error("Can't open the file", filename, "open");
mas01mc@292 1182 close(dbfid);
mas01mc@292 1183 dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1184 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1185 serial_get_header(db); // read header
mas01mc@292 1186 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@292 1187
mas01mc@292 1188 // Unserialize header parameters
mas01mc@292 1189 H::L = lshHeader->numTables;
mas01mc@292 1190 H::m = (Uns32T)( (1.0 + sqrt(1 + 8.0*(int)H::L)) / 2.0);
mas01mc@292 1191 H::N = lshHeader->numRows;
mas01mc@292 1192 H::C = lshHeader->numCols;
mas01mc@292 1193 H::k = lshHeader->numFuns;
mas01mc@292 1194 H::d = lshHeader->dataDim;
mas01mc@293 1195 H::w = lshHeader->binWidth;
mas01mc@293 1196 H::radius = lshHeader->radius;
mas01mc@293 1197 H::maxp = lshHeader->maxp;
mas01mc@296 1198 H::pointCount = lshHeader->pointCount;
mas01mc@292 1199
mas01mc@292 1200 return dbfid;
mas01mc@292 1201 }
mas01mc@292 1202
mas01mc@292 1203 // unserialize the LSH parameters
mas01mc@292 1204 // we leave the LSH tree on disk as a flat file
mas01mc@292 1205 // it is this flat file that we search by memory mapping
mas01mc@292 1206 void G::unserialize_lsh_functions(int dbfid){
mas01mc@292 1207 Uns32T j, kk;
mas01mc@292 1208 float* pf;
mas01mc@292 1209 Uns32T* pu;
mas01mc@292 1210
mas01mc@292 1211 // Load the hash functions into core
mas01mc@292 1212 char* db = serial_mmap(dbfid, get_serial_hashtable_offset(), 0);// get database pointer again
mas01mc@292 1213
mas01mc@292 1214 pf = get_serial_hashfunction_base(db);
mas01mc@292 1215
mas01mc@292 1216 #ifdef USE_U_FUNCTIONS
mas01mc@292 1217 for( j = 0 ; j < H::m ; j++ ){ // L functions gj(v)
mas01mc@292 1218 for( kk = 0 ; kk < H::k/2 ; kk++ ){ // Normally distributed hash functions
mas01mc@292 1219 #else
mas01mc@292 1220 for( j = 0 ; j < H::L ; j++ ){ // L functions gj(v)
mas01mc@292 1221 for( kk = 0 ; kk < H::k ; kk++ ){ // Normally distributed hash functions
mas01mc@292 1222 #endif
mas01mc@292 1223 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 1224 H::A[j][kk][i] = *pf++; // Normally distributed random vectors
mas01mc@292 1225 }
mas01mc@292 1226 }
mas01mc@292 1227 #ifdef USE_U_FUNCTIONS
mas01mc@292 1228 for( j = 0 ; j < H::m ; j++ ) // biases b
mas01mc@292 1229 for( kk = 0 ; kk < H::k/2 ; kk++ )
mas01mc@292 1230 #else
mas01mc@292 1231 for( j = 0 ; j < H::L ; j++ ) // biases b
mas01mc@292 1232 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1233 #endif
mas01mc@293 1234 H::b[j][kk] = *pf++;
mas01mc@292 1235
mas01mc@292 1236 pu = (Uns32T*)pf;
mas01mc@292 1237 for( j = 0 ; j < H::L ; j++ ) // Z projectors r1
mas01mc@292 1238 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1239 H::r1[j][kk] = *pu++;
mas01mc@292 1240
mas01mc@292 1241 for( j = 0 ; j < H::L ; j++ ) // Z projectors r2
mas01mc@292 1242 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1243 H::r2[j][kk] = *pu++;
mas01mc@292 1244
mas01mc@293 1245 serial_munmap(db, get_serial_hashtable_offset());
mas01mc@292 1246 }
mas01mc@292 1247
mas01mc@292 1248 void G::unserialize_lsh_hashtables_format1(int fid){
mas01mc@292 1249 SerialElementT *pe, *pt;
mas01mc@292 1250 Uns32T x,y;
mas01mc@292 1251 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1252 // Read the hash tables into core
mas01mc@292 1253 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 1254 // memory map a single hash table
mas01mc@292 1255 // Align each hash table to page boundary
mas01mc@292 1256 char* dbtable = serial_mmap(fid, hashTableSize, 0,
mas01mc@292 1257 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
mas01mc@324 1258 #ifdef __CYGWIN__
mas01mc@324 1259 // No madvise in CYGWIN
mas01mc@324 1260 #else
mas01mc@292 1261 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 1262 error("could not advise hashtable memory","","madvise");
mas01mc@324 1263 #endif
mas01mc@292 1264 pt=(SerialElementT*)dbtable;
mas01mc@292 1265 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 1266 // Move disk pointer to beginning of row
mas01mc@292 1267 pe=pt+y*lshHeader->numCols;
mas01mc@292 1268 unserialize_hashtable_row_format1(pe, h[x]+y);
mas01mc@340 1269 #ifdef LSH_DUMP_CORE_TABLES
mas01mc@292 1270 printf("S[%d,%d]", x, y);
mas01mc@292 1271 serial_bucket_dump(pe);
mas01mc@292 1272 printf("C[%d,%d]", x, y);
mas01mc@292 1273 dump_hashtable_row(h[x][y]);
mas01mc@292 1274 #endif
mas01mc@292 1275 }
mas01mc@292 1276 serial_munmap(dbtable, hashTableSize);
mas01mc@292 1277 }
mas01mc@292 1278 }
mas01mc@292 1279
mas01mc@292 1280 void G::unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b){
mas01mc@292 1281 Uns32T colCount = 0;
mas01mc@292 1282 while(colCount!=lshHeader->numCols && pe->hashValue !=IFLAG){
mas01mc@292 1283 H::p = pe->pointID; // current point ID
mas01mc@292 1284 t2 = pe->hashValue;
mas01mc@292 1285 bucket_insert_point(b);
mas01mc@292 1286 pe++;
mas01mc@292 1287 colCount++;
mas01mc@292 1288 }
mas01mc@292 1289 }
mas01mc@292 1290
mas01mc@340 1291 void G::unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge){
mas01mc@292 1292 Uns32T x=0,y=0;
mas01mc@292 1293
mas01mc@292 1294 // Seek to hashtable base offset
mas01mc@336 1295 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
mas01mc@336 1296 fclose(dbFile);
mas01mc@336 1297 error("fSeek error in unserialize_lsh_hashtables_format2");
mas01mc@292 1298 }
mas01mc@292 1299
mas01mc@292 1300 // Read the hash tables into core (structure is given in header)
mas01mc@292 1301 while( x < H::L){
mas01mc@336 1302 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1303 fclose(dbFile);
mas01mc@292 1304 error("Read error","unserialize_lsh_hashtables_format2()");
mas01mc@292 1305 }
mas01mc@306 1306 if(H::t1==O2_SERIAL_TOKEN_ENDTABLE)
mas01mc@292 1307 x++; // End of table
mas01mc@292 1308 else
mas01mc@292 1309 while(y < H::N){
mas01mc@306 1310 // Read a row and move file pointer to beginning of next row or _bittable
mas01mc@306 1311 if(!(H::t1==O2_SERIAL_TOKEN_T1)){
mas01mc@336 1312 fclose(dbFile);
mas01mc@306 1313 error("State matchine error T1","unserialize_lsh_hashtables_format2()");
mas01mc@292 1314 }
mas01mc@336 1315 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1316 fclose(dbFile);
mas01mc@306 1317 error("Read error: t1","unserialize_lsh_hashtables_format2()");
mas01mc@306 1318 }
mas01mc@306 1319 y = H::t1;
mas01mc@292 1320 if(y>=H::N){
mas01mc@336 1321 fclose(dbFile);
mas01mc@292 1322 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
mas01mc@292 1323 }
mas01mc@340 1324 Uns32T token = 0;
mas01mc@340 1325 #ifdef LSH_CORE_ARRAY
mas01mc@340 1326 Uns32T numElements;
mas01mc@340 1327 if(fread(&numElements, sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@340 1328 fclose(dbFile);
mas01mc@340 1329 error("Read error: numElements","unserialize_lsh_hashtables_format2()");
mas01mc@340 1330 }
mas01mc@292 1331
mas01mc@340 1332 // BACKWARD COMPATIBILITY: check to see if T2 or END token was read
mas01mc@340 1333 if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){
mas01mc@340 1334 forMerge=true; // Force use of dynamic linked list core format
mas01mc@340 1335 token = numElements;
mas01mc@340 1336 }
mas01mc@340 1337
mas01mc@340 1338 if(forMerge)
mas01mc@340 1339 // Use linked list CORE format
mas01mc@340 1340 token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token);
mas01mc@340 1341 else
mas01mc@340 1342 // Use ARRAY CORE format with numElements counter
mas01mc@340 1343 token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements);
mas01mc@340 1344 #else
mas01mc@340 1345 token = unserialize_hashtable_row_format2(dbFile, h[x]+y);
mas01mc@340 1346 #endif
mas01mc@340 1347
mas01mc@340 1348 #ifdef LSH_DUMP_CORE_TABLES
mas01mc@292 1349 printf("C[%d,%d]", x, y);
mas01mc@292 1350 dump_hashtable_row(h[x][y]);
mas01mc@292 1351 #endif
mas01mc@292 1352 // Check that token is valid
mas01mc@306 1353 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
mas01mc@336 1354 fclose(dbFile);
mas01mc@292 1355 error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()");
mas01mc@292 1356 }
mas01mc@292 1357 // Check for end of table flag
mas01mc@306 1358 if(token==O2_SERIAL_TOKEN_ENDTABLE){
mas01mc@292 1359 x++;
mas01mc@292 1360 break;
mas01mc@292 1361 }
mas01mc@292 1362 // Check for new row flag
mas01mc@306 1363 if(token==O2_SERIAL_TOKEN_T1)
mas01mc@292 1364 H::t1 = token;
mas01mc@292 1365 }
mas01mc@292 1366 }
mas01mc@306 1367 }
mas01mc@292 1368
mas01mc@340 1369 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token){
mas01mc@292 1370 bool pointFound = false;
mas01mc@340 1371
mas01mc@340 1372 if(token)
mas01mc@340 1373 H::t2 = token;
mas01mc@340 1374 else if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@340 1375 fclose(dbFile);
mas01mc@340 1376 error("Read error T2 token","unserialize_hashtable_row_format2");
mas01mc@340 1377 }
mas01mc@340 1378
mas01mc@306 1379 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
mas01mc@336 1380 fclose(dbFile);
mas01mc@292 1381 error("State machine error: expected E or T2");
mas01mc@292 1382 }
mas01mc@340 1383
mas01mc@306 1384 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
mas01mc@292 1385 pointFound=false;
mas01mc@292 1386 // Check for T2 token
mas01mc@306 1387 if(H::t2!=O2_SERIAL_TOKEN_T2){
mas01mc@336 1388 fclose(dbFile);
mas01mc@292 1389 error("State machine error T2 token", "unserialize_hashtable_row_format2()");
mas01mc@306 1390 }
mas01mc@292 1391 // Read t2 value
mas01mc@336 1392 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1393 fclose(dbFile);
mas01mc@292 1394 error("Read error t2","unserialize_hashtable_row_format2");
mas01mc@292 1395 }
mas01mc@336 1396 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1397 fclose(dbFile);
mas01mc@292 1398 error("Read error H::p","unserialize_hashtable_row_format2");
mas01mc@292 1399 }
mas01mc@306 1400 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
mas01mc@292 1401 pointFound=true;
mas01mc@292 1402 bucket_insert_point(b);
mas01mc@336 1403 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1404 fclose(dbFile);
mas01mc@292 1405 error("Read error H::p","unserialize_hashtable_row_format2");
mas01mc@292 1406 }
mas01mc@292 1407 }
mas01mc@292 1408 if(!pointFound)
mas01mc@292 1409 error("State machine error: point", "unserialize_hashtable_row_format2()");
mas01mc@306 1410 H::t2 = H::p; // Copy last found token to t2
mas01mc@292 1411 }
mas01mc@292 1412 return H::t2; // holds current token
mas01mc@292 1413 }
mas01mc@292 1414
mas01mc@340 1415 // Unserialize format2 hashtable row to a CORE_ARRAY pointed to
mas01mc@340 1416 // by the hashtable row pointer: rowPtr
mas01mc@340 1417 //
mas01mc@340 1418 // numElements is the total number of t2 buckets plus points
mas01mc@340 1419 // memory required is numElements+1+sizeof(hashtable ptr)
mas01mc@340 1420 //
mas01mc@340 1421 // numElements = numPoints + numBuckets
mas01mc@340 1422 //
mas01mc@340 1423 // During inserts (merges) new hashtable entries are appened at rowPtr+numPoints+numBuckets+1
mas01mc@340 1424 //
mas01mc@340 1425 // ASSUME: that LSH_LIST_HEAD_COUNTERS is set so that the first hashtable bucket is used to count
mas01mc@340 1426 // point and bucket entries
mas01mc@340 1427 //
mas01mc@340 1428 // We store the values of numPoints and numBuckets in separate fields of the first bucket
mas01mc@340 1429 // rowPtr->t2 // numPoints
mas01mc@340 1430 // (Uns32T)(rowPtr->snext) // numBuckets
mas01mc@340 1431 //
mas01mc@340 1432 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*))
mas01mc@340 1433 // To get to the fist bucket, we use
mas01mc@340 1434 //
mas01mc@340 1435
mas01mc@340 1436 #define READ_UNS32T(VAL,TOKENSTR) if(fread(VAL, sizeof(Uns32T), 1, dbFile) != 1){\
mas01mc@340 1437 fclose(dbFile);error("Read error unserialize_hashtable_format2",TOKENSTR);}
mas01mc@340 1438
mas01mc@340 1439 #define TEST_TOKEN(TEST, TESTSTR) if(TEST){fclose(dbFile);error("State machine error: ", TESTSTR);}
mas01mc@340 1440
mas01mc@340 1441 #define SKIP_BITS_LEFT_SHIFT_MSB (30)
mas01mc@340 1442
mas01mc@340 1443 #define SKIP_BITS_RIGHT_SHIFT_MSB (28)
mas01mc@340 1444 #define SKIP_BITS_RIGHT_SHIFT_LSB (30)
mas01mc@340 1445
mas01mc@340 1446 #define MAX_POINTS_IN_BUCKET_CORE_ARRAY (16)
mas01mc@340 1447 #define LSH_CORE_ARRAY_END_ROW_TOKEN (0xFFFFFFFD)
mas01mc@340 1448
mas01mc@340 1449 // Encode the skip bits. Zero if only one point, MAX 8 (plus first == 9)
mas01mc@340 1450 #define ENCODE_POINT_SKIP_BITS TEST_TOKEN(!numPointsThisBucket, "no points found");\
mas01mc@340 1451 if(numPointsThisBucket==1){\
mas01mc@340 1452 secondPtr=ap++;\
mas01mc@340 1453 *secondPtr=0;\
mas01mc@340 1454 numPoints++;\
mas01mc@340 1455 }\
mas01mc@340 1456 if(numPointsThisBucket>1){\
mas01mc@340 1457 *firstPtr |= ( (numPointsThisBucket-1) & 0x3 ) << SKIP_BITS_LEFT_SHIFT_MSB;\
mas01mc@340 1458 *secondPtr |= ( ( (numPointsThisBucket-1) & 0xC) >> 2 ) << SKIP_BITS_LEFT_SHIFT_MSB;}
mas01mc@340 1459
mas01mc@340 1460 Uns32T G::unserialize_hashtable_row_to_array(FILE* dbFile, bucket** rowPP, Uns32T numElements){
mas01mc@340 1461 Uns32T numPointsThisBucket = 0;
mas01mc@340 1462 Uns32T numBuckets = 0;
mas01mc@340 1463 Uns32T numPoints = 0;
mas01mc@340 1464 Uns32T* firstPtr = 0;
mas01mc@340 1465 Uns32T* secondPtr = 0;
mas01mc@340 1466
mas01mc@340 1467 // Initialize new row
mas01mc@340 1468 if(!*rowPP){
mas01mc@340 1469 *rowPP = new bucket();
mas01mc@340 1470 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@340 1471 (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row
mas01mc@340 1472 (*rowPP)->next = 0;
mas01mc@340 1473 #endif
mas01mc@340 1474 }
mas01mc@340 1475 bucket* rowPtr = *rowPP;
mas01mc@340 1476
mas01mc@340 1477 READ_UNS32T(&(H::t2),"t2");
mas01mc@340 1478 TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2");
mas01mc@340 1479 // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements
mas01mc@340 1480 // So over-allocate by a factor of two and realloc later to actual numElements
mas01mc@340 1481 CR_ASSERT(rowPtr->next = (bucket*) malloc((2*numElements+1)*sizeof(Uns32T)+sizeof(bucket**)));
mas01mc@340 1482 Uns32T* ap = reinterpret_cast<Uns32T*>(rowPtr->next); // Cast pointer to Uns32T* for array format
mas01mc@340 1483 while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){
mas01mc@340 1484 numPointsThisBucket = 0;// reset bucket point counter
mas01mc@340 1485 secondPtr = 0; // reset second-point pointer
mas01mc@340 1486 TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");
mas01mc@340 1487 READ_UNS32T(&(H::t2), "Read error t2");
mas01mc@340 1488 *ap++ = H::t2; // Insert t2 value into array
mas01mc@340 1489 numBuckets++;
mas01mc@340 1490 READ_UNS32T(&(H::p), "Read error H::p");
mas01mc@340 1491 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
mas01mc@340 1492 if(numPointsThisBucket==MAX_POINTS_IN_BUCKET_CORE_ARRAY){
mas01mc@340 1493 ENCODE_POINT_SKIP_BITS;
mas01mc@340 1494 *ap++ = H::t2; // Extra element
mas01mc@340 1495 numBuckets++; // record this as a new bucket
mas01mc@340 1496 numPointsThisBucket=0; // reset bucket point counter
mas01mc@340 1497 secondPtr = 0; // reset second-point pointer
mas01mc@340 1498 }
mas01mc@340 1499 if( ++numPointsThisBucket == 1 )
mas01mc@340 1500 firstPtr = ap; // store pointer to first point to insert skip bits later on
mas01mc@340 1501 else if( numPointsThisBucket == 2 )
mas01mc@340 1502 secondPtr = ap; // store pointer to first point to insert skip bits later on
mas01mc@340 1503 numPoints++;
mas01mc@340 1504 *ap++ = H::p;
mas01mc@340 1505 READ_UNS32T(&(H::p), "Read error H::p");
mas01mc@340 1506 }
mas01mc@340 1507 ENCODE_POINT_SKIP_BITS;
mas01mc@340 1508 H::t2 = H::p; // Copy last found token to t2
mas01mc@340 1509 }
mas01mc@340 1510 // Reallocate the row to its actual size
mas01mc@340 1511 CR_ASSERT(rowPtr->next = (bucket*) realloc(rowPtr->next, (numBuckets+numPoints+1)*sizeof(Uns32T)+sizeof(bucket**)));
mas01mc@340 1512 // Record the sizes at the head of the row
mas01mc@340 1513 rowPtr->snext = (sbucket*) numBuckets;
mas01mc@340 1514 rowPtr->t2 = numPoints;
mas01mc@340 1515 // Place end of row marker
mas01mc@340 1516 *ap++ = LSH_CORE_ARRAY_END_ROW_TOKEN;
mas01mc@340 1517 // Set the LSH_CORE_ARRAY_BIT to identify data structure for insertion and retrieval
mas01mc@340 1518 rowPtr->t2 |= LSH_CORE_ARRAY_BIT;
mas01mc@340 1519 // Allocate a new dynamic list head at the end of the array
mas01mc@340 1520 bucket** listPtr = reinterpret_cast<bucket**> (ap);
mas01mc@340 1521 *listPtr = 0;
mas01mc@340 1522 // Return current token
mas01mc@340 1523 return H::t2; // return H::t2 which holds current token [E or T1]
mas01mc@340 1524 }
mas01mc@340 1525
mas01mc@340 1526
mas01mc@340 1527
mas01mc@340 1528 // *p is a pointer to the beginning of a hashtable row array
mas01mc@340 1529 // The array consists of t2 hash keys and one or more point identifiers p for each hash key
mas01mc@340 1530 // Retrieval is performed by generating a hash key query_t2 for query point q
mas01mc@340 1531 // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry
mas01mc@340 1532 // point for retrieve_from_core_hashtable_array
mas01mc@340 1533 #define SKIP_BITS (0xC0000000)
mas01mc@340 1534 void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){
mas01mc@340 1535 Uns32T skip;
mas01mc@340 1536 Uns32T t2;
mas01mc@340 1537 Uns32T p1;
mas01mc@340 1538 Uns32T p2;
mas01mc@340 1539
mas01mc@340 1540 CR_ASSERT(p);
mas01mc@340 1541
mas01mc@340 1542 do{
mas01mc@340 1543 t2 = *p++;
mas01mc@340 1544 if( t2 > H::t2 )
mas01mc@340 1545 return;
mas01mc@340 1546 p1 = *p++;
mas01mc@340 1547 p2 = *p++;
mas01mc@340 1548 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
mas01mc@340 1549 if( t2 == H::t2 ){
mas01mc@340 1550 add_point_callback(calling_instance, p1 ^ (p1 & SKIP_BITS), qpos, radius);
mas01mc@340 1551 if(skip--){
mas01mc@340 1552 add_point_callback(calling_instance, p2 ^ (p2 & SKIP_BITS), qpos, radius);
mas01mc@340 1553 while(skip-- )
mas01mc@340 1554 add_point_callback(calling_instance, *p++, qpos, radius);
mas01mc@340 1555 }
mas01mc@340 1556 }
mas01mc@340 1557 else
mas01mc@340 1558 if(*p != LSH_CORE_ARRAY_END_ROW_TOKEN)
mas01mc@340 1559 p = p + skip;
mas01mc@340 1560 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
mas01mc@340 1561 }
mas01mc@340 1562
mas01mc@292 1563 void G::dump_hashtable_row(bucket* p){
mas01mc@292 1564 while(p && p->t2!=IFLAG){
mas01mc@292 1565 sbucket* sbp = p->snext;
mas01mc@292 1566 while(sbp){
mas01mc@292 1567 printf("(%0X,%u)", p->t2, sbp->pointID);
mas01mc@292 1568 fflush(stdout);
mas01mc@292 1569 sbp=sbp->snext;
mas01mc@292 1570 }
mas01mc@292 1571 p=p->next;
mas01mc@292 1572 }
mas01mc@292 1573 printf("\n");
mas01mc@292 1574 }
mas01mc@292 1575
mas01mc@292 1576
mas01mc@292 1577 // G::serial_retrieve_point( ... )
mas01mc@292 1578 // retrieves (pointID) from a serialized LSH database
mas01mc@292 1579 //
mas01mc@292 1580 // inputs:
mas01mc@292 1581 // filename - file name of serialized LSH database
mas01mc@292 1582 // vv - query point set
mas01mc@292 1583 //
mas01mc@292 1584 // outputs:
mas01mc@292 1585 // inserts retrieved points into add_point() callback method
mas01mc@292 1586 void G::serial_retrieve_point_set(char* filename, vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller)
mas01mc@292 1587 {
mas01mc@292 1588 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1589 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1590 serial_get_header(dbheader); // read header
mas01mc@292 1591 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1592
mas01mc@292 1593 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
mas01mc@336 1594 serial_close(dbfid);
mas01mc@292 1595 error("serial_retrieve_point_set is for SERIAL_FILEFORMAT1 only");
mas01mc@292 1596 }
mas01mc@292 1597
mas01mc@292 1598 // size of each hash table
mas01mc@292 1599 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1600 calling_instance = caller; // class instance variable used in ...bucket_chain_point()
mas01mc@292 1601 add_point_callback = add_point;
mas01mc@292 1602
mas01mc@292 1603 for(Uns32T j=0; j<L; j++){
mas01mc@292 1604 // memory map a single hash table for random access
mas01mc@292 1605 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1606 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1607 #ifdef __CYGWIN__
mas01mc@324 1608 // No madvise in CYGWIN
mas01mc@324 1609 #else
mas01mc@292 1610 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
mas01mc@292 1611 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1612 #endif
mas01mc@292 1613 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@292 1614 for(Uns32T qpos=0; qpos<vv.size(); qpos++){
mas01mc@293 1615 H::compute_hash_functions(vv[qpos]);
mas01mc@293 1616 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
mas01mc@292 1617 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
mas01mc@292 1618 }
mas01mc@292 1619 serial_munmap(db, hashTableSize); // drop hashtable mmap
mas01mc@292 1620 }
mas01mc@292 1621 serial_close(dbfid);
mas01mc@292 1622 }
mas01mc@292 1623
mas01mc@292 1624 void G::serial_retrieve_point(char* filename, vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 1625 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1626 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1627 serial_get_header(dbheader); // read header
mas01mc@292 1628 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1629
mas01mc@292 1630 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
mas01mc@336 1631 serial_close(dbfid);
mas01mc@292 1632 error("serial_retrieve_point is for SERIAL_FILEFORMAT1 only");
mas01mc@292 1633 }
mas01mc@292 1634
mas01mc@292 1635 // size of each hash table
mas01mc@292 1636 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1637 calling_instance = caller;
mas01mc@292 1638 add_point_callback = add_point;
mas01mc@293 1639 H::compute_hash_functions(v);
mas01mc@292 1640 for(Uns32T j=0; j<L; j++){
mas01mc@292 1641 // memory map a single hash table for random access
mas01mc@292 1642 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1643 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1644 #ifdef __CYGWIN__
mas01mc@324 1645 // No madvise in CYGWIN
mas01mc@324 1646 #else
mas01mc@292 1647 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
mas01mc@292 1648 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1649 #endif
mas01mc@292 1650 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@293 1651 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
mas01mc@292 1652 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
mas01mc@292 1653 serial_munmap(db, hashTableSize); // drop hashtable mmap
mas01mc@292 1654 }
mas01mc@292 1655 serial_close(dbfid);
mas01mc@292 1656 }
mas01mc@292 1657
mas01mc@292 1658 void G::serial_dump_tables(char* filename){
mas01mc@292 1659 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1660 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1661 serial_get_header(dbheader); // read header
mas01mc@292 1662 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1663 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1664 for(Uns32T j=0; j<L; j++){
mas01mc@292 1665 // memory map a single hash table for random access
mas01mc@292 1666 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1667 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1668 #ifdef __CYGWIN__
mas01mc@324 1669 // No madvise in CYGWIN
mas01mc@324 1670 #else
mas01mc@292 1671 if(madvise(db, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 1672 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1673 #endif
mas01mc@292 1674 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@292 1675 printf("*********** TABLE %d ***************\n", j);
mas01mc@292 1676 fflush(stdout);
mas01mc@292 1677 int count=0;
mas01mc@292 1678 do{
mas01mc@292 1679 printf("[%d,%d]", j, count++);
mas01mc@292 1680 fflush(stdout);
mas01mc@292 1681 serial_bucket_dump(pe);
mas01mc@292 1682 pe+=lshHeader->numCols;
mas01mc@292 1683 }while(pe<(SerialElementT*)db+lshHeader->numRows*lshHeader->numCols);
mas01mc@292 1684 }
mas01mc@292 1685
mas01mc@292 1686 }
mas01mc@292 1687
mas01mc@292 1688 void G::serial_bucket_dump(SerialElementT* pe){
mas01mc@292 1689 SerialElementT* pend = pe+lshHeader->numCols;
mas01mc@292 1690 while( !(pe->hashValue==IFLAG || pe==pend ) ){
mas01mc@292 1691 printf("(%0X,%u)",pe->hashValue,pe->pointID);
mas01mc@292 1692 pe++;
mas01mc@292 1693 }
mas01mc@292 1694 printf("\n");
mas01mc@292 1695 fflush(stdout);
mas01mc@292 1696 }
mas01mc@292 1697
mas01mc@292 1698 void G::serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos){
mas01mc@292 1699 SerialElementT* pend = pe+lshHeader->numCols;
mas01mc@292 1700 while( !(pe->hashValue==IFLAG || pe==pend ) ){
mas01mc@292 1701 if(pe->hashValue==t2){ // new match
mas01mc@292 1702 add_point_callback(calling_instance, pe->pointID, qpos, radius);
mas01mc@292 1703 }
mas01mc@292 1704 pe++;
mas01mc@292 1705 }
mas01mc@292 1706 }
mas01mc@292 1707
mas01mc@292 1708 void G::bucket_chain_point(bucket* p, Uns32T qpos){
mas01mc@292 1709 if(!p || p->t2==IFLAG)
mas01mc@292 1710 return;
mas01mc@292 1711 if(p->t2==t2){ // match
mas01mc@292 1712 sbucket_chain_point(p->snext, qpos); // add to reporter
mas01mc@292 1713 }
mas01mc@292 1714 if(p->next){
mas01mc@292 1715 bucket_chain_point(p->next, qpos); // recurse
mas01mc@292 1716 }
mas01mc@292 1717 }
mas01mc@292 1718
mas01mc@292 1719 void G::sbucket_chain_point(sbucket* p, Uns32T qpos){
mas01mc@292 1720 add_point_callback(calling_instance, p->pointID, qpos, radius);
mas01mc@292 1721 if(p->snext){
mas01mc@292 1722 sbucket_chain_point(p->snext, qpos);
mas01mc@292 1723 }
mas01mc@292 1724 }
mas01mc@292 1725
mas01mc@292 1726 void G::get_lock(int fd, bool exclusive) {
mas01mc@292 1727 struct flock lock;
mas01mc@292 1728 int status;
mas01mc@292 1729 lock.l_type = exclusive ? F_WRLCK : F_RDLCK;
mas01mc@292 1730 lock.l_whence = SEEK_SET;
mas01mc@292 1731 lock.l_start = 0;
mas01mc@292 1732 lock.l_len = 0; /* "the whole file" */
mas01mc@292 1733 retry:
mas01mc@292 1734 do {
mas01mc@292 1735 status = fcntl(fd, F_SETLKW, &lock);
mas01mc@292 1736 } while (status != 0 && errno == EINTR);
mas01mc@292 1737 if (status) {
mas01mc@292 1738 if (errno == EAGAIN) {
mas01mc@292 1739 sleep(1);
mas01mc@292 1740 goto retry;
mas01mc@292 1741 } else {
mas01mc@292 1742 error("fcntl lock error", "", "fcntl");
mas01mc@292 1743 }
mas01mc@292 1744 }
mas01mc@292 1745 }
mas01mc@292 1746
mas01mc@292 1747 void G::release_lock(int fd) {
mas01mc@292 1748 struct flock lock;
mas01mc@292 1749 int status;
mas01mc@292 1750
mas01mc@292 1751 lock.l_type = F_UNLCK;
mas01mc@292 1752 lock.l_whence = SEEK_SET;
mas01mc@292 1753 lock.l_start = 0;
mas01mc@292 1754 lock.l_len = 0;
mas01mc@292 1755
mas01mc@292 1756 status = fcntl(fd, F_SETLKW, &lock);
mas01mc@292 1757
mas01mc@292 1758 if (status)
mas01mc@292 1759 error("fcntl unlock error", "", "fcntl");
mas01mc@292 1760 }