annotate lshlib.cpp @ 601:82d23418d867

Fix some fd leaks in the command-line binary Strictly speaking, they're not really leaks, because the only codepath that suffers from these leaks exits immediately afterwards. On the other hand, this fix makes valgrind on e.g. tests/0025 happier, going from 5 errors to none.
author mas01cr
date Fri, 14 Aug 2009 16:39:32 +0000
parents 9119f2fa3efe
children 4b79043f90ba
rev   line source
mas01cr@589 1 #include <vector>
mas01cr@589 2 #include <queue>
mas01cr@589 3 #include <stdio.h>
mas01cr@589 4 #include <stdlib.h>
mas01cr@589 5 #include <sys/types.h>
mas01cr@589 6 #include <sys/stat.h>
mas01cr@589 7 #include <sys/mman.h>
mas01cr@589 8 #include <fcntl.h>
mas01cr@589 9 #include <string.h>
mas01cr@589 10 #include <iostream>
mas01cr@589 11 #include <fstream>
mas01cr@589 12 #include <math.h>
mas01cr@589 13 #include <sys/time.h>
mas01cr@589 14 #include <assert.h>
mas01cr@589 15 #include <float.h>
mas01cr@589 16 #include <signal.h>
mas01cr@589 17 #include <time.h>
mas01cr@589 18 #include <limits.h>
mas01cr@589 19 #include <errno.h>
mas01cr@589 20 #ifdef MT19937
mas01cr@589 21 #include "mt19937/mt19937ar.h"
mas01cr@589 22 #endif
mas01cr@589 23
mas01mc@292 24 #include "lshlib.h"
mas01mc@292 25
mas01mc@292 26 void err(char*s){cout << s << endl;exit(2);}
mas01mc@292 27
mas01mc@292 28 Uns32T get_page_logn(){
mas01mc@292 29 int pagesz = (int)sysconf(_SC_PAGESIZE);
mas01mc@292 30 return (Uns32T)log2((double)pagesz);
mas01mc@292 31 }
mas01mc@292 32
mas01mc@292 33 void H::error(const char* a, const char* b, const char *sysFunc) {
mas01mc@292 34 cerr << a << ": " << b << endl;
mas01mc@292 35 if (sysFunc) {
mas01mc@292 36 perror(sysFunc);
mas01mc@292 37 }
mas01mc@292 38 exit(1);
mas01mc@292 39 }
mas01mc@292 40
mas01mc@293 41 H::H(){
mas01mc@293 42 // Delay initialization of lsh functions until we know the parameters
mas01mc@293 43 }
mas01mc@293 44
mas01mc@293 45 H::H(Uns32T kk, Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float ww, float rr):
mas01mc@292 46 #ifdef USE_U_FUNCTIONS
mas01mc@292 47 use_u_functions(true),
mas01mc@292 48 #else
mas01mc@292 49 use_u_functions(false),
mas01mc@292 50 #endif
mas01mc@293 51 maxp(0),
mas01mc@292 52 bucketCount(0),
mas01mc@292 53 pointCount(0),
mas01mc@292 54 N(NN),
mas01mc@292 55 C(CC),
mas01mc@292 56 k(kk),
mas01mc@292 57 m(mm),
mas01mc@293 58 L((mm*(mm-1))/2),
mas01mc@293 59 d(dd),
mas01mc@293 60 w(ww),
mas01mc@293 61 radius(rr)
mas01mc@292 62 {
mas01mc@292 63
mas01mc@292 64 if(m<2){
mas01mc@292 65 m=2;
mas01mc@292 66 L=1; // check value of L
mas01mc@292 67 cout << "warning: setting m=2, L=1" << endl;
mas01mc@292 68 }
mas01mc@292 69 if(use_u_functions && k%2){
mas01mc@292 70 k++; // make sure k is even
mas01mc@292 71 cout << "warning: setting k even" << endl;
mas01mc@292 72 }
mas01mc@293 73
mas01mc@293 74 // We have the necessary parameters, so construct hashfunction datastructures
mas01mc@293 75 initialize_lsh_functions();
mas01mc@292 76 }
mas01mc@292 77
mas01mc@293 78 void H::initialize_lsh_functions(){
mas01mc@292 79 H::P = UH_PRIME_DEFAULT;
mas01mc@292 80
mas01mc@292 81 /* FIXME: don't use time(); instead use /dev/random or similar */
mas01mc@292 82 /* FIXME: write out the seed somewhere, so that we can get
mas01mc@292 83 repeatability */
mas01mc@292 84 #ifdef MT19937
mas01mc@292 85 init_genrand(time(NULL));
mas01mc@292 86 #else
mas01mc@292 87 srand(time(NULL)); // seed random number generator
mas01mc@292 88 #endif
mas01mc@293 89 Uns32T i,j, kk;
mas01mc@293 90 #ifdef USE_U_FUNCTIONS
mas01mc@293 91 H::A = new float**[ H::m ]; // m x k x d random projectors
mas01mc@293 92 H::b = new float*[ H::m ]; // m x k random biases
mas01mc@293 93 #else
mas01mc@293 94 H::A = new float**[ H::L ]; // m x k x d random projectors
mas01mc@293 95 H::b = new float*[ H::L ]; // m x k random biases
mas01mc@293 96 #endif
mas01mc@293 97 H::g = new Uns32T*[ H::L ]; // L x k random projections
mas01mc@293 98 assert( H::g && H::A && H::b ); // failure
mas01mc@293 99 #ifdef USE_U_FUNCTIONS
mas01mc@293 100 // Use m \times u_i functions \in R^{(k/2) \times (d)}
mas01mc@293 101 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
mas01mc@293 102 for( j = 0; j < H::m ; j++ ){ // m functions u_i(v)
mas01mc@293 103 H::A[j] = new float*[ H::k/2 ]; // k/2 x d 2-stable distribution coefficients
mas01mc@293 104 H::b[j] = new float[ H::k/2 ]; // bias
mas01mc@293 105 assert( H::A[j] && H::b[j] ); // failure
mas01mc@293 106 for( kk = 0; kk < H::k/2 ; kk++ ){
mas01mc@293 107 H::A[j][kk] = new float[ H::d ];
mas01mc@293 108 assert( H::A[j][kk] ); // failure
mas01mc@293 109 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 110 H::A[j][kk][i] = H::randn(); // Normal
mas01mc@293 111 H::b[j][kk] = H::ranf()*H::w; // Uniform
mas01mc@293 112 }
mas01mc@293 113 }
mas01mc@293 114 #else
mas01mc@293 115 // Use m \times u_i functions \in R^{k \times (d)}
mas01mc@293 116 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
mas01mc@293 117 for( j = 0; j < H::L ; j++ ){ // m functions u_i(v)
mas01mc@293 118 H::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients
mas01mc@293 119 H::b[j] = new float[ H::k ]; // bias
mas01mc@293 120 assert( H::A[j] && H::b[j] ); // failure
mas01mc@293 121 for( kk = 0; kk < H::k ; kk++ ){
mas01mc@293 122 H::A[j][kk] = new float[ H::d ];
mas01mc@293 123 assert( H::A[j][kk] ); // failure
mas01mc@293 124 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 125 H::A[j][kk][i] = H::randn(); // Normal
mas01mc@293 126 H::b[j][kk] = H::ranf()*H::w; // Uniform
mas01mc@293 127 }
mas01mc@293 128 }
mas01mc@293 129 #endif
mas01mc@293 130
mas01mc@293 131 // Storage for LSH hash function output (Uns32T)
mas01mc@293 132 for( j = 0 ; j < H::L ; j++ ){ // L functions g_j(u_a, u_b) a,b \in nchoosek(m,2)
mas01mc@293 133 H::g[j] = new Uns32T[ H::k ]; // k x 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z
mas01mc@293 134 assert( H::g[j] );
mas01mc@292 135 }
mas01mc@292 136
mas01mc@293 137 // LSH Hash tables
mas01mc@293 138 H::h = new bucket**[ H::L ];
mas01mc@293 139 assert( H::h );
mas01mc@292 140 for( j = 0 ; j < H::L ; j++ ){
mas01mc@292 141 H::h[j] = new bucket*[ H::N ];
mas01mc@292 142 assert( H::h[j] );
mas01mc@292 143 for( i = 0 ; i < H::N ; i++)
mas01mc@292 144 H::h[j][i] = 0;
mas01mc@292 145 }
mas01mc@293 146
mas01mc@293 147 // Standard hash functions
mas01mc@293 148 H::r1 = new Uns32T*[ H::L ];
mas01mc@293 149 H::r2 = new Uns32T*[ H::L ];
mas01mc@293 150 assert( H::r1 && H::r2 ); // failure
mas01mc@293 151 for( j = 0 ; j < H::L ; j++ ){
mas01mc@293 152 H::r1[ j ] = new Uns32T[ H::k ];
mas01mc@293 153 H::r2[ j ] = new Uns32T[ H::k ];
mas01mc@293 154 assert( H::r1[j] && H::r2[j] ); // failure
mas01mc@293 155 for( i = 0; i<H::k; i++){
mas01mc@293 156 H::r1[j][i] = randr();
mas01mc@293 157 H::r2[j][i] = randr();
mas01mc@293 158 }
mas01mc@293 159 }
mas01mc@293 160
mas01mc@534 161 // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS
mas01mc@293 162 H::initialize_partial_functions();
mas01mc@293 163 }
mas01mc@293 164
mas01mc@293 165 void H::initialize_partial_functions(){
mas01mc@293 166
mas01mc@293 167 #ifdef USE_U_FUNCTIONS
mas01mc@293 168 H::uu = vector<vector<Uns32T> >(H::m);
mas01mc@293 169 for( Uns32T aa=0 ; aa < H::m ; aa++ )
mas01mc@293 170 H::uu[aa] = vector<Uns32T>( H::k/2 );
mas01mc@293 171 #endif
mas01mc@293 172 }
mas01mc@293 173
mas01mc@293 174
mas01mc@293 175 // Generate z ~ N(0,1)
mas01mc@293 176 float H::randn(){
mas01mc@293 177 // Box-Muller
mas01mc@293 178 float x1, x2;
mas01mc@293 179 do{
mas01mc@293 180 x1 = ranf();
mas01mc@293 181 } while (x1 == 0); // cannot take log of 0
mas01mc@293 182 x2 = ranf();
mas01mc@293 183 float z;
mas01mc@293 184 z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2);
mas01mc@293 185 return z;
mas01mc@293 186 }
mas01mc@293 187
mas01mc@293 188 float H::ranf(){
mas01mc@293 189 #ifdef MT19937
mas01mc@293 190 return (float) genrand_real2();
mas01mc@293 191 #else
mas01mc@293 192 return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) );
mas01mc@293 193 #endif
mas01mc@293 194 }
mas01mc@293 195
mas01mc@293 196 // range is 1..2^29
mas01mc@293 197 /* FIXME: that looks like an ... odd range. Still. */
mas01mc@293 198 Uns32T H::randr(){
mas01mc@293 199 #ifdef MT19937
mas01mc@293 200 return (Uns32T)((genrand_int32() >> 3) + 1);
mas01mc@293 201 #else
mas01mc@293 202 return (Uns32T) ((rand() >> 2) + 1);
mas01mc@293 203 #endif
mas01mc@292 204 }
mas01mc@292 205
mas01mc@292 206 // Destruct hash tables
mas01mc@292 207 H::~H(){
mas01mc@293 208 Uns32T i,j,kk;
mas01mc@340 209 bucket** pp;
mas01mc@293 210 #ifdef USE_U_FUNCTIONS
mas01mc@293 211 for( j = 0 ; j < H::m ; j++ ){
mas01mc@293 212 for( kk = 0 ; kk < H::k/2 ; kk++ )
mas01mc@293 213 delete[] A[j][kk];
mas01mc@293 214 delete[] A[j];
mas01mc@293 215 }
mas01mc@293 216 delete[] A;
mas01mc@293 217 for( j = 0 ; j < H::m ; j++ )
mas01mc@293 218 delete[] b[j];
mas01mc@293 219 delete[] b;
mas01mc@293 220 #else
mas01mc@293 221 for( j = 0 ; j < H::L ; j++ ){
mas01mc@293 222 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@293 223 delete[] A[j][kk];
mas01mc@293 224 delete[] A[j];
mas01mc@293 225 }
mas01mc@293 226 delete[] A;
mas01mc@293 227 for( j = 0 ; j < H::L ; j++ )
mas01mc@293 228 delete[] b[j];
mas01mc@293 229 delete[] b;
mas01mc@293 230 #endif
mas01mc@293 231
mas01mc@293 232 for( j = 0 ; j < H::L ; j++ )
mas01mc@293 233 delete[] g[j];
mas01mc@293 234 delete[] g;
mas01mc@292 235 for( j=0 ; j < H::L ; j++ ){
mas01mc@292 236 delete[] H::r1[ j ];
mas01mc@292 237 delete[] H::r2[ j ];
mas01mc@340 238 for(i = 0; i< H::N ; i++){
mas01mc@340 239 pp = 0;
mas01mc@340 240 #ifdef LSH_CORE_ARRAY
mas01mc@340 241 if(H::h[ j ][ i ])
mas01mc@340 242 if(H::h[ j ][ i ]->t2 & LSH_CORE_ARRAY_BIT){
mas01mc@340 243 pp = get_pointer_to_bucket_linked_list(H::h[ j ][ i ]);
mas01mc@340 244 if(*pp){
mas01mc@344 245 (*pp)->snext.ptr=0; // Head of list uses snext as a non-pointer value
mas01mc@340 246 delete *pp; // Now the destructor can do its work properly
mas01mc@340 247 }
mas01mc@340 248 free(H::h[ j ][ i ]->next);
mas01mc@340 249 H::h[ j ][ i ]->next = 0; // Zero next pointer
mas01mc@344 250 H::h[ j ][ i ]->snext.ptr = 0; // Zero head-of-list pointer as above
mas01mc@340 251 }
mas01mc@340 252 #endif
mas01mc@292 253 delete H::h[ j ][ i ];
mas01mc@340 254 }
mas01mc@292 255 delete[] H::h[ j ];
mas01mc@292 256 }
mas01mc@474 257 delete[] H::r1;
mas01mc@474 258 delete[] H::r2;
mas01mc@474 259 delete[] H::h;
mas01mc@292 260 }
mas01mc@292 261
mas01mc@292 262
mas01mc@293 263 // Compute all hash functions for vector v
mas01mc@293 264 // #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d}
mas01mc@293 265 // to make L \times g_j functions \in Z^k
mas01mc@293 266 void H::compute_hash_functions(vector<float>& v){ // v \in R^d
mas01mc@293 267 float iw = 1. / H::w; // hash bucket width
mas01mc@293 268 Uns32T aa, kk;
mas01mc@293 269 if( v.size() != H::d )
mas01mc@293 270 error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality
mas01mc@293 271 double tmp = 0;
mas01mc@293 272 float *pA, *pb;
mas01mc@293 273 Uns32T *pg;
mas01mc@293 274 int dd;
mas01mc@293 275 vector<float>::iterator vi;
mas01mc@293 276 vector<Uns32T>::iterator ui;
mas01mc@293 277
mas01mc@293 278 #ifdef USE_U_FUNCTIONS
mas01mc@293 279 Uns32T bb;
mas01mc@293 280 // Store m dot products to expand
mas01mc@293 281 for( aa=0; aa < H::m ; aa++ ){
mas01mc@293 282 ui = H::uu[aa].begin();
mas01mc@293 283 for( kk = 0 ; kk < H::k/2 ; kk++ ){
mas01mc@293 284 pb = *( H::b + aa ) + kk;
mas01mc@293 285 pA = * ( * ( H::A + aa ) + kk );
mas01mc@293 286 dd = H::d;
mas01mc@293 287 tmp = 0.;
mas01mc@293 288 vi = v.begin();
mas01mc@293 289 while( dd-- )
mas01mc@293 290 tmp += *pA++ * *vi++; // project
mas01mc@293 291 tmp += *pb; // translate
mas01mc@293 292 tmp *= iw; // scale
mas01mc@293 293 *ui++ = (Uns32T) floor(tmp); // floor
mas01mc@293 294 }
mas01mc@293 295 }
mas01mc@293 296 // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d}
mas01mc@293 297 Uns32T j;
mas01mc@293 298 for( aa=0, j=0 ; aa < H::m-1 ; aa++ )
mas01mc@293 299 for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){
mas01mc@293 300 pg= *( H::g + j ); // L \times functions g_j(v) \in Z^k
mas01mc@293 301 // u_1 \in Z^{(k/2) \times d}
mas01mc@293 302 ui = H::uu[aa].begin();
mas01mc@293 303 kk=H::k/2;
mas01mc@293 304 while( kk-- )
mas01mc@293 305 *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z
mas01mc@293 306 // u_2 \in Z^{(k/2) \times d}
mas01mc@293 307 ui = H::uu[bb].begin();
mas01mc@293 308 kk=H::k/2;
mas01mc@293 309 while( kk--)
mas01mc@293 310 *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z
mas01mc@293 311 }
mas01mc@293 312 #else
mas01mc@293 313 for( aa=0; aa < H::L ; aa++ ){
mas01mc@514 314 pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k
mas01mc@514 315 for( kk = 0 ; kk != H::k ; kk++ ){
mas01mc@293 316 pb = *( H::b + aa ) + kk;
mas01mc@293 317 pA = * ( * ( H::A + aa ) + kk );
mas01mc@293 318 dd = H::d;
mas01mc@293 319 tmp = 0.;
mas01mc@293 320 vi = v.begin();
mas01mc@293 321 while( dd-- )
mas01mc@293 322 tmp += *pA++ * *vi++; // project
mas01mc@293 323 tmp += *pb; // translate
mas01mc@293 324 tmp *= iw; // scale
mas01mc@514 325 *pg++ = (Uns32T) (floor(tmp)); // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z
mas01mc@293 326 }
mas01mc@293 327 }
mas01mc@293 328 #endif
mas01mc@293 329 }
mas01mc@293 330
mas01mc@292 331 // make hash value \in Z
mas01mc@293 332 void H::generate_hash_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){
mas01mc@293 333 H::t1 = computeProductModDefaultPrime( g, r1, H::k ) % H::N;
mas01mc@293 334 H::t2 = computeProductModDefaultPrime( g, r2, H::k );
mas01mc@292 335 }
mas01mc@292 336
mas01mc@292 337 #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}}
mas01mc@292 338
mas01mc@292 339 // Computes (a.b) mod UH_PRIME_DEFAULT
mas01mc@293 340 inline Uns32T H::computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){
mas01mc@292 341 LongUns64T h = 0;
mas01mc@292 342
mas01mc@292 343 for(IntT i = 0; i < size; i++){
mas01mc@292 344 h = h + (LongUns64T)a[i] * (LongUns64T)b[i];
mas01mc@292 345 h = (h & TWO_TO_32_MINUS_1) + 5 * (h >> 32);
mas01mc@292 346 if (h >= UH_PRIME_DEFAULT) {
mas01mc@292 347 h = h - UH_PRIME_DEFAULT;
mas01mc@292 348 }
mas01mc@292 349 CR_ASSERT(h < UH_PRIME_DEFAULT);
mas01mc@292 350 }
mas01mc@292 351 return h;
mas01mc@292 352 }
mas01mc@292 353
mas01mc@292 354 Uns32T H::bucket_insert_point(bucket **pp){
mas01mc@296 355 collisionCount = 0;
mas01mc@340 356 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@292 357 if(!*pp){
mas01mc@292 358 *pp = new bucket();
mas01mc@292 359 (*pp)->t2 = 0; // Use t2 as a collision counter for the row
mas01mc@292 360 (*pp)->next = new bucket();
mas01mc@292 361 }
mas01mc@340 362 // The list head holds point collision count
mas01mc@340 363 if( (*pp)->t2 & LSH_CORE_ARRAY_BIT )
mas01mc@340 364 bucket_insert_point(get_pointer_to_bucket_linked_list(*pp)); // recurse
mas01mc@340 365 else{
mas01mc@340 366 collisionCount = (*pp)->t2;
mas01mc@340 367 if(collisionCount < H::C){ // Block if row is full
mas01mc@340 368 (*pp)->t2++; // Increment collision counter (numPoints in row)
mas01mc@340 369 pointCount++;
mas01mc@340 370 collisionCount++;
mas01mc@340 371 // Locate the bucket linked list
mas01mc@340 372 __bucket_insert_point( (*pp)->next );
mas01mc@340 373 }
mas01mc@292 374 }
mas01mc@340 375 #else // NOT USING LSH_LIST_HEAD_COUNTERS
mas01mc@340 376 if(!*pp)
mas01mc@340 377 *pp = new bucket();
mas01mc@296 378 pointCount++;
mas01mc@296 379 __bucket_insert_point(*pp); // No collision count storage
mas01mc@292 380 #endif
mas01mc@292 381 return collisionCount;
mas01mc@292 382 }
mas01mc@292 383
mas01mc@340 384 // insert points into hashtable row collision chain
mas01mc@292 385 void H::__bucket_insert_point(bucket* p){
mas01mc@292 386 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2?
mas01mc@292 387 p->t2 = H::t2;
mas01mc@292 388 bucketCount++; // Record start of new point-locale collision chain
mas01mc@344 389 p->snext.ptr = new sbucket();
mas01mc@344 390 __sbucket_insert_point(p->snext.ptr);
mas01mc@292 391 return;
mas01mc@292 392 }
mas01mc@292 393
mas01mc@292 394 if(p->t2 == H::t2){
mas01mc@344 395 __sbucket_insert_point(p->snext.ptr);
mas01mc@292 396 return;
mas01mc@292 397 }
mas01mc@292 398
mas01mc@292 399 if(p->next){
mas01mc@340 400 // Construct list in t2 order
mas01mc@340 401 if(H::t2 < p->next->t2){
mas01mc@340 402 bucket* tmp = new bucket();
mas01mc@340 403 tmp->next = p->next;
mas01mc@340 404 p->next = tmp;
mas01mc@340 405 __bucket_insert_point(tmp);
mas01mc@340 406 }
mas01mc@340 407 else
mas01mc@340 408 __bucket_insert_point(p->next);
mas01mc@292 409 }
mas01mc@340 410 else {
mas01mc@292 411 p->next = new bucket();
mas01mc@292 412 __bucket_insert_point(p->next);
mas01mc@292 413 }
mas01mc@292 414 }
mas01mc@292 415
mas01mc@340 416 // insert points into point-locale collision chain
mas01mc@292 417 void H::__sbucket_insert_point(sbucket* p){
mas01mc@292 418 if(p->pointID==IFLAG){
mas01mc@292 419 p->pointID = H::p;
mas01mc@292 420 return;
mas01mc@292 421 }
mas01mc@340 422
mas01mc@292 423 // Search for pointID
mas01mc@340 424 if(p->snext){
mas01mc@292 425 __sbucket_insert_point(p->snext);
mas01mc@292 426 }
mas01mc@292 427 else{
mas01mc@340 428 // Make new point collision bucket at end of list
mas01mc@340 429 p->snext = new sbucket();
mas01mc@340 430 __sbucket_insert_point(p->snext);
mas01mc@292 431 }
mas01mc@292 432 }
mas01mc@292 433
mas01mc@293 434 inline bucket** H::get_bucket(int j){
mas01mc@292 435 return *(h+j);
mas01mc@292 436 }
mas01mc@292 437
mas01mc@340 438 // Find the linked-list pointer at the end of the CORE_ARRAY
mas01mc@340 439 bucket** H::get_pointer_to_bucket_linked_list(bucket* rowPtr){
mas01mc@344 440 Uns32T numBuckets = rowPtr->snext.numBuckets; // Cast pointer to unsigned int
mas01mc@343 441 Uns32T numPoints = rowPtr->t2 & 0x7FFFFFFF; // Value is stored in low 31 bits of t2 field
mas01mc@343 442 bucket** listPtr = reinterpret_cast<bucket**> (reinterpret_cast<unsigned int*>(rowPtr->next)+numPoints+numBuckets+1);
mas01mc@343 443 return listPtr;
mas01mc@340 444 }
mas01mc@340 445
mas01mc@293 446 // Interface to Locality Sensitive Hashing G
mas01mc@293 447 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr):
mas01mc@293 448 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures
mas01mc@308 449 indexName(0),
mas01mc@293 450 lshHeader(0),
mas01mc@292 451 calling_instance(0),
mas01mc@293 452 add_point_callback(0)
mas01mc@292 453 {
mas01mc@293 454
mas01mc@292 455 }
mas01mc@292 456
mas01mc@292 457 // Serialize from file LSH constructor
mas01mc@292 458 // Read parameters from database file
mas01mc@292 459 // Load the hash functions, close the database
mas01mc@292 460 // Optionally load the LSH tables into head-allocated lists in core
mas01mc@292 461 G::G(char* filename, bool lshInCoreFlag):
mas01mc@293 462 H(), // default base-class constructor call delays data-structure initialization
mas01mc@309 463 indexName(0),
mas01mc@293 464 lshHeader(0),
mas01mc@292 465 calling_instance(0),
mas01mc@292 466 add_point_callback(0)
mas01mc@292 467 {
mas01mc@474 468 FILE* dbFile = 0;
mas01mc@292 469 int dbfid = unserialize_lsh_header(filename);
mas01mc@309 470 indexName = new char[O2_INDEX_MAXSTR];
mas01mc@309 471 strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER
mas01mc@293 472 H::initialize_lsh_functions(); // Base-class data-structure initialization
mas01mc@293 473 unserialize_lsh_functions(dbfid); // populate with on-disk hashfunction values
mas01mc@292 474
mas01mc@292 475 // Format1 only needs unserializing if specifically requested
mas01mc@292 476 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2) && lshInCoreFlag){
mas01mc@292 477 unserialize_lsh_hashtables_format1(dbfid);
mas01mc@292 478 }
mas01mc@292 479
mas01mc@292 480 // Format2 always needs unserializing
mas01mc@292 481 if(lshHeader->flags&O2_SERIAL_FILEFORMAT2 && lshInCoreFlag){
mas01mc@474 482 dbFile = fdopen(dbfid, "rb");
mas01mc@336 483 if(!dbFile)
mas01mc@336 484 error("Cannot open LSH file for reading", filename);
mas01mc@336 485 unserialize_lsh_hashtables_format2(dbFile);
mas01mc@292 486 }
mas01mc@336 487 serial_close(dbfid);
mas01mc@474 488 if(dbFile){
mas01mc@474 489 fclose(dbFile);
mas01mc@474 490 dbFile = 0;
mas01mc@474 491 }
mas01mc@336 492 }
mas01mc@292 493
mas01mc@292 494 G::~G(){
mas01mc@292 495 delete lshHeader;
mas01mc@474 496 delete[] indexName;
mas01mc@292 497 }
mas01mc@292 498
mas01mc@292 499 // single point insertion; inserted values are hash value and pointID
mas01mc@292 500 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
mas01mc@292 501 Uns32T collisionCount = 0;
mas01mc@292 502 H::p = pp;
mas01mc@299 503 if(H::maxp && pp<=H::maxp)
mas01mc@296 504 error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
mas01mc@296 505 H::maxp=pp; // Store highest pointID in database
mas01mc@293 506 H::compute_hash_functions( v );
mas01mc@292 507 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
mas01mc@293 508 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
mas01mc@292 509 collisionCount += bucket_insert_point( *(h + j) + t1 );
mas01mc@292 510 }
mas01mc@292 511 return collisionCount;
mas01mc@292 512 }
mas01mc@292 513
mas01mc@292 514
mas01mc@292 515 // batch insert for a point set
mas01mc@292 516 // inserted values are vector hash value and pointID starting at basePointID
mas01mc@292 517 void G::insert_point_set(vector<vector<float> >& vv, Uns32T basePointID){
mas01mc@292 518 for(Uns32T point=0; point<vv.size(); point++)
mas01mc@292 519 insert_point(vv[point], basePointID+point);
mas01mc@292 520 }
mas01mc@292 521
mas01mc@292 522 // point retrieval routine
mas01mc@292 523 void G::retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 524 calling_instance = caller;
mas01mc@292 525 add_point_callback = add_point;
mas01mc@293 526 H::compute_hash_functions( v );
mas01mc@292 527 for(Uns32T j = 0 ; j < H::L ; j++ ){
mas01mc@293 528 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
mas01cr@370 529 if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) {
mas01mc@340 530 #ifdef LSH_LIST_HEAD_COUNTERS
mas01cr@370 531 if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
mas01mc@340 532 retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos);
mas01cr@370 533 } else {
mas01mc@340 534 bucket_chain_point( bPtr->next, qpos);
mas01cr@370 535 }
mas01mc@292 536 #else
mas01cr@370 537 bucket_chain_point( bPtr , qpos);
mas01mc@292 538 #endif
mas01cr@370 539 }
mas01mc@292 540 }
mas01mc@292 541 }
mas01mc@292 542
mas01mc@292 543 void G::retrieve_point_set(vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 544 for(Uns32T qpos = 0 ; qpos < vv.size() ; qpos++ )
mas01mc@292 545 retrieve_point(vv[qpos], qpos, add_point, caller);
mas01mc@292 546 }
mas01mc@292 547
mas01mc@292 548 // export lsh tables to table structure on disk
mas01mc@292 549 //
mas01mc@292 550 // LSH TABLE STRUCTURE
mas01mc@292 551 // ---header 64 bytes ---
mas01mc@292 552 // [magic #tables #rows #cols elementSize databaseSize version flags dim #funs 0 0 0 0 0 0]
mas01mc@292 553 //
mas01mc@292 554 // ---random projections L x k x d float ---
mas01mc@292 555 // A[0][0][0] A[0][0][1] ... A[0][0][d-1]
mas01mc@292 556 // A[0][1][0] A[0][1][1] ... A[1][1][d-1]
mas01mc@292 557 // ...
mas01mc@292 558 // A[0][K-1][0] A[0][1][1] ... A[0][k-1][d-1]
mas01mc@292 559 // ...
mas01mc@292 560 // ...
mas01mc@292 561 // A[L-1][0][0] A[M-1][0][1] ... A[L-1][0][d-1]
mas01mc@292 562 // A[L-1][1][0] A[M-1][1][1] ... A[L-1][1][d-1]
mas01mc@292 563 // ...
mas01mc@292 564 // A[L-1][k-1][0] A[M-1][1][1] ... A[L-1][k-1][d-1]
mas01mc@292 565 //
mas01mc@292 566 // ---bias L x k float ---
mas01mc@292 567 // b[0][0] b[0][1] ... b[0][k-1]
mas01mc@292 568 // b[1][0] b[1][1] ... b[1][k-1]
mas01mc@292 569 // ...
mas01mc@292 570 // b[L-1][0] b[L-1][1] ... b[L-1][k-1]
mas01mc@292 571 //
mas01mc@292 572 // ---random r1 L x k float ---
mas01mc@292 573 // r1[0][0] r1[0][1] ... r1[0][k-1]
mas01mc@292 574 // r1[1][0] r1[1][1] ... r1[1][k-1]
mas01mc@292 575 // ...
mas01mc@292 576 // r1[L-1][0] r1[L-1][1] ... r1[L-1][k-1]
mas01mc@292 577 //
mas01mc@292 578 // ---random r2 L x k float ---
mas01mc@292 579 // r2[0][0] r2[0][1] ... r2[0][k-1]
mas01mc@292 580 // r2[1][0] r2[1][1] ... r2[1][k-1]
mas01mc@292 581 // ...
mas01mc@292 582 // r2[L-1][0] r2[L-1][1] ... r2[L-1][k-1]
mas01mc@292 583 //
mas01mc@293 584 // ******* HASHTABLES FORMAT1 (optimized for LSH_ON_DISK retrieval) *******
mas01mc@292 585 // ---hash table 0: N x C x 8 ---
mas01mc@292 586 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 587 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 588 // ...
mas01mc@292 589 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 590 //
mas01mc@292 591 // ---hash table 1: N x C x 8 ---
mas01mc@292 592 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 593 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 594 // ...
mas01mc@292 595 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 596 //
mas01mc@292 597 // ...
mas01mc@292 598 //
mas01mc@292 599 // ---hash table L-1: N x C x 8 ---
mas01mc@292 600 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 601 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 602 // ...
mas01mc@292 603 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 604 //
mas01mc@293 605 // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) *******
mas01mc@293 606 //
mas01mc@293 607 // State machine controlled by regular expression.
mas01mc@293 608 // legend:
mas01mc@293 609 //
mas01mc@306 610 // O2_SERIAL_TOKEN_T1 = 0xFFFFFFFCU
mas01mc@306 611 // O2_SERIAL_TOKEN_T2 = 0xFFFFFFFDU
mas01mc@306 612 // O2_SERIAL_TOKEN_ENDTABLE = 0xFFFFFFFEU
mas01mc@293 613 //
mas01mc@306 614 // T1 - T1 hash token
mas01mc@306 615 // t1 - t1 hash key (t1 range 0..2^29-1)
mas01mc@306 616 // T2 - T2 token
mas01mc@293 617 // t2 - t2 hash key (range 1..2^32-6)
mas01mc@293 618 // p - point identifier (range 0..2^32-1)
mas01mc@306 619 // E - end hash table token
mas01mc@293 620 // {...} required arguments
mas01mc@293 621 // [...] optional arguments
mas01mc@293 622 // * - match zero or more occurences
mas01mc@293 623 // + - match one or more occurences
mas01mc@293 624 // {...}^L - repeat argument L times
mas01mc@293 625 //
mas01mc@293 626 // FORMAT2 Regular expression:
mas01mc@306 627 // { [T1 t1 [T2 t2 p+]+ ]* E }^L
mas01mc@293 628 //
mas01mc@292 629
mas01mc@292 630 // Serial header constructors
mas01mc@292 631 SerialHeader::SerialHeader(){;}
mas01mc@296 632 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
mas01mc@292 633 lshMagic(O2_SERIAL_MAGIC),
mas01mc@292 634 binWidth(W),
mas01mc@292 635 numTables(L),
mas01mc@292 636 numRows(N),
mas01mc@292 637 numCols(C),
mas01mc@292 638 elementSize(O2_SERIAL_ELEMENT_SIZE),
mas01mc@296 639 version(O2_SERIAL_VERSION),
mas01mc@296 640 size(0), // we are deprecating this value
mas01mc@292 641 flags(FMT),
mas01mc@292 642 dataDim(d),
mas01mc@292 643 numFuns(k),
mas01mc@292 644 radius(r),
mas01mc@296 645 maxp(p),
mas01mc@296 646 size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables
mas01mc@296 647 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
mas01mc@296 648 (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
mas01mc@296 649 pointCount(pc){
mas01mc@296 650
mas01mc@296 651 if(FMT==O2_SERIAL_FILEFORMAT2)
mas01mc@296 652 size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE
mas01mc@296 653 + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T)
mas01mc@296 654 +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
mas01mc@296 655 } // header
mas01mc@292 656
mas01mc@292 657 float* G::get_serial_hashfunction_base(char* db){
mas01mc@292 658 if(db&&lshHeader)
mas01mc@292 659 return (float*)(db+O2_SERIAL_HEADER_SIZE);
mas01mc@292 660 else return NULL;
mas01mc@292 661 }
mas01mc@292 662
mas01mc@292 663 SerialElementT* G::get_serial_hashtable_base(char* db){
mas01mc@292 664 if(db&&lshHeader)
mas01mc@292 665 return (SerialElementT*)(db+get_serial_hashtable_offset());
mas01mc@292 666 else
mas01mc@292 667 return NULL;
mas01mc@292 668 }
mas01mc@292 669
mas01mc@292 670 Uns32T G::get_serial_hashtable_offset(){
mas01mc@292 671 if(lshHeader)
mas01mc@292 672 return align_up(O2_SERIAL_HEADER_SIZE +
mas01mc@292 673 L*lshHeader->numFuns*( sizeof(float)*lshHeader->dataDim+2*sizeof(Uns32T)+sizeof(float)),get_page_logn());
mas01mc@292 674 else
mas01mc@292 675 return 0;
mas01mc@292 676 }
mas01mc@292 677
mas01mc@292 678 void G::serialize(char* filename, Uns32T serialFormat){
mas01mc@292 679 int dbfid;
mas01mc@292 680 char* db;
mas01mc@292 681 int dbIsNew=0;
mas01mc@474 682 FILE* dbFile = 0;
mas01mc@292 683 // Check requested serialFormat
mas01mc@292 684 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
mas01mc@292 685 error("Unrecognized serial file format request: ", "serialize()");
mas01mc@296 686
mas01mc@292 687 // Test to see if file exists
mas01cr@370 688 if((dbfid = open (filename, O_RDONLY)) < 0) {
mas01mc@292 689 // If it doesn't, then create the file (CREATE)
mas01cr@370 690 if(errno == ENOENT) {
mas01mc@292 691 // Create the file
mas01mc@292 692 std::cout << "Creating new serialized LSH database:" << filename << "...";
mas01mc@292 693 std::cout.flush();
mas01mc@292 694 serial_create(filename, serialFormat);
mas01mc@292 695 dbIsNew=1;
mas01cr@370 696 } else {
mas01mc@292 697 // The file can't be opened
mas01mc@292 698 error("Can't open the file", filename, "open");
mas01cr@370 699 }
mas01cr@370 700 }
mas01mc@292 701
mas01mc@292 702 // Load the on-disk header into core
mas01mc@292 703 dbfid = serial_open(filename, 1); // open for write
mas01mc@292 704 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
mas01mc@292 705 serial_get_header(db); // read header
mas01mc@292 706 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@292 707
mas01mc@292 708 // Check compatibility of core and disk data structures
mas01mc@292 709 if( !serial_can_merge(serialFormat) )
mas01mc@292 710 error("Incompatible core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 711
mas01mc@292 712 // For new LSH databases write the hashfunctions
mas01mc@292 713 if(dbIsNew)
mas01mc@292 714 serialize_lsh_hashfunctions(dbfid);
mas01mc@292 715 // Write the hashtables in the requested format
mas01mc@292 716 if(serialFormat == O2_SERIAL_FILEFORMAT1)
mas01mc@292 717 serialize_lsh_hashtables_format1(dbfid, !dbIsNew);
mas01mc@336 718 else{
mas01mc@474 719 dbFile = fdopen(dbfid, "r+b");
mas01mc@336 720 if(!dbFile)
mas01mc@336 721 error("Cannot open LSH file for writing",filename);
mas01mc@336 722 serialize_lsh_hashtables_format2(dbFile, !dbIsNew);
mas01mc@336 723 fflush(dbFile);
mas01mc@336 724 }
mas01mc@292 725
mas01mc@336 726 if(!dbIsNew) {
mas01mc@292 727 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
mas01mc@292 728 //serial_get_header(db); // read header
mas01mc@293 729 cout << "maxp = " << H::maxp << endl;
mas01mc@293 730 lshHeader->maxp=H::maxp;
mas01mc@292 731 // Default to FILEFORMAT1
mas01mc@292 732 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2))
mas01mc@294 733 lshHeader->flags|=O2_SERIAL_FILEFORMAT1;
mas01mc@292 734 memcpy((char*)db, (char*)lshHeader, sizeof(SerialHeaderT));
mas01mc@292 735 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@336 736 }
mas01mc@336 737 serial_close(dbfid);
mas01mc@474 738 if(dbFile){
mas01mc@474 739 fclose(dbFile);
mas01mc@474 740 dbFile = 0;
mas01mc@474 741 }
mas01mc@292 742 }
mas01mc@292 743
mas01mc@292 744 // Test to see if core structure and requested format is
mas01mc@292 745 // compatible with currently opened database
mas01mc@292 746 int G::serial_can_merge(Uns32T format){
mas01mc@292 747 SerialHeaderT* that = lshHeader;
mas01mc@292 748 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2)
mas01mc@292 749 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
mas01mc@292 750 || !( this->w == that->binWidth &&
mas01mc@296 751 this->L == that->numTables &&
mas01mc@296 752 this->N == that->numRows &&
mas01mc@296 753 this->k == that->numFuns &&
mas01mc@296 754 this->d == that->dataDim &&
mas01mc@296 755 sizeof(SerialElementT) == that->elementSize &&
mas01mc@296 756 this->radius == that->radius)){
mas01mc@292 757 serial_print_header(format);
mas01mc@292 758 return 0;
mas01mc@292 759 }
mas01mc@292 760 else
mas01mc@292 761 return 1;
mas01mc@292 762 }
mas01mc@292 763
mas01mc@292 764 // Used as an error message for serial_can_merge()
mas01mc@292 765 void G::serial_print_header(Uns32T format){
mas01mc@292 766 std::cout << "Fc:" << format << " Fs:" << lshHeader->flags << endl;
mas01mc@292 767 std::cout << "Wc:" << w << " Ls:" << lshHeader->binWidth << endl;
mas01mc@292 768 std::cout << "Lc:" << L << " Ls:" << lshHeader->numTables << endl;
mas01mc@292 769 std::cout << "Nc:" << N << " Ns:" << lshHeader->numRows << endl;
mas01mc@292 770 std::cout << "kc:" << k << " ks:" << lshHeader->numFuns << endl;
mas01mc@292 771 std::cout << "dc:" << d << " ds:" << lshHeader->dataDim << endl;
mas01mc@292 772 std::cout << "sc:" << sizeof(SerialElementT) << " ss:" << lshHeader->elementSize << endl;
mas01mc@292 773 std::cout << "rc:" << this->radius << " rs:" << lshHeader->radius << endl;
mas01mc@292 774 }
mas01mc@292 775
mas01mc@292 776 int G::serialize_lsh_hashfunctions(int fid){
mas01mc@292 777 float* pf;
mas01mc@292 778 Uns32T *pu;
mas01mc@292 779 Uns32T x,y,z;
mas01mc@292 780
mas01mc@293 781 char* db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer
mas01mc@292 782 pf = get_serial_hashfunction_base(db);
mas01mc@292 783
mas01mc@292 784 // HASH FUNCTIONS
mas01mc@292 785 // Write the random projectors A[][][]
mas01mc@292 786 #ifdef USE_U_FUNCTIONS
mas01mc@292 787 for( x = 0 ; x < H::m ; x++ )
mas01mc@292 788 for( y = 0 ; y < H::k/2 ; y++ )
mas01mc@292 789 #else
mas01mc@292 790 for( x = 0 ; x < H::L ; x++ )
mas01mc@292 791 for( y = 0 ; y < H::k ; y++ )
mas01mc@292 792 #endif
mas01mc@292 793 for( z = 0 ; z < d ; z++ )
mas01mc@293 794 *pf++ = H::A[x][y][z];
mas01mc@292 795
mas01mc@292 796 // Write the random biases b[][]
mas01mc@292 797 #ifdef USE_U_FUNCTIONS
mas01mc@292 798 for( x = 0 ; x < H::m ; x++ )
mas01mc@292 799 for( y = 0 ; y < H::k/2 ; y++ )
mas01mc@292 800 #else
mas01mc@292 801 for( x = 0 ; x < H::L ; x++ )
mas01mc@292 802 for( y = 0 ; y < H::k ; y++ )
mas01mc@292 803 #endif
mas01mc@293 804 *pf++ = H::b[x][y];
mas01mc@292 805
mas01mc@292 806 pu = (Uns32T*)pf;
mas01mc@292 807
mas01mc@292 808 // Write the Z projectors r1[][]
mas01mc@292 809 for( x = 0 ; x < H::L ; x++)
mas01mc@292 810 for( y = 0 ; y < H::k ; y++)
mas01mc@293 811 *pu++ = H::r1[x][y];
mas01mc@292 812
mas01mc@292 813 // Write the Z projectors r2[][]
mas01mc@292 814 for( x = 0 ; x < H::L ; x++)
mas01mc@292 815 for( y = 0; y < H::k ; y++)
mas01mc@293 816 *pu++ = H::r2[x][y];
mas01mc@292 817
mas01mc@292 818 serial_munmap(db, get_serial_hashtable_offset());
mas01mc@292 819 return 1;
mas01mc@292 820 }
mas01mc@292 821
mas01mc@292 822 int G::serialize_lsh_hashtables_format1(int fid, int merge){
mas01mc@292 823 SerialElementT *pe, *pt;
mas01mc@292 824 Uns32T x,y;
mas01mc@292 825
mas01mc@292 826 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT1) )
mas01mc@292 827 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 828
mas01mc@292 829 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 830 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount;
mas01mc@292 831 // Write the hash tables
mas01mc@292 832 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 833 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT1...";
mas01mc@292 834 std::cout.flush();
mas01mc@292 835 // memory map a single hash table for sequential access
mas01mc@292 836 // Align each hash table to page boundary
mas01mc@292 837 char* dbtable = serial_mmap(fid, hashTableSize, 1,
mas01mc@292 838 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
mas01mc@324 839 #ifdef __CYGWIN__
mas01mc@324 840 // No madvise in CYGWIN
mas01mc@324 841 #else
mas01mc@292 842 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 843 error("could not advise hashtable memory","","madvise");
mas01mc@324 844 #endif
mas01mc@292 845 maxColCount=0;
mas01mc@292 846 minColCount=O2_SERIAL_MAX_COLS;
mas01mc@292 847 meanColCount=0;
mas01mc@292 848 colCountN=0;
mas01mc@292 849 pt=(SerialElementT*)dbtable;
mas01mc@292 850 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 851 // Move disk pointer to beginning of row
mas01mc@292 852 pe=pt+y*lshHeader->numCols;
mas01mc@292 853
mas01mc@292 854 colCount=0;
mas01cr@370 855 if(bucket* bPtr = h[x][y]) {
mas01cr@370 856 if(merge) {
mas01mc@340 857 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@292 858 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
mas01cr@370 859 } else {
mas01mc@292 860 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 861 #else
mas01cr@370 862 serial_merge_hashtable_row_format1(pe, bPtr, colCount);
mas01cr@370 863 } else {
mas01cr@370 864 serial_write_hashtable_row_format1(pe, bPtr, colCount);
mas01mc@292 865 #endif
mas01cr@370 866 }
mas01cr@370 867 }
mas01mc@292 868 if(colCount){
mas01mc@292 869 if(colCount<minColCount)
mas01mc@292 870 minColCount=colCount;
mas01mc@292 871 if(colCount>maxColCount)
mas01mc@292 872 maxColCount=colCount;
mas01mc@292 873 meanColCount+=colCount;
mas01mc@292 874 colCountN++;
mas01mc@292 875 }
mas01mc@292 876 }
mas01mc@292 877 if(colCountN)
mas01mc@292 878 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
mas01mc@292 879 << ", min = " << minColCount << ", max = " << maxColCount
mas01mc@292 880 << endl;
mas01mc@292 881 serial_munmap(dbtable, hashTableSize);
mas01mc@292 882 }
mas01mc@292 883
mas01mc@292 884 // We're done writing
mas01mc@292 885 return 1;
mas01mc@292 886 }
mas01mc@292 887
mas01mc@292 888 void G::serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* b, Uns32T& colCount){
mas01mc@292 889 while(b && b->t2!=IFLAG){
mas01mc@292 890 SerialElementT*pe=pr; // reset disk pointer to beginning of row
mas01mc@344 891 serial_merge_element_format1(pe, b->snext.ptr, b->t2, colCount);
mas01mc@292 892 b=b->next;
mas01mc@292 893 }
mas01mc@292 894 }
mas01mc@292 895
mas01mc@292 896 void G::serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
mas01mc@292 897 while(sb){
mas01mc@292 898 if(colCount==lshHeader->numCols){
mas01mc@292 899 std::cout << "!point-chain full " << endl;
mas01mc@292 900 return;
mas01mc@292 901 }
mas01mc@292 902 Uns32T c=0;
mas01mc@292 903 // Merge collision chains
mas01mc@292 904 while(c<lshHeader->numCols){
mas01mc@292 905 if( (pe+c)->hashValue==IFLAG){
mas01mc@292 906 (pe+c)->hashValue=t2;
mas01mc@292 907 (pe+c)->pointID=sb->pointID;
mas01mc@292 908 colCount=c+1;
mas01mc@292 909 if(c+1<lshHeader->numCols)
mas01mc@292 910 (pe+c+1)->hashValue=IFLAG;
mas01mc@292 911 break;
mas01mc@292 912 }
mas01mc@292 913 c++;
mas01mc@292 914 }
mas01mc@292 915 sb=sb->snext;
mas01mc@292 916 }
mas01mc@292 917 return;
mas01mc@292 918 }
mas01mc@292 919
mas01mc@292 920 void G::serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* b, Uns32T& colCount){
mas01mc@292 921 pe->hashValue=IFLAG;
mas01mc@292 922 while(b && b->t2!=IFLAG){
mas01mc@344 923 serial_write_element_format1(pe, b->snext.ptr, b->t2, colCount);
mas01mc@292 924 b=b->next;
mas01mc@292 925 }
mas01mc@292 926 }
mas01mc@292 927
mas01mc@292 928 void G::serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
mas01mc@292 929 while(sb){
mas01mc@292 930 if(colCount==lshHeader->numCols){
mas01mc@292 931 std::cout << "!point-chain full " << endl;
mas01mc@292 932 return;
mas01mc@292 933 }
mas01mc@292 934 pe->hashValue=t2;
mas01mc@292 935 pe->pointID=sb->pointID;
mas01mc@292 936 pe++;
mas01mc@292 937 colCount++;
mas01mc@292 938 sb=sb->snext;
mas01mc@292 939 }
mas01mc@292 940 pe->hashValue=IFLAG;
mas01mc@292 941 return;
mas01mc@292 942 }
mas01mc@292 943
mas01mc@336 944 int G::serialize_lsh_hashtables_format2(FILE* dbFile, int merge){
mas01mc@292 945 Uns32T x,y;
mas01mc@292 946
mas01mc@292 947 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
mas01mc@292 948 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 949
mas01mc@340 950 // We must pereform FORMAT2 merges in core FORMAT1 (dynamic list structure)
mas01mc@292 951 if(merge)
mas01mc@340 952 unserialize_lsh_hashtables_format2(dbFile, merge);
mas01mc@292 953
mas01mc@292 954 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
mas01mc@336 955 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
mas01mc@336 956 fclose(dbFile);
mas01mc@336 957 error("fSeek error in serialize_lsh_hashtables_format2");
mas01mc@336 958 }
mas01mc@292 959
mas01mc@292 960 // Write the hash tables
mas01mc@292 961 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 962 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT2...";
mas01mc@292 963 std::cout.flush();
mas01mc@292 964 maxColCount=0;
mas01mc@292 965 minColCount=O2_SERIAL_MAX_COLS;
mas01mc@292 966 meanColCount=0;
mas01mc@292 967 colCountN=0;
mas01mc@292 968 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 969 colCount=0;
mas01mc@292 970 if(bucket* bPtr = h[x][y]){
mas01mc@306 971 // Check for empty row (even though row was allocated)
mas01mc@340 972 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@306 973 if(bPtr->next->t2==IFLAG){
mas01mc@336 974 fclose(dbFile);
mas01mc@306 975 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
mas01mc@306 976 }
mas01mc@306 977 #else
mas01mc@306 978 if(bPtr->t2==IFLAG){
mas01mc@336 979 fclose(dbFile);
mas01mc@306 980 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
mas01mc@306 981 }
mas01mc@306 982 #endif
mas01mc@306 983 t1 = O2_SERIAL_TOKEN_T1;
mas01mc@340 984 WRITE_UNS32(&t1, "[T1]");
mas01mc@306 985 t1 = y;
mas01mc@340 986 WRITE_UNS32(&t1, "[t1]");
mas01mc@340 987 #ifdef LSH_CORE_ARRAY
mas01mc@340 988 t1 = count_buckets_and_points_hashtable_row(bPtr);
mas01mc@340 989 WRITE_UNS32(&t1,"[count]"); // write numElements
mas01mc@340 990 #endif
mas01mc@340 991 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@336 992 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 993 #else
mas01mc@340 994 serial_write_hashtable_row_format2(dbFile, bPtr, colCount);
mas01mc@292 995 #endif
mas01mc@292 996 }
mas01mc@292 997 if(colCount){
mas01mc@292 998 if(colCount<minColCount)
mas01mc@292 999 minColCount=colCount;
mas01mc@292 1000 if(colCount>maxColCount)
mas01mc@292 1001 maxColCount=colCount;
mas01mc@292 1002 meanColCount+=colCount;
mas01mc@292 1003 colCountN++;
mas01mc@292 1004 }
mas01mc@292 1005 }
mas01mc@292 1006 // Write END of table marker
mas01mc@306 1007 t1 = O2_SERIAL_TOKEN_ENDTABLE;
mas01mc@340 1008 WRITE_UNS32(&t1,"[end]");
mas01mc@292 1009 if(colCountN)
mas01mc@292 1010 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
mas01mc@292 1011 << ", min = " << minColCount << ", max = " << maxColCount
mas01mc@292 1012 << endl;
mas01mc@340 1013 }
mas01mc@292 1014 // We're done writing
mas01mc@292 1015 return 1;
mas01mc@292 1016 }
mas01mc@292 1017
mas01mc@340 1018 Uns32T G::count_buckets_and_points_hashtable_row(bucket* bPtr){
mas01mc@340 1019 Uns32T total_count = 0;
mas01mc@340 1020 bucket* p = 0;
mas01mc@340 1021
mas01mc@340 1022 // count points
mas01mc@340 1023 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@340 1024 total_count = bPtr->t2; // points already counted
mas01mc@340 1025 p = bPtr->next;
mas01mc@340 1026 #else
mas01mc@340 1027 total_count = count_points_hashtable_row(bPtr);
mas01mc@340 1028 p = bPtr;
mas01mc@340 1029 #endif
mas01mc@340 1030
mas01mc@340 1031 // count buckets
mas01mc@340 1032 do{
mas01mc@340 1033 total_count++;
mas01mc@340 1034 }while((p=p->next));
mas01mc@340 1035
mas01mc@340 1036 return total_count;
mas01mc@340 1037 }
mas01mc@340 1038
mas01mc@340 1039 Uns32T G::count_points_hashtable_row(bucket* bPtr){
mas01mc@340 1040 Uns32T point_count = 0;
mas01mc@340 1041 bucket* p = bPtr;
mas01mc@340 1042 sbucket* s = 0;
mas01mc@340 1043 while(p){
mas01mc@344 1044 s = p->snext.ptr;
mas01mc@340 1045 while(s){
mas01mc@340 1046 point_count++;
mas01mc@340 1047 s=s->snext;
mas01mc@340 1048 }
mas01mc@340 1049 p=p->next;
mas01mc@340 1050 }
mas01mc@340 1051 return point_count;
mas01mc@340 1052 }
mas01mc@340 1053
mas01mc@336 1054 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
mas01mc@292 1055 while(b && b->t2!=IFLAG){
mas01mc@344 1056 if(!b->snext.ptr){
mas01mc@336 1057 fclose(dbFile);
mas01mc@306 1058 error("Empty collision chain in serial_write_hashtable_row_format2()");
mas01mc@306 1059 }
mas01mc@306 1060 t2 = O2_SERIAL_TOKEN_T2;
mas01mc@336 1061 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 1062 fclose(dbFile);
mas01mc@292 1063 error("write error in serial_write_hashtable_row_format2()");
mas01mc@292 1064 }
mas01mc@292 1065 t2 = b->t2;
mas01mc@336 1066 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 1067 fclose(dbFile);
mas01mc@292 1068 error("write error in serial_write_hashtable_row_format2()");
mas01mc@292 1069 }
mas01mc@344 1070 serial_write_element_format2(dbFile, b->snext.ptr, colCount);
mas01mc@292 1071 b=b->next;
mas01mc@292 1072 }
mas01mc@292 1073 }
mas01mc@292 1074
mas01mc@336 1075 void G::serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount){
mas01mc@292 1076 while(sb){
mas01mc@336 1077 if(fwrite(&sb->pointID, sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1078 fclose(dbFile);
mas01mc@292 1079 error("Write error in serial_write_element_format2()");
mas01mc@292 1080 }
mas01mc@292 1081 colCount++;
mas01mc@292 1082 sb=sb->snext;
mas01mc@292 1083 }
mas01mc@292 1084 }
mas01mc@292 1085
mas01mc@292 1086
mas01mc@292 1087 int G::serial_create(char* filename, Uns32T FMT){
mas01mc@292 1088 return serial_create(filename, w, L, N, C, k, d, FMT);
mas01mc@292 1089 }
mas01mc@292 1090
mas01mc@292 1091
mas01mc@292 1092 int G::serial_create(char* filename, float binWidth, Uns32T numTables, Uns32T numRows, Uns32T numCols,
mas01mc@292 1093 Uns32T numFuns, Uns32T dim, Uns32T FMT){
mas01mc@292 1094
mas01mc@292 1095 if(numTables > O2_SERIAL_MAX_TABLES || numRows > O2_SERIAL_MAX_ROWS
mas01mc@292 1096 || numCols > O2_SERIAL_MAX_COLS || numFuns > O2_SERIAL_MAX_FUNS
mas01mc@292 1097 || dim>O2_SERIAL_MAX_DIM){
mas01mc@292 1098 error("LSH parameters out of bounds for serialization");
mas01mc@292 1099 }
mas01mc@292 1100
mas01mc@292 1101 int dbfid;
mas01mc@292 1102 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0)
mas01mc@292 1103 error("Can't create serial file", filename, "open");
mas01mc@292 1104 get_lock(dbfid, 1);
mas01mc@292 1105
mas01mc@292 1106 // Make header first to get size of serialized database
mas01mc@296 1107 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);
mas01mc@296 1108
mas01mc@296 1109 cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
mas01mc@296 1110 if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
mas01mc@296 1111 error("Maximum size of LSH file exceded: > 4000MB");
mas01mc@296 1112
mas01mc@292 1113 // go to the location corresponding to the last byte
mas01mc@292 1114 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
mas01mc@292 1115 error("lseek error in db file", "", "lseek");
mas01mc@292 1116
mas01mc@292 1117 // write a dummy byte at the last location
mas01mc@292 1118 if (write (dbfid, "", 1) != 1)
mas01mc@292 1119 error("write error", "", "write");
mas01mc@292 1120
mas01mc@293 1121 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
mas01mc@296 1122
mas01mc@292 1123 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
mas01mc@296 1124
mas01mc@292 1125 serial_munmap(db, O2_SERIAL_HEADER_SIZE);
mas01mc@296 1126
mas01mc@292 1127 close(dbfid);
mas01mc@292 1128
mas01mc@292 1129 std::cout << "done initializing tables." << endl;
mas01mc@292 1130
mas01mc@292 1131 return 1;
mas01mc@292 1132 }
mas01mc@292 1133
mas01mc@292 1134 char* G::serial_mmap(int dbfid, Uns32T memSize, Uns32T forWrite, off_t offset){
mas01mc@293 1135 char* db;
mas01mc@292 1136 if(forWrite){
mas01mc@292 1137 if ((db = (char*) mmap(0, memSize, PROT_READ | PROT_WRITE,
mas01mc@292 1138 MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
mas01mc@292 1139 error("mmap error in request for writable serialized database", "", "mmap");
mas01mc@292 1140 }
mas01mc@292 1141 else if ((db = (char*) mmap(0, memSize, PROT_READ, MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
mas01mc@292 1142 error("mmap error in read-only serialized database", "", "mmap");
mas01mc@292 1143
mas01mc@292 1144 return db;
mas01mc@292 1145 }
mas01mc@292 1146
mas01mc@292 1147 SerialHeaderT* G::serial_get_header(char* db){
mas01mc@292 1148 lshHeader = new SerialHeaderT();
mas01mc@292 1149 memcpy((char*)lshHeader, db, sizeof(SerialHeaderT));
mas01mc@292 1150
mas01mc@292 1151 if(lshHeader->lshMagic!=O2_SERIAL_MAGIC)
mas01mc@292 1152 error("Not an LSH database file");
mas01mc@292 1153
mas01mc@292 1154 return lshHeader;
mas01mc@292 1155 }
mas01mc@292 1156
mas01mc@292 1157 void G::serial_munmap(char* db, Uns32T N){
mas01mc@292 1158 munmap(db, N);
mas01mc@292 1159 }
mas01mc@292 1160
mas01mc@292 1161 int G::serial_open(char* filename, int writeFlag){
mas01mc@292 1162 int dbfid;
mas01mc@292 1163 if(writeFlag){
mas01mc@292 1164 if ((dbfid = open (filename, O_RDWR)) < 0)
mas01mc@292 1165 error("Can't open serial file for read/write", filename, "open");
mas01mc@292 1166 get_lock(dbfid, writeFlag);
mas01mc@292 1167 }
mas01mc@292 1168 else{
mas01mc@292 1169 if ((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 1170 error("Can't open serial file for read", filename, "open");
mas01mc@292 1171 get_lock(dbfid, 0);
mas01mc@292 1172 }
mas01mc@292 1173
mas01mc@292 1174 return dbfid;
mas01mc@292 1175 }
mas01mc@292 1176
mas01mc@292 1177 void G::serial_close(int dbfid){
mas01mc@292 1178
mas01mc@292 1179 release_lock(dbfid);
mas01mc@292 1180 close(dbfid);
mas01mc@292 1181 }
mas01mc@292 1182
mas01mc@292 1183 int G::unserialize_lsh_header(char* filename){
mas01mc@292 1184
mas01mc@292 1185 int dbfid;
mas01mc@292 1186 char* db;
mas01mc@292 1187 // Test to see if file exists
mas01mc@292 1188 if((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 1189 error("Can't open the file", filename, "open");
mas01mc@292 1190 close(dbfid);
mas01mc@292 1191 dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1192 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1193 serial_get_header(db); // read header
mas01mc@292 1194 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@292 1195
mas01mc@292 1196 // Unserialize header parameters
mas01mc@292 1197 H::L = lshHeader->numTables;
mas01mc@292 1198 H::m = (Uns32T)( (1.0 + sqrt(1 + 8.0*(int)H::L)) / 2.0);
mas01mc@292 1199 H::N = lshHeader->numRows;
mas01mc@292 1200 H::C = lshHeader->numCols;
mas01mc@292 1201 H::k = lshHeader->numFuns;
mas01mc@292 1202 H::d = lshHeader->dataDim;
mas01mc@293 1203 H::w = lshHeader->binWidth;
mas01mc@293 1204 H::radius = lshHeader->radius;
mas01mc@293 1205 H::maxp = lshHeader->maxp;
mas01mc@296 1206 H::pointCount = lshHeader->pointCount;
mas01mc@292 1207
mas01mc@292 1208 return dbfid;
mas01mc@292 1209 }
mas01mc@292 1210
mas01mc@292 1211 // unserialize the LSH parameters
mas01mc@292 1212 // we leave the LSH tree on disk as a flat file
mas01mc@292 1213 // it is this flat file that we search by memory mapping
mas01mc@292 1214 void G::unserialize_lsh_functions(int dbfid){
mas01mc@292 1215 Uns32T j, kk;
mas01mc@292 1216 float* pf;
mas01mc@292 1217 Uns32T* pu;
mas01mc@292 1218
mas01mc@292 1219 // Load the hash functions into core
mas01mc@292 1220 char* db = serial_mmap(dbfid, get_serial_hashtable_offset(), 0);// get database pointer again
mas01mc@292 1221
mas01mc@292 1222 pf = get_serial_hashfunction_base(db);
mas01mc@292 1223
mas01mc@292 1224 #ifdef USE_U_FUNCTIONS
mas01mc@292 1225 for( j = 0 ; j < H::m ; j++ ){ // L functions gj(v)
mas01mc@292 1226 for( kk = 0 ; kk < H::k/2 ; kk++ ){ // Normally distributed hash functions
mas01mc@292 1227 #else
mas01mc@292 1228 for( j = 0 ; j < H::L ; j++ ){ // L functions gj(v)
mas01mc@292 1229 for( kk = 0 ; kk < H::k ; kk++ ){ // Normally distributed hash functions
mas01mc@292 1230 #endif
mas01mc@292 1231 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 1232 H::A[j][kk][i] = *pf++; // Normally distributed random vectors
mas01mc@292 1233 }
mas01mc@292 1234 }
mas01mc@292 1235 #ifdef USE_U_FUNCTIONS
mas01mc@292 1236 for( j = 0 ; j < H::m ; j++ ) // biases b
mas01mc@292 1237 for( kk = 0 ; kk < H::k/2 ; kk++ )
mas01mc@292 1238 #else
mas01mc@292 1239 for( j = 0 ; j < H::L ; j++ ) // biases b
mas01mc@292 1240 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1241 #endif
mas01mc@293 1242 H::b[j][kk] = *pf++;
mas01mc@292 1243
mas01mc@292 1244 pu = (Uns32T*)pf;
mas01mc@292 1245 for( j = 0 ; j < H::L ; j++ ) // Z projectors r1
mas01mc@292 1246 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1247 H::r1[j][kk] = *pu++;
mas01mc@292 1248
mas01mc@292 1249 for( j = 0 ; j < H::L ; j++ ) // Z projectors r2
mas01mc@292 1250 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1251 H::r2[j][kk] = *pu++;
mas01mc@292 1252
mas01mc@293 1253 serial_munmap(db, get_serial_hashtable_offset());
mas01mc@292 1254 }
mas01mc@292 1255
mas01mc@292 1256 void G::unserialize_lsh_hashtables_format1(int fid){
mas01mc@292 1257 SerialElementT *pe, *pt;
mas01mc@292 1258 Uns32T x,y;
mas01mc@292 1259 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1260 // Read the hash tables into core
mas01mc@292 1261 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 1262 // memory map a single hash table
mas01mc@292 1263 // Align each hash table to page boundary
mas01mc@292 1264 char* dbtable = serial_mmap(fid, hashTableSize, 0,
mas01mc@292 1265 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
mas01mc@324 1266 #ifdef __CYGWIN__
mas01mc@324 1267 // No madvise in CYGWIN
mas01mc@324 1268 #else
mas01mc@292 1269 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 1270 error("could not advise hashtable memory","","madvise");
mas01mc@324 1271 #endif
mas01mc@292 1272 pt=(SerialElementT*)dbtable;
mas01mc@292 1273 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 1274 // Move disk pointer to beginning of row
mas01mc@292 1275 pe=pt+y*lshHeader->numCols;
mas01mc@292 1276 unserialize_hashtable_row_format1(pe, h[x]+y);
mas01mc@340 1277 #ifdef LSH_DUMP_CORE_TABLES
mas01mc@292 1278 printf("S[%d,%d]", x, y);
mas01mc@292 1279 serial_bucket_dump(pe);
mas01mc@292 1280 printf("C[%d,%d]", x, y);
mas01mc@292 1281 dump_hashtable_row(h[x][y]);
mas01mc@292 1282 #endif
mas01mc@292 1283 }
mas01mc@292 1284 serial_munmap(dbtable, hashTableSize);
mas01mc@292 1285 }
mas01mc@292 1286 }
mas01mc@292 1287
mas01mc@292 1288 void G::unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b){
mas01mc@292 1289 Uns32T colCount = 0;
mas01mc@292 1290 while(colCount!=lshHeader->numCols && pe->hashValue !=IFLAG){
mas01mc@292 1291 H::p = pe->pointID; // current point ID
mas01mc@292 1292 t2 = pe->hashValue;
mas01mc@292 1293 bucket_insert_point(b);
mas01mc@292 1294 pe++;
mas01mc@292 1295 colCount++;
mas01mc@292 1296 }
mas01mc@292 1297 }
mas01mc@292 1298
mas01mc@340 1299 void G::unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge){
mas01mc@292 1300 Uns32T x=0,y=0;
mas01mc@292 1301
mas01mc@292 1302 // Seek to hashtable base offset
mas01mc@336 1303 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
mas01mc@336 1304 fclose(dbFile);
mas01mc@336 1305 error("fSeek error in unserialize_lsh_hashtables_format2");
mas01mc@292 1306 }
mas01mc@292 1307
mas01mc@292 1308 // Read the hash tables into core (structure is given in header)
mas01mc@292 1309 while( x < H::L){
mas01mc@336 1310 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1311 fclose(dbFile);
mas01mc@292 1312 error("Read error","unserialize_lsh_hashtables_format2()");
mas01mc@292 1313 }
mas01mc@306 1314 if(H::t1==O2_SERIAL_TOKEN_ENDTABLE)
mas01mc@292 1315 x++; // End of table
mas01mc@292 1316 else
mas01mc@292 1317 while(y < H::N){
mas01mc@306 1318 // Read a row and move file pointer to beginning of next row or _bittable
mas01mc@306 1319 if(!(H::t1==O2_SERIAL_TOKEN_T1)){
mas01mc@336 1320 fclose(dbFile);
mas01mc@306 1321 error("State matchine error T1","unserialize_lsh_hashtables_format2()");
mas01mc@292 1322 }
mas01mc@336 1323 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1324 fclose(dbFile);
mas01mc@306 1325 error("Read error: t1","unserialize_lsh_hashtables_format2()");
mas01mc@306 1326 }
mas01mc@306 1327 y = H::t1;
mas01mc@292 1328 if(y>=H::N){
mas01mc@336 1329 fclose(dbFile);
mas01mc@292 1330 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
mas01mc@292 1331 }
mas01mc@340 1332 Uns32T token = 0;
mas01mc@340 1333 #ifdef LSH_CORE_ARRAY
mas01mc@340 1334 Uns32T numElements;
mas01mc@340 1335 if(fread(&numElements, sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@340 1336 fclose(dbFile);
mas01mc@340 1337 error("Read error: numElements","unserialize_lsh_hashtables_format2()");
mas01mc@340 1338 }
mas01mc@292 1339
mas01mc@340 1340 // BACKWARD COMPATIBILITY: check to see if T2 or END token was read
mas01mc@340 1341 if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){
mas01mc@340 1342 forMerge=true; // Force use of dynamic linked list core format
mas01mc@340 1343 token = numElements;
mas01mc@340 1344 }
mas01mc@340 1345
mas01mc@340 1346 if(forMerge)
mas01mc@340 1347 // Use linked list CORE format
mas01mc@340 1348 token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token);
mas01mc@340 1349 else
mas01mc@340 1350 // Use ARRAY CORE format with numElements counter
mas01mc@340 1351 token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements);
mas01mc@340 1352 #else
mas01mc@340 1353 token = unserialize_hashtable_row_format2(dbFile, h[x]+y);
mas01mc@513 1354 #endif
mas01mc@292 1355 // Check that token is valid
mas01mc@306 1356 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
mas01mc@336 1357 fclose(dbFile);
mas01mc@292 1358 error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()");
mas01mc@292 1359 }
mas01mc@292 1360 // Check for end of table flag
mas01mc@306 1361 if(token==O2_SERIAL_TOKEN_ENDTABLE){
mas01mc@292 1362 x++;
mas01mc@292 1363 break;
mas01mc@292 1364 }
mas01mc@292 1365 // Check for new row flag
mas01mc@306 1366 if(token==O2_SERIAL_TOKEN_T1)
mas01mc@292 1367 H::t1 = token;
mas01mc@292 1368 }
mas01mc@292 1369 }
mas01mc@513 1370 #ifdef LSH_DUMP_CORE_TABLES
mas01mc@513 1371 dump_hashtables();
mas01mc@513 1372 #endif
mas01mc@306 1373 }
mas01mc@292 1374
mas01mc@340 1375 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token){
mas01mc@292 1376 bool pointFound = false;
mas01mc@340 1377
mas01mc@340 1378 if(token)
mas01mc@340 1379 H::t2 = token;
mas01mc@340 1380 else if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@340 1381 fclose(dbFile);
mas01mc@340 1382 error("Read error T2 token","unserialize_hashtable_row_format2");
mas01mc@340 1383 }
mas01mc@340 1384
mas01mc@306 1385 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
mas01mc@336 1386 fclose(dbFile);
mas01mc@292 1387 error("State machine error: expected E or T2");
mas01mc@292 1388 }
mas01mc@340 1389
mas01mc@306 1390 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
mas01mc@292 1391 pointFound=false;
mas01mc@292 1392 // Check for T2 token
mas01mc@306 1393 if(H::t2!=O2_SERIAL_TOKEN_T2){
mas01mc@336 1394 fclose(dbFile);
mas01mc@292 1395 error("State machine error T2 token", "unserialize_hashtable_row_format2()");
mas01mc@306 1396 }
mas01mc@292 1397 // Read t2 value
mas01mc@336 1398 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1399 fclose(dbFile);
mas01mc@292 1400 error("Read error t2","unserialize_hashtable_row_format2");
mas01mc@292 1401 }
mas01mc@336 1402 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1403 fclose(dbFile);
mas01mc@292 1404 error("Read error H::p","unserialize_hashtable_row_format2");
mas01mc@292 1405 }
mas01mc@306 1406 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
mas01mc@292 1407 pointFound=true;
mas01mc@292 1408 bucket_insert_point(b);
mas01mc@336 1409 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1410 fclose(dbFile);
mas01mc@292 1411 error("Read error H::p","unserialize_hashtable_row_format2");
mas01mc@292 1412 }
mas01mc@292 1413 }
mas01mc@292 1414 if(!pointFound)
mas01mc@292 1415 error("State machine error: point", "unserialize_hashtable_row_format2()");
mas01mc@306 1416 H::t2 = H::p; // Copy last found token to t2
mas01mc@292 1417 }
mas01mc@292 1418 return H::t2; // holds current token
mas01mc@292 1419 }
mas01mc@292 1420
mas01mc@340 1421 // Unserialize format2 hashtable row to a CORE_ARRAY pointed to
mas01mc@340 1422 // by the hashtable row pointer: rowPtr
mas01mc@340 1423 //
mas01mc@340 1424 // numElements is the total number of t2 buckets plus points
mas01mc@340 1425 // memory required is numElements+1+sizeof(hashtable ptr)
mas01mc@340 1426 //
mas01mc@340 1427 // numElements = numPoints + numBuckets
mas01mc@340 1428 //
mas01mc@340 1429 // During inserts (merges) new hashtable entries are appened at rowPtr+numPoints+numBuckets+1
mas01mc@340 1430 //
mas01mc@340 1431 // ASSUME: that LSH_LIST_HEAD_COUNTERS is set so that the first hashtable bucket is used to count
mas01mc@340 1432 // point and bucket entries
mas01mc@340 1433 //
mas01mc@340 1434 // We store the values of numPoints and numBuckets in separate fields of the first bucket
mas01mc@340 1435 // rowPtr->t2 // numPoints
mas01mc@340 1436 // (Uns32T)(rowPtr->snext) // numBuckets
mas01mc@340 1437 //
mas01mc@340 1438 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*))
mas01mc@340 1439 // To get to the fist bucket, we use
mas01mc@340 1440 //
mas01mc@340 1441
mas01mc@340 1442 #define READ_UNS32T(VAL,TOKENSTR) if(fread(VAL, sizeof(Uns32T), 1, dbFile) != 1){\
mas01mc@340 1443 fclose(dbFile);error("Read error unserialize_hashtable_format2",TOKENSTR);}
mas01mc@340 1444
mas01mc@340 1445 #define TEST_TOKEN(TEST, TESTSTR) if(TEST){fclose(dbFile);error("State machine error: ", TESTSTR);}
mas01mc@340 1446
mas01mc@340 1447 #define SKIP_BITS_LEFT_SHIFT_MSB (30)
mas01mc@340 1448
mas01mc@340 1449 #define SKIP_BITS_RIGHT_SHIFT_MSB (28)
mas01mc@340 1450 #define SKIP_BITS_RIGHT_SHIFT_LSB (30)
mas01mc@340 1451
mas01mc@340 1452 #define MAX_POINTS_IN_BUCKET_CORE_ARRAY (16)
mas01mc@340 1453 #define LSH_CORE_ARRAY_END_ROW_TOKEN (0xFFFFFFFD)
mas01mc@340 1454
mas01mc@340 1455 // Encode the skip bits. Zero if only one point, MAX 8 (plus first == 9)
mas01mc@340 1456 #define ENCODE_POINT_SKIP_BITS TEST_TOKEN(!numPointsThisBucket, "no points found");\
mas01mc@340 1457 if(numPointsThisBucket==1){\
mas01mc@340 1458 secondPtr=ap++;\
mas01mc@340 1459 *secondPtr=0;\
mas01mc@340 1460 numPoints++;\
mas01mc@340 1461 }\
mas01mc@340 1462 if(numPointsThisBucket>1){\
mas01mc@340 1463 *firstPtr |= ( (numPointsThisBucket-1) & 0x3 ) << SKIP_BITS_LEFT_SHIFT_MSB;\
mas01mc@340 1464 *secondPtr |= ( ( (numPointsThisBucket-1) & 0xC) >> 2 ) << SKIP_BITS_LEFT_SHIFT_MSB;}
mas01mc@340 1465
mas01mc@340 1466 Uns32T G::unserialize_hashtable_row_to_array(FILE* dbFile, bucket** rowPP, Uns32T numElements){
mas01mc@340 1467 Uns32T numPointsThisBucket = 0;
mas01mc@340 1468 Uns32T numBuckets = 0;
mas01mc@340 1469 Uns32T numPoints = 0;
mas01mc@340 1470 Uns32T* firstPtr = 0;
mas01mc@340 1471 Uns32T* secondPtr = 0;
mas01mc@340 1472
mas01mc@340 1473 // Initialize new row
mas01mc@340 1474 if(!*rowPP){
mas01mc@340 1475 *rowPP = new bucket();
mas01mc@340 1476 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@340 1477 (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row
mas01mc@340 1478 (*rowPP)->next = 0;
mas01mc@340 1479 #endif
mas01mc@340 1480 }
mas01mc@340 1481 bucket* rowPtr = *rowPP;
mas01mc@340 1482
mas01mc@340 1483 READ_UNS32T(&(H::t2),"t2");
mas01mc@340 1484 TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2");
mas01mc@340 1485 // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements
mas01mc@340 1486 // So over-allocate by a factor of two and realloc later to actual numElements
mas01mc@340 1487 CR_ASSERT(rowPtr->next = (bucket*) malloc((2*numElements+1)*sizeof(Uns32T)+sizeof(bucket**)));
mas01mc@340 1488 Uns32T* ap = reinterpret_cast<Uns32T*>(rowPtr->next); // Cast pointer to Uns32T* for array format
mas01mc@340 1489 while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){
mas01mc@340 1490 numPointsThisBucket = 0;// reset bucket point counter
mas01mc@340 1491 secondPtr = 0; // reset second-point pointer
mas01mc@340 1492 TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");
mas01mc@340 1493 READ_UNS32T(&(H::t2), "Read error t2");
mas01mc@340 1494 *ap++ = H::t2; // Insert t2 value into array
mas01mc@340 1495 numBuckets++;
mas01mc@340 1496 READ_UNS32T(&(H::p), "Read error H::p");
mas01mc@340 1497 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
mas01mc@340 1498 if(numPointsThisBucket==MAX_POINTS_IN_BUCKET_CORE_ARRAY){
mas01mc@340 1499 ENCODE_POINT_SKIP_BITS;
mas01mc@340 1500 *ap++ = H::t2; // Extra element
mas01mc@340 1501 numBuckets++; // record this as a new bucket
mas01mc@340 1502 numPointsThisBucket=0; // reset bucket point counter
mas01mc@340 1503 secondPtr = 0; // reset second-point pointer
mas01mc@340 1504 }
mas01mc@340 1505 if( ++numPointsThisBucket == 1 )
mas01mc@340 1506 firstPtr = ap; // store pointer to first point to insert skip bits later on
mas01mc@340 1507 else if( numPointsThisBucket == 2 )
mas01mc@340 1508 secondPtr = ap; // store pointer to first point to insert skip bits later on
mas01mc@340 1509 numPoints++;
mas01mc@340 1510 *ap++ = H::p;
mas01mc@340 1511 READ_UNS32T(&(H::p), "Read error H::p");
mas01mc@340 1512 }
mas01mc@340 1513 ENCODE_POINT_SKIP_BITS;
mas01mc@340 1514 H::t2 = H::p; // Copy last found token to t2
mas01mc@340 1515 }
mas01mc@340 1516 // Reallocate the row to its actual size
mas01mc@340 1517 CR_ASSERT(rowPtr->next = (bucket*) realloc(rowPtr->next, (numBuckets+numPoints+1)*sizeof(Uns32T)+sizeof(bucket**)));
mas01mc@340 1518 // Record the sizes at the head of the row
mas01mc@344 1519 rowPtr->snext.numBuckets = numBuckets;
mas01mc@340 1520 rowPtr->t2 = numPoints;
mas01mc@340 1521 // Place end of row marker
mas01mc@340 1522 *ap++ = LSH_CORE_ARRAY_END_ROW_TOKEN;
mas01mc@340 1523 // Set the LSH_CORE_ARRAY_BIT to identify data structure for insertion and retrieval
mas01mc@340 1524 rowPtr->t2 |= LSH_CORE_ARRAY_BIT;
mas01mc@340 1525 // Allocate a new dynamic list head at the end of the array
mas01mc@340 1526 bucket** listPtr = reinterpret_cast<bucket**> (ap);
mas01mc@340 1527 *listPtr = 0;
mas01mc@340 1528 // Return current token
mas01mc@340 1529 return H::t2; // return H::t2 which holds current token [E or T1]
mas01mc@340 1530 }
mas01mc@340 1531
mas01mc@340 1532
mas01mc@340 1533
mas01mc@340 1534 // *p is a pointer to the beginning of a hashtable row array
mas01mc@340 1535 // The array consists of t2 hash keys and one or more point identifiers p for each hash key
mas01mc@340 1536 // Retrieval is performed by generating a hash key query_t2 for query point q
mas01mc@340 1537 // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry
mas01mc@340 1538 // point for retrieve_from_core_hashtable_array
mas01mc@340 1539 #define SKIP_BITS (0xC0000000)
mas01mc@340 1540 void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){
mas01mc@340 1541 Uns32T skip;
mas01mc@340 1542 Uns32T t2;
mas01mc@340 1543 Uns32T p1;
mas01mc@340 1544 Uns32T p2;
mas01mc@340 1545
mas01mc@340 1546 CR_ASSERT(p);
mas01mc@340 1547
mas01mc@340 1548 do{
mas01mc@340 1549 t2 = *p++;
mas01mc@340 1550 if( t2 > H::t2 )
mas01mc@340 1551 return;
mas01mc@340 1552 p1 = *p++;
mas01mc@340 1553 p2 = *p++;
mas01mc@340 1554 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
mas01mc@340 1555 if( t2 == H::t2 ){
mas01mc@340 1556 add_point_callback(calling_instance, p1 ^ (p1 & SKIP_BITS), qpos, radius);
mas01mc@340 1557 if(skip--){
mas01mc@340 1558 add_point_callback(calling_instance, p2 ^ (p2 & SKIP_BITS), qpos, radius);
mas01mc@340 1559 while(skip-- )
mas01mc@340 1560 add_point_callback(calling_instance, *p++, qpos, radius);
mas01mc@340 1561 }
mas01mc@340 1562 }
mas01mc@340 1563 else
mas01mc@340 1564 if(*p != LSH_CORE_ARRAY_END_ROW_TOKEN)
mas01mc@340 1565 p = p + skip;
mas01mc@340 1566 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
mas01mc@340 1567 }
mas01mc@513 1568
mas01mc@513 1569 void G::dump_hashtables(){
mas01mc@513 1570 for(Uns32T x = 0; x < H::L ; x++)
mas01mc@513 1571 for(Uns32T y = 0; y < H::N ; y++){
mas01mc@513 1572 bucket* bPtr = h[x][y];
mas01mc@513 1573 if(bPtr){
mas01mc@513 1574 printf("C[%d,%d]", x, y);
mas01mc@513 1575 #ifdef LSH_LIST_HEAD_COUNTERS
mas01mc@513 1576 printf("[numBuckets=%d]",bPtr->snext.numBuckets);
mas01mc@513 1577 if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
mas01mc@513 1578 dump_core_hashtable_array((Uns32T*)(bPtr->next));
mas01mc@513 1579 }
mas01mc@513 1580 else {
mas01mc@513 1581 dump_hashtable_row(bPtr->next);
mas01mc@513 1582 }
mas01mc@513 1583 #else
mas01mc@513 1584 dump_hashtable_row(bPtr);
mas01mc@513 1585 #endif
mas01mc@513 1586 printf("\n");
mas01mc@513 1587 fflush(stdout);
mas01mc@513 1588 }
mas01mc@513 1589 }
mas01mc@513 1590 }
mas01mc@513 1591
mas01mc@513 1592 void G::dump_core_hashtable_array(Uns32T* p){
mas01mc@513 1593 Uns32T skip;
mas01mc@513 1594 Uns32T t2;
mas01mc@513 1595 Uns32T p1;
mas01mc@513 1596 Uns32T p2;
mas01mc@513 1597 CR_ASSERT(p);
mas01mc@513 1598 do{
mas01mc@513 1599 t2 = *p++;
mas01mc@513 1600 p1 = *p++;
mas01mc@513 1601 p2 = *p++;
mas01mc@513 1602 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
mas01mc@513 1603 printf("(%0x, %0x)", t2, p1 ^ (p1 & SKIP_BITS));
mas01mc@513 1604 if(skip--){
mas01mc@513 1605 printf("(%0x, %0x)", t2, p2 ^ (p2 & SKIP_BITS));
mas01mc@513 1606 while(skip-- )
mas01mc@513 1607 printf("(%0x, %0x)", t2, *p++);
mas01mc@513 1608 }
mas01mc@513 1609 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
mas01mc@513 1610 }
mas01mc@340 1611
mas01mc@292 1612 void G::dump_hashtable_row(bucket* p){
mas01mc@292 1613 while(p && p->t2!=IFLAG){
mas01mc@344 1614 sbucket* sbp = p->snext.ptr;
mas01mc@292 1615 while(sbp){
mas01mc@292 1616 printf("(%0X,%u)", p->t2, sbp->pointID);
mas01mc@292 1617 fflush(stdout);
mas01mc@292 1618 sbp=sbp->snext;
mas01mc@292 1619 }
mas01mc@292 1620 p=p->next;
mas01mc@292 1621 }
mas01mc@292 1622 printf("\n");
mas01mc@292 1623 }
mas01mc@292 1624
mas01mc@292 1625
mas01mc@292 1626 // G::serial_retrieve_point( ... )
mas01mc@292 1627 // retrieves (pointID) from a serialized LSH database
mas01mc@292 1628 //
mas01mc@292 1629 // inputs:
mas01mc@292 1630 // filename - file name of serialized LSH database
mas01mc@292 1631 // vv - query point set
mas01mc@292 1632 //
mas01mc@292 1633 // outputs:
mas01mc@292 1634 // inserts retrieved points into add_point() callback method
mas01mc@292 1635 void G::serial_retrieve_point_set(char* filename, vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller)
mas01mc@292 1636 {
mas01mc@292 1637 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1638 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1639 serial_get_header(dbheader); // read header
mas01mc@292 1640 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1641
mas01mc@292 1642 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
mas01mc@336 1643 serial_close(dbfid);
mas01mc@292 1644 error("serial_retrieve_point_set is for SERIAL_FILEFORMAT1 only");
mas01mc@292 1645 }
mas01mc@292 1646
mas01mc@292 1647 // size of each hash table
mas01mc@292 1648 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1649 calling_instance = caller; // class instance variable used in ...bucket_chain_point()
mas01mc@292 1650 add_point_callback = add_point;
mas01mc@292 1651
mas01mc@292 1652 for(Uns32T j=0; j<L; j++){
mas01mc@292 1653 // memory map a single hash table for random access
mas01mc@292 1654 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1655 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1656 #ifdef __CYGWIN__
mas01mc@324 1657 // No madvise in CYGWIN
mas01mc@324 1658 #else
mas01mc@292 1659 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
mas01mc@292 1660 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1661 #endif
mas01mc@292 1662 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@292 1663 for(Uns32T qpos=0; qpos<vv.size(); qpos++){
mas01mc@293 1664 H::compute_hash_functions(vv[qpos]);
mas01mc@293 1665 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
mas01mc@292 1666 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
mas01mc@292 1667 }
mas01mc@292 1668 serial_munmap(db, hashTableSize); // drop hashtable mmap
mas01mc@292 1669 }
mas01mc@292 1670 serial_close(dbfid);
mas01mc@292 1671 }
mas01mc@292 1672
mas01mc@292 1673 void G::serial_retrieve_point(char* filename, vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 1674 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1675 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1676 serial_get_header(dbheader); // read header
mas01mc@292 1677 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1678
mas01mc@292 1679 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
mas01mc@336 1680 serial_close(dbfid);
mas01mc@292 1681 error("serial_retrieve_point is for SERIAL_FILEFORMAT1 only");
mas01mc@292 1682 }
mas01mc@292 1683
mas01mc@292 1684 // size of each hash table
mas01mc@292 1685 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1686 calling_instance = caller;
mas01mc@292 1687 add_point_callback = add_point;
mas01mc@293 1688 H::compute_hash_functions(v);
mas01mc@292 1689 for(Uns32T j=0; j<L; j++){
mas01mc@292 1690 // memory map a single hash table for random access
mas01mc@292 1691 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1692 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1693 #ifdef __CYGWIN__
mas01mc@324 1694 // No madvise in CYGWIN
mas01mc@324 1695 #else
mas01mc@292 1696 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
mas01mc@292 1697 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1698 #endif
mas01mc@292 1699 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@293 1700 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
mas01mc@292 1701 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
mas01mc@292 1702 serial_munmap(db, hashTableSize); // drop hashtable mmap
mas01mc@292 1703 }
mas01mc@292 1704 serial_close(dbfid);
mas01mc@292 1705 }
mas01mc@292 1706
mas01mc@292 1707 void G::serial_dump_tables(char* filename){
mas01mc@292 1708 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1709 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1710 serial_get_header(dbheader); // read header
mas01mc@292 1711 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1712 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1713 for(Uns32T j=0; j<L; j++){
mas01mc@292 1714 // memory map a single hash table for random access
mas01mc@292 1715 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1716 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1717 #ifdef __CYGWIN__
mas01mc@324 1718 // No madvise in CYGWIN
mas01mc@324 1719 #else
mas01mc@292 1720 if(madvise(db, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 1721 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1722 #endif
mas01mc@292 1723 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@292 1724 printf("*********** TABLE %d ***************\n", j);
mas01mc@292 1725 fflush(stdout);
mas01mc@292 1726 int count=0;
mas01mc@292 1727 do{
mas01mc@292 1728 printf("[%d,%d]", j, count++);
mas01mc@292 1729 fflush(stdout);
mas01mc@292 1730 serial_bucket_dump(pe);
mas01mc@292 1731 pe+=lshHeader->numCols;
mas01mc@292 1732 }while(pe<(SerialElementT*)db+lshHeader->numRows*lshHeader->numCols);
mas01mc@292 1733 }
mas01mc@292 1734
mas01mc@292 1735 }
mas01mc@292 1736
mas01mc@292 1737 void G::serial_bucket_dump(SerialElementT* pe){
mas01mc@292 1738 SerialElementT* pend = pe+lshHeader->numCols;
mas01mc@292 1739 while( !(pe->hashValue==IFLAG || pe==pend ) ){
mas01mc@292 1740 printf("(%0X,%u)",pe->hashValue,pe->pointID);
mas01mc@292 1741 pe++;
mas01mc@292 1742 }
mas01mc@292 1743 printf("\n");
mas01mc@292 1744 fflush(stdout);
mas01mc@292 1745 }
mas01mc@292 1746
mas01mc@292 1747 void G::serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos){
mas01mc@292 1748 SerialElementT* pend = pe+lshHeader->numCols;
mas01mc@292 1749 while( !(pe->hashValue==IFLAG || pe==pend ) ){
mas01mc@292 1750 if(pe->hashValue==t2){ // new match
mas01mc@292 1751 add_point_callback(calling_instance, pe->pointID, qpos, radius);
mas01mc@292 1752 }
mas01mc@292 1753 pe++;
mas01mc@292 1754 }
mas01mc@292 1755 }
mas01mc@292 1756
mas01mc@292 1757 void G::bucket_chain_point(bucket* p, Uns32T qpos){
mas01mc@292 1758 if(!p || p->t2==IFLAG)
mas01mc@292 1759 return;
mas01mc@292 1760 if(p->t2==t2){ // match
mas01mc@344 1761 sbucket_chain_point(p->snext.ptr, qpos); // add to reporter
mas01mc@292 1762 }
mas01mc@292 1763 if(p->next){
mas01mc@292 1764 bucket_chain_point(p->next, qpos); // recurse
mas01mc@292 1765 }
mas01mc@292 1766 }
mas01mc@292 1767
mas01mc@292 1768 void G::sbucket_chain_point(sbucket* p, Uns32T qpos){
mas01mc@292 1769 add_point_callback(calling_instance, p->pointID, qpos, radius);
mas01mc@292 1770 if(p->snext){
mas01mc@292 1771 sbucket_chain_point(p->snext, qpos);
mas01mc@292 1772 }
mas01mc@292 1773 }
mas01mc@292 1774
mas01mc@292 1775 void G::get_lock(int fd, bool exclusive) {
mas01mc@292 1776 struct flock lock;
mas01mc@292 1777 int status;
mas01mc@292 1778 lock.l_type = exclusive ? F_WRLCK : F_RDLCK;
mas01mc@292 1779 lock.l_whence = SEEK_SET;
mas01mc@292 1780 lock.l_start = 0;
mas01mc@292 1781 lock.l_len = 0; /* "the whole file" */
mas01mc@292 1782 retry:
mas01mc@292 1783 do {
mas01mc@292 1784 status = fcntl(fd, F_SETLKW, &lock);
mas01mc@292 1785 } while (status != 0 && errno == EINTR);
mas01mc@292 1786 if (status) {
mas01mc@292 1787 if (errno == EAGAIN) {
mas01mc@292 1788 sleep(1);
mas01mc@292 1789 goto retry;
mas01mc@292 1790 } else {
mas01mc@292 1791 error("fcntl lock error", "", "fcntl");
mas01mc@292 1792 }
mas01mc@292 1793 }
mas01mc@292 1794 }
mas01mc@292 1795
mas01mc@292 1796 void G::release_lock(int fd) {
mas01mc@292 1797 struct flock lock;
mas01mc@292 1798 int status;
mas01mc@292 1799
mas01mc@292 1800 lock.l_type = F_UNLCK;
mas01mc@292 1801 lock.l_whence = SEEK_SET;
mas01mc@292 1802 lock.l_start = 0;
mas01mc@292 1803 lock.l_len = 0;
mas01mc@292 1804
mas01mc@292 1805 status = fcntl(fd, F_SETLKW, &lock);
mas01mc@292 1806
mas01mc@292 1807 if (status)
mas01mc@292 1808 error("fcntl unlock error", "", "fcntl");
mas01mc@292 1809 }