annotate lshlib.cpp @ 336:fe4d5b763086

converted read/write into fread/fwrite for LSH hashtable serialize and unserialize. INDEXING is now faster.
author mas01mc
date Fri, 05 Sep 2008 14:16:21 +0000
parents c93be2f3a674
children a6edbe97fddf
rev   line source
mas01mc@292 1 #include "lshlib.h"
mas01mc@292 2
mas01mc@292 3 //#define __LSH_DUMP_CORE_TABLES__
mas01mc@292 4 //#define USE_U_FUNCTIONS
mas01mc@296 5 #define LSH_BLOCK_FULL_ROWS
mas01mc@292 6
mas01mc@292 7 void err(char*s){cout << s << endl;exit(2);}
mas01mc@292 8
mas01mc@292 9 Uns32T get_page_logn(){
mas01mc@292 10 int pagesz = (int)sysconf(_SC_PAGESIZE);
mas01mc@292 11 return (Uns32T)log2((double)pagesz);
mas01mc@292 12 }
mas01mc@292 13
mas01mc@292 14 unsigned align_up(unsigned x, unsigned w){ return ((x) + ((1<<w)-1) & ~((1<<w)-1)); }
mas01mc@292 15
mas01mc@292 16 void H::error(const char* a, const char* b, const char *sysFunc) {
mas01mc@292 17 cerr << a << ": " << b << endl;
mas01mc@292 18 if (sysFunc) {
mas01mc@292 19 perror(sysFunc);
mas01mc@292 20 }
mas01mc@292 21 exit(1);
mas01mc@292 22 }
mas01mc@292 23
mas01mc@293 24 H::H(){
mas01mc@293 25 // Delay initialization of lsh functions until we know the parameters
mas01mc@293 26 }
mas01mc@293 27
mas01mc@293 28 H::H(Uns32T kk, Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float ww, float rr):
mas01mc@292 29 #ifdef USE_U_FUNCTIONS
mas01mc@292 30 use_u_functions(true),
mas01mc@292 31 #else
mas01mc@292 32 use_u_functions(false),
mas01mc@292 33 #endif
mas01mc@293 34 maxp(0),
mas01mc@292 35 bucketCount(0),
mas01mc@292 36 pointCount(0),
mas01mc@292 37 N(NN),
mas01mc@292 38 C(CC),
mas01mc@292 39 k(kk),
mas01mc@292 40 m(mm),
mas01mc@293 41 L((mm*(mm-1))/2),
mas01mc@293 42 d(dd),
mas01mc@293 43 w(ww),
mas01mc@293 44 radius(rr)
mas01mc@292 45 {
mas01mc@292 46
mas01mc@292 47 if(m<2){
mas01mc@292 48 m=2;
mas01mc@292 49 L=1; // check value of L
mas01mc@292 50 cout << "warning: setting m=2, L=1" << endl;
mas01mc@292 51 }
mas01mc@292 52 if(use_u_functions && k%2){
mas01mc@292 53 k++; // make sure k is even
mas01mc@292 54 cout << "warning: setting k even" << endl;
mas01mc@292 55 }
mas01mc@293 56
mas01mc@293 57 // We have the necessary parameters, so construct hashfunction datastructures
mas01mc@293 58 initialize_lsh_functions();
mas01mc@292 59 }
mas01mc@292 60
mas01mc@293 61 void H::initialize_lsh_functions(){
mas01mc@292 62 H::P = UH_PRIME_DEFAULT;
mas01mc@292 63
mas01mc@292 64 /* FIXME: don't use time(); instead use /dev/random or similar */
mas01mc@292 65 /* FIXME: write out the seed somewhere, so that we can get
mas01mc@292 66 repeatability */
mas01mc@292 67 #ifdef MT19937
mas01mc@292 68 init_genrand(time(NULL));
mas01mc@292 69 #else
mas01mc@292 70 srand(time(NULL)); // seed random number generator
mas01mc@292 71 #endif
mas01mc@293 72 Uns32T i,j, kk;
mas01mc@293 73 #ifdef USE_U_FUNCTIONS
mas01mc@293 74 H::A = new float**[ H::m ]; // m x k x d random projectors
mas01mc@293 75 H::b = new float*[ H::m ]; // m x k random biases
mas01mc@293 76 #else
mas01mc@293 77 H::A = new float**[ H::L ]; // m x k x d random projectors
mas01mc@293 78 H::b = new float*[ H::L ]; // m x k random biases
mas01mc@293 79 #endif
mas01mc@293 80 H::g = new Uns32T*[ H::L ]; // L x k random projections
mas01mc@293 81 assert( H::g && H::A && H::b ); // failure
mas01mc@293 82 #ifdef USE_U_FUNCTIONS
mas01mc@293 83 // Use m \times u_i functions \in R^{(k/2) \times (d)}
mas01mc@293 84 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
mas01mc@293 85 for( j = 0; j < H::m ; j++ ){ // m functions u_i(v)
mas01mc@293 86 H::A[j] = new float*[ H::k/2 ]; // k/2 x d 2-stable distribution coefficients
mas01mc@293 87 H::b[j] = new float[ H::k/2 ]; // bias
mas01mc@293 88 assert( H::A[j] && H::b[j] ); // failure
mas01mc@293 89 for( kk = 0; kk < H::k/2 ; kk++ ){
mas01mc@293 90 H::A[j][kk] = new float[ H::d ];
mas01mc@293 91 assert( H::A[j][kk] ); // failure
mas01mc@293 92 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 93 H::A[j][kk][i] = H::randn(); // Normal
mas01mc@293 94 H::b[j][kk] = H::ranf()*H::w; // Uniform
mas01mc@293 95 }
mas01mc@293 96 }
mas01mc@293 97 #else
mas01mc@293 98 // Use m \times u_i functions \in R^{k \times (d)}
mas01mc@293 99 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
mas01mc@293 100 for( j = 0; j < H::L ; j++ ){ // m functions u_i(v)
mas01mc@293 101 H::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients
mas01mc@293 102 H::b[j] = new float[ H::k ]; // bias
mas01mc@293 103 assert( H::A[j] && H::b[j] ); // failure
mas01mc@293 104 for( kk = 0; kk < H::k ; kk++ ){
mas01mc@293 105 H::A[j][kk] = new float[ H::d ];
mas01mc@293 106 assert( H::A[j][kk] ); // failure
mas01mc@293 107 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 108 H::A[j][kk][i] = H::randn(); // Normal
mas01mc@293 109 H::b[j][kk] = H::ranf()*H::w; // Uniform
mas01mc@293 110 }
mas01mc@293 111 }
mas01mc@293 112 #endif
mas01mc@293 113
mas01mc@293 114 // Storage for LSH hash function output (Uns32T)
mas01mc@293 115 for( j = 0 ; j < H::L ; j++ ){ // L functions g_j(u_a, u_b) a,b \in nchoosek(m,2)
mas01mc@293 116 H::g[j] = new Uns32T[ H::k ]; // k x 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z
mas01mc@293 117 assert( H::g[j] );
mas01mc@292 118 }
mas01mc@292 119
mas01mc@293 120 // LSH Hash tables
mas01mc@293 121 H::h = new bucket**[ H::L ];
mas01mc@293 122 assert( H::h );
mas01mc@292 123 for( j = 0 ; j < H::L ; j++ ){
mas01mc@292 124 H::h[j] = new bucket*[ H::N ];
mas01mc@292 125 assert( H::h[j] );
mas01mc@292 126 for( i = 0 ; i < H::N ; i++)
mas01mc@292 127 H::h[j][i] = 0;
mas01mc@292 128 }
mas01mc@293 129
mas01mc@293 130 // Standard hash functions
mas01mc@293 131 H::r1 = new Uns32T*[ H::L ];
mas01mc@293 132 H::r2 = new Uns32T*[ H::L ];
mas01mc@293 133 assert( H::r1 && H::r2 ); // failure
mas01mc@293 134 for( j = 0 ; j < H::L ; j++ ){
mas01mc@293 135 H::r1[ j ] = new Uns32T[ H::k ];
mas01mc@293 136 H::r2[ j ] = new Uns32T[ H::k ];
mas01mc@293 137 assert( H::r1[j] && H::r2[j] ); // failure
mas01mc@293 138 for( i = 0; i<H::k; i++){
mas01mc@293 139 H::r1[j][i] = randr();
mas01mc@293 140 H::r2[j][i] = randr();
mas01mc@293 141 }
mas01mc@293 142 }
mas01mc@293 143
mas01mc@293 144 // Storage for whole or partial function evaluation depdenting on USE_U_FUNCTIONS
mas01mc@293 145 H::initialize_partial_functions();
mas01mc@293 146 }
mas01mc@293 147
mas01mc@293 148 void H::initialize_partial_functions(){
mas01mc@293 149
mas01mc@293 150 #ifdef USE_U_FUNCTIONS
mas01mc@293 151 H::uu = vector<vector<Uns32T> >(H::m);
mas01mc@293 152 for( Uns32T aa=0 ; aa < H::m ; aa++ )
mas01mc@293 153 H::uu[aa] = vector<Uns32T>( H::k/2 );
mas01mc@293 154 #else
mas01mc@293 155 H::uu = vector<vector<Uns32T> >(H::L);
mas01mc@293 156 for( Uns32T aa=0 ; aa < H::L ; aa++ )
mas01mc@293 157 H::uu[aa] = vector<Uns32T>( H::k );
mas01mc@293 158 #endif
mas01mc@293 159 }
mas01mc@293 160
mas01mc@293 161
mas01mc@293 162 // Generate z ~ N(0,1)
mas01mc@293 163 float H::randn(){
mas01mc@293 164 // Box-Muller
mas01mc@293 165 float x1, x2;
mas01mc@293 166 do{
mas01mc@293 167 x1 = ranf();
mas01mc@293 168 } while (x1 == 0); // cannot take log of 0
mas01mc@293 169 x2 = ranf();
mas01mc@293 170 float z;
mas01mc@293 171 z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2);
mas01mc@293 172 return z;
mas01mc@293 173 }
mas01mc@293 174
mas01mc@293 175 float H::ranf(){
mas01mc@293 176 #ifdef MT19937
mas01mc@293 177 return (float) genrand_real2();
mas01mc@293 178 #else
mas01mc@293 179 return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) );
mas01mc@293 180 #endif
mas01mc@293 181 }
mas01mc@293 182
mas01mc@293 183 // range is 1..2^29
mas01mc@293 184 /* FIXME: that looks like an ... odd range. Still. */
mas01mc@293 185 Uns32T H::randr(){
mas01mc@293 186 #ifdef MT19937
mas01mc@293 187 return (Uns32T)((genrand_int32() >> 3) + 1);
mas01mc@293 188 #else
mas01mc@293 189 return (Uns32T) ((rand() >> 2) + 1);
mas01mc@293 190 #endif
mas01mc@292 191 }
mas01mc@292 192
mas01mc@292 193 // Destruct hash tables
mas01mc@292 194 H::~H(){
mas01mc@293 195 Uns32T i,j,kk;
mas01mc@293 196 #ifdef USE_U_FUNCTIONS
mas01mc@293 197 for( j = 0 ; j < H::m ; j++ ){
mas01mc@293 198 for( kk = 0 ; kk < H::k/2 ; kk++ )
mas01mc@293 199 delete[] A[j][kk];
mas01mc@293 200 delete[] A[j];
mas01mc@293 201 }
mas01mc@293 202 delete[] A;
mas01mc@293 203 for( j = 0 ; j < H::m ; j++ )
mas01mc@293 204 delete[] b[j];
mas01mc@293 205 delete[] b;
mas01mc@293 206 #else
mas01mc@293 207 for( j = 0 ; j < H::L ; j++ ){
mas01mc@293 208 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@293 209 delete[] A[j][kk];
mas01mc@293 210 delete[] A[j];
mas01mc@293 211 }
mas01mc@293 212 delete[] A;
mas01mc@293 213 for( j = 0 ; j < H::L ; j++ )
mas01mc@293 214 delete[] b[j];
mas01mc@293 215 delete[] b;
mas01mc@293 216 #endif
mas01mc@293 217
mas01mc@293 218 for( j = 0 ; j < H::L ; j++ )
mas01mc@293 219 delete[] g[j];
mas01mc@293 220 delete[] g;
mas01mc@292 221 for( j=0 ; j < H::L ; j++ ){
mas01mc@292 222 delete[] H::r1[ j ];
mas01mc@292 223 delete[] H::r2[ j ];
mas01mc@292 224 for(i = 0; i< H::N ; i++)
mas01mc@292 225 delete H::h[ j ][ i ];
mas01mc@292 226 delete[] H::h[ j ];
mas01mc@292 227 }
mas01mc@292 228 delete[] H::r1;
mas01mc@292 229 delete[] H::r2;
mas01mc@292 230 delete[] H::h;
mas01mc@292 231 }
mas01mc@292 232
mas01mc@292 233
mas01mc@293 234 // Compute all hash functions for vector v
mas01mc@293 235 // #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d}
mas01mc@293 236 // to make L \times g_j functions \in Z^k
mas01mc@293 237 void H::compute_hash_functions(vector<float>& v){ // v \in R^d
mas01mc@293 238 float iw = 1. / H::w; // hash bucket width
mas01mc@293 239 Uns32T aa, kk;
mas01mc@293 240 if( v.size() != H::d )
mas01mc@293 241 error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality
mas01mc@293 242 double tmp = 0;
mas01mc@293 243 float *pA, *pb;
mas01mc@293 244 Uns32T *pg;
mas01mc@293 245 int dd;
mas01mc@293 246 vector<float>::iterator vi;
mas01mc@293 247 vector<Uns32T>::iterator ui;
mas01mc@293 248
mas01mc@293 249 #ifdef USE_U_FUNCTIONS
mas01mc@293 250 Uns32T bb;
mas01mc@293 251 // Store m dot products to expand
mas01mc@293 252 for( aa=0; aa < H::m ; aa++ ){
mas01mc@293 253 ui = H::uu[aa].begin();
mas01mc@293 254 for( kk = 0 ; kk < H::k/2 ; kk++ ){
mas01mc@293 255 pb = *( H::b + aa ) + kk;
mas01mc@293 256 pA = * ( * ( H::A + aa ) + kk );
mas01mc@293 257 dd = H::d;
mas01mc@293 258 tmp = 0.;
mas01mc@293 259 vi = v.begin();
mas01mc@293 260 while( dd-- )
mas01mc@293 261 tmp += *pA++ * *vi++; // project
mas01mc@293 262 tmp += *pb; // translate
mas01mc@293 263 tmp *= iw; // scale
mas01mc@293 264 *ui++ = (Uns32T) floor(tmp); // floor
mas01mc@293 265 }
mas01mc@293 266 }
mas01mc@293 267 // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d}
mas01mc@293 268 Uns32T j;
mas01mc@293 269 for( aa=0, j=0 ; aa < H::m-1 ; aa++ )
mas01mc@293 270 for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){
mas01mc@293 271 pg= *( H::g + j ); // L \times functions g_j(v) \in Z^k
mas01mc@293 272 // u_1 \in Z^{(k/2) \times d}
mas01mc@293 273 ui = H::uu[aa].begin();
mas01mc@293 274 kk=H::k/2;
mas01mc@293 275 while( kk-- )
mas01mc@293 276 *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z
mas01mc@293 277 // u_2 \in Z^{(k/2) \times d}
mas01mc@293 278 ui = H::uu[bb].begin();
mas01mc@293 279 kk=H::k/2;
mas01mc@293 280 while( kk--)
mas01mc@293 281 *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z
mas01mc@293 282 }
mas01mc@293 283 #else
mas01mc@293 284 for( aa=0; aa < H::L ; aa++ ){
mas01mc@293 285 ui = H::uu[aa].begin();
mas01mc@293 286 for( kk = 0 ; kk < H::k ; kk++ ){
mas01mc@293 287 pb = *( H::b + aa ) + kk;
mas01mc@293 288 pA = * ( * ( H::A + aa ) + kk );
mas01mc@293 289 dd = H::d;
mas01mc@293 290 tmp = 0.;
mas01mc@293 291 vi = v.begin();
mas01mc@293 292 while( dd-- )
mas01mc@293 293 tmp += *pA++ * *vi++; // project
mas01mc@293 294 tmp += *pb; // translate
mas01mc@293 295 tmp *= iw; // scale
mas01mc@293 296 *ui++ = (Uns32T) (floor(tmp)); // floor
mas01mc@293 297 }
mas01mc@293 298 }
mas01mc@293 299 // Compute hash functions
mas01mc@293 300 for( aa=0 ; aa < H::L ; aa++ ){
mas01mc@293 301 pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k
mas01mc@293 302 // u_1 \in Z^{k \times d}
mas01mc@293 303 ui = H::uu[aa].begin();
mas01mc@293 304 kk=H::k;
mas01mc@293 305 while( kk-- )
mas01mc@293 306 *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z
mas01mc@293 307 }
mas01mc@293 308 #endif
mas01mc@293 309 }
mas01mc@293 310
mas01mc@292 311 // make hash value \in Z
mas01mc@293 312 void H::generate_hash_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){
mas01mc@293 313 H::t1 = computeProductModDefaultPrime( g, r1, H::k ) % H::N;
mas01mc@293 314 H::t2 = computeProductModDefaultPrime( g, r2, H::k );
mas01mc@292 315 }
mas01mc@292 316
mas01mc@292 317 #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}}
mas01mc@292 318
mas01mc@292 319 // Computes (a.b) mod UH_PRIME_DEFAULT
mas01mc@293 320 inline Uns32T H::computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){
mas01mc@292 321 LongUns64T h = 0;
mas01mc@292 322
mas01mc@292 323 for(IntT i = 0; i < size; i++){
mas01mc@292 324 h = h + (LongUns64T)a[i] * (LongUns64T)b[i];
mas01mc@292 325 h = (h & TWO_TO_32_MINUS_1) + 5 * (h >> 32);
mas01mc@292 326 if (h >= UH_PRIME_DEFAULT) {
mas01mc@292 327 h = h - UH_PRIME_DEFAULT;
mas01mc@292 328 }
mas01mc@292 329 CR_ASSERT(h < UH_PRIME_DEFAULT);
mas01mc@292 330 }
mas01mc@292 331 return h;
mas01mc@292 332 }
mas01mc@292 333
mas01mc@292 334 Uns32T H::bucket_insert_point(bucket **pp){
mas01mc@296 335 collisionCount = 0;
mas01mc@292 336 if(!*pp){
mas01mc@292 337 *pp = new bucket();
mas01mc@292 338 #ifdef LSH_BLOCK_FULL_ROWS
mas01mc@292 339 (*pp)->t2 = 0; // Use t2 as a collision counter for the row
mas01mc@292 340 (*pp)->next = new bucket();
mas01mc@292 341 #endif
mas01mc@292 342 }
mas01mc@292 343 #ifdef LSH_BLOCK_FULL_ROWS
mas01mc@292 344 collisionCount = (*pp)->t2;
mas01mc@292 345 if(collisionCount < H::C){ // Block if row is full
mas01mc@292 346 (*pp)->t2++; // Increment collision counter
mas01mc@292 347 pointCount++;
mas01mc@292 348 collisionCount++;
mas01mc@292 349 __bucket_insert_point((*pp)->next); // First bucket holds collision count
mas01mc@292 350 }
mas01mc@292 351 #else
mas01mc@296 352 pointCount++;
mas01mc@296 353 __bucket_insert_point(*pp); // No collision count storage
mas01mc@292 354 #endif
mas01mc@292 355 return collisionCount;
mas01mc@292 356 }
mas01mc@292 357
mas01mc@292 358 void H::__bucket_insert_point(bucket* p){
mas01mc@292 359 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2?
mas01mc@292 360 p->t2 = H::t2;
mas01mc@292 361 bucketCount++; // Record start of new point-locale collision chain
mas01mc@292 362 p->snext = new sbucket();
mas01mc@292 363 __sbucket_insert_point(p->snext);
mas01mc@292 364 return;
mas01mc@292 365 }
mas01mc@292 366
mas01mc@292 367 if(p->t2 == H::t2){
mas01mc@292 368 __sbucket_insert_point(p->snext);
mas01mc@292 369 return;
mas01mc@292 370 }
mas01mc@292 371
mas01mc@292 372 if(p->next){
mas01mc@292 373 __bucket_insert_point(p->next);
mas01mc@292 374 }
mas01mc@292 375
mas01mc@292 376 else{
mas01mc@292 377 p->next = new bucket();
mas01mc@292 378 __bucket_insert_point(p->next);
mas01mc@292 379 }
mas01mc@292 380
mas01mc@292 381 }
mas01mc@292 382
mas01mc@292 383 void H::__sbucket_insert_point(sbucket* p){
mas01mc@292 384 if(p->pointID==IFLAG){
mas01mc@292 385 p->pointID = H::p;
mas01mc@292 386 return;
mas01mc@292 387 }
mas01mc@292 388
mas01mc@292 389 // Search for pointID
mas01mc@292 390 if(p->snext){
mas01mc@292 391 __sbucket_insert_point(p->snext);
mas01mc@292 392 }
mas01mc@292 393 else{
mas01mc@292 394 // Make new point collision bucket at end of list
mas01mc@292 395 p->snext = new sbucket();
mas01mc@292 396 __sbucket_insert_point(p->snext);
mas01mc@292 397 }
mas01mc@292 398 }
mas01mc@292 399
mas01mc@293 400 inline bucket** H::get_bucket(int j){
mas01mc@292 401 return *(h+j);
mas01mc@292 402 }
mas01mc@292 403
mas01mc@293 404 // Interface to Locality Sensitive Hashing G
mas01mc@293 405 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr):
mas01mc@293 406 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures
mas01mc@308 407 indexName(0),
mas01mc@293 408 lshHeader(0),
mas01mc@292 409 calling_instance(0),
mas01mc@293 410 add_point_callback(0)
mas01mc@292 411 {
mas01mc@293 412
mas01mc@292 413 }
mas01mc@292 414
mas01mc@292 415 // Serialize from file LSH constructor
mas01mc@292 416 // Read parameters from database file
mas01mc@292 417 // Load the hash functions, close the database
mas01mc@292 418 // Optionally load the LSH tables into head-allocated lists in core
mas01mc@292 419 G::G(char* filename, bool lshInCoreFlag):
mas01mc@293 420 H(), // default base-class constructor call delays data-structure initialization
mas01mc@309 421 indexName(0),
mas01mc@293 422 lshHeader(0),
mas01mc@292 423 calling_instance(0),
mas01mc@292 424 add_point_callback(0)
mas01mc@292 425 {
mas01mc@292 426 int dbfid = unserialize_lsh_header(filename);
mas01mc@309 427 indexName = new char[O2_INDEX_MAXSTR];
mas01mc@309 428 strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER
mas01mc@293 429 H::initialize_lsh_functions(); // Base-class data-structure initialization
mas01mc@293 430 unserialize_lsh_functions(dbfid); // populate with on-disk hashfunction values
mas01mc@292 431
mas01mc@292 432 // Format1 only needs unserializing if specifically requested
mas01mc@292 433 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2) && lshInCoreFlag){
mas01mc@292 434 unserialize_lsh_hashtables_format1(dbfid);
mas01mc@292 435 }
mas01mc@292 436
mas01mc@292 437 // Format2 always needs unserializing
mas01mc@292 438 if(lshHeader->flags&O2_SERIAL_FILEFORMAT2 && lshInCoreFlag){
mas01mc@336 439 FILE* dbFile = fdopen(dbfid, "rb");
mas01mc@336 440 if(!dbFile)
mas01mc@336 441 error("Cannot open LSH file for reading", filename);
mas01mc@336 442 unserialize_lsh_hashtables_format2(dbFile);
mas01mc@292 443 }
mas01mc@336 444 serial_close(dbfid);
mas01mc@336 445 }
mas01mc@292 446
mas01mc@292 447 G::~G(){
mas01mc@292 448 delete lshHeader;
mas01mc@292 449 }
mas01mc@292 450
mas01mc@292 451 // single point insertion; inserted values are hash value and pointID
mas01mc@292 452 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
mas01mc@292 453 Uns32T collisionCount = 0;
mas01mc@292 454 H::p = pp;
mas01mc@299 455 if(H::maxp && pp<=H::maxp)
mas01mc@296 456 error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
mas01mc@296 457 H::maxp=pp; // Store highest pointID in database
mas01mc@293 458 H::compute_hash_functions( v );
mas01mc@292 459 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
mas01mc@293 460 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
mas01mc@292 461 collisionCount += bucket_insert_point( *(h + j) + t1 );
mas01mc@292 462 }
mas01mc@292 463 return collisionCount;
mas01mc@292 464 }
mas01mc@292 465
mas01mc@292 466
mas01mc@292 467 // batch insert for a point set
mas01mc@292 468 // inserted values are vector hash value and pointID starting at basePointID
mas01mc@292 469 void G::insert_point_set(vector<vector<float> >& vv, Uns32T basePointID){
mas01mc@292 470 for(Uns32T point=0; point<vv.size(); point++)
mas01mc@292 471 insert_point(vv[point], basePointID+point);
mas01mc@292 472 }
mas01mc@292 473
mas01mc@292 474 // point retrieval routine
mas01mc@292 475 void G::retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 476 calling_instance = caller;
mas01mc@292 477 add_point_callback = add_point;
mas01mc@293 478 H::compute_hash_functions( v );
mas01mc@292 479 for(Uns32T j = 0 ; j < H::L ; j++ ){
mas01mc@293 480 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
mas01mc@293 481 if( bucket* bPtr = *(get_bucket(j) + get_t1()) )
mas01mc@292 482 #ifdef LSH_BLOCK_FULL_ROWS
mas01mc@292 483 bucket_chain_point( bPtr->next, qpos);
mas01mc@292 484 #else
mas01mc@292 485 bucket_chain_point( bPtr , qpos);
mas01mc@292 486 #endif
mas01mc@292 487 }
mas01mc@292 488 }
mas01mc@292 489
mas01mc@292 490 void G::retrieve_point_set(vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 491 for(Uns32T qpos = 0 ; qpos < vv.size() ; qpos++ )
mas01mc@292 492 retrieve_point(vv[qpos], qpos, add_point, caller);
mas01mc@292 493 }
mas01mc@292 494
mas01mc@292 495 // export lsh tables to table structure on disk
mas01mc@292 496 //
mas01mc@292 497 // LSH TABLE STRUCTURE
mas01mc@292 498 // ---header 64 bytes ---
mas01mc@292 499 // [magic #tables #rows #cols elementSize databaseSize version flags dim #funs 0 0 0 0 0 0]
mas01mc@292 500 //
mas01mc@292 501 // ---random projections L x k x d float ---
mas01mc@292 502 // A[0][0][0] A[0][0][1] ... A[0][0][d-1]
mas01mc@292 503 // A[0][1][0] A[0][1][1] ... A[1][1][d-1]
mas01mc@292 504 // ...
mas01mc@292 505 // A[0][K-1][0] A[0][1][1] ... A[0][k-1][d-1]
mas01mc@292 506 // ...
mas01mc@292 507 // ...
mas01mc@292 508 // A[L-1][0][0] A[M-1][0][1] ... A[L-1][0][d-1]
mas01mc@292 509 // A[L-1][1][0] A[M-1][1][1] ... A[L-1][1][d-1]
mas01mc@292 510 // ...
mas01mc@292 511 // A[L-1][k-1][0] A[M-1][1][1] ... A[L-1][k-1][d-1]
mas01mc@292 512 //
mas01mc@292 513 // ---bias L x k float ---
mas01mc@292 514 // b[0][0] b[0][1] ... b[0][k-1]
mas01mc@292 515 // b[1][0] b[1][1] ... b[1][k-1]
mas01mc@292 516 // ...
mas01mc@292 517 // b[L-1][0] b[L-1][1] ... b[L-1][k-1]
mas01mc@292 518 //
mas01mc@292 519 // ---random r1 L x k float ---
mas01mc@292 520 // r1[0][0] r1[0][1] ... r1[0][k-1]
mas01mc@292 521 // r1[1][0] r1[1][1] ... r1[1][k-1]
mas01mc@292 522 // ...
mas01mc@292 523 // r1[L-1][0] r1[L-1][1] ... r1[L-1][k-1]
mas01mc@292 524 //
mas01mc@292 525 // ---random r2 L x k float ---
mas01mc@292 526 // r2[0][0] r2[0][1] ... r2[0][k-1]
mas01mc@292 527 // r2[1][0] r2[1][1] ... r2[1][k-1]
mas01mc@292 528 // ...
mas01mc@292 529 // r2[L-1][0] r2[L-1][1] ... r2[L-1][k-1]
mas01mc@292 530 //
mas01mc@293 531 // ******* HASHTABLES FORMAT1 (optimized for LSH_ON_DISK retrieval) *******
mas01mc@292 532 // ---hash table 0: N x C x 8 ---
mas01mc@292 533 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 534 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 535 // ...
mas01mc@292 536 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 537 //
mas01mc@292 538 // ---hash table 1: N x C x 8 ---
mas01mc@292 539 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 540 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 541 // ...
mas01mc@292 542 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 543 //
mas01mc@292 544 // ...
mas01mc@292 545 //
mas01mc@292 546 // ---hash table L-1: N x C x 8 ---
mas01mc@292 547 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 548 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 549 // ...
mas01mc@292 550 // [t2 pointID][t2 pointID]...[t2 pointID]
mas01mc@292 551 //
mas01mc@293 552 // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) *******
mas01mc@293 553 //
mas01mc@293 554 // State machine controlled by regular expression.
mas01mc@293 555 // legend:
mas01mc@293 556 //
mas01mc@306 557 // O2_SERIAL_TOKEN_T1 = 0xFFFFFFFCU
mas01mc@306 558 // O2_SERIAL_TOKEN_T2 = 0xFFFFFFFDU
mas01mc@306 559 // O2_SERIAL_TOKEN_ENDTABLE = 0xFFFFFFFEU
mas01mc@293 560 //
mas01mc@306 561 // T1 - T1 hash token
mas01mc@306 562 // t1 - t1 hash key (t1 range 0..2^29-1)
mas01mc@306 563 // T2 - T2 token
mas01mc@293 564 // t2 - t2 hash key (range 1..2^32-6)
mas01mc@293 565 // p - point identifier (range 0..2^32-1)
mas01mc@306 566 // E - end hash table token
mas01mc@293 567 // {...} required arguments
mas01mc@293 568 // [...] optional arguments
mas01mc@293 569 // * - match zero or more occurences
mas01mc@293 570 // + - match one or more occurences
mas01mc@293 571 // {...}^L - repeat argument L times
mas01mc@293 572 //
mas01mc@293 573 // FORMAT2 Regular expression:
mas01mc@306 574 // { [T1 t1 [T2 t2 p+]+ ]* E }^L
mas01mc@293 575 //
mas01mc@292 576
mas01mc@292 577 // Serial header constructors
mas01mc@292 578 SerialHeader::SerialHeader(){;}
mas01mc@296 579 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
mas01mc@292 580 lshMagic(O2_SERIAL_MAGIC),
mas01mc@292 581 binWidth(W),
mas01mc@292 582 numTables(L),
mas01mc@292 583 numRows(N),
mas01mc@292 584 numCols(C),
mas01mc@292 585 elementSize(O2_SERIAL_ELEMENT_SIZE),
mas01mc@296 586 version(O2_SERIAL_VERSION),
mas01mc@296 587 size(0), // we are deprecating this value
mas01mc@292 588 flags(FMT),
mas01mc@292 589 dataDim(d),
mas01mc@292 590 numFuns(k),
mas01mc@292 591 radius(r),
mas01mc@296 592 maxp(p),
mas01mc@296 593 size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables
mas01mc@296 594 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
mas01mc@296 595 (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
mas01mc@296 596 pointCount(pc){
mas01mc@296 597
mas01mc@296 598 if(FMT==O2_SERIAL_FILEFORMAT2)
mas01mc@296 599 size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE
mas01mc@296 600 + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T)
mas01mc@296 601 +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
mas01mc@296 602 } // header
mas01mc@292 603
mas01mc@292 604 float* G::get_serial_hashfunction_base(char* db){
mas01mc@292 605 if(db&&lshHeader)
mas01mc@292 606 return (float*)(db+O2_SERIAL_HEADER_SIZE);
mas01mc@292 607 else return NULL;
mas01mc@292 608 }
mas01mc@292 609
mas01mc@292 610 SerialElementT* G::get_serial_hashtable_base(char* db){
mas01mc@292 611 if(db&&lshHeader)
mas01mc@292 612 return (SerialElementT*)(db+get_serial_hashtable_offset());
mas01mc@292 613 else
mas01mc@292 614 return NULL;
mas01mc@292 615 }
mas01mc@292 616
mas01mc@292 617 Uns32T G::get_serial_hashtable_offset(){
mas01mc@292 618 if(lshHeader)
mas01mc@292 619 return align_up(O2_SERIAL_HEADER_SIZE +
mas01mc@292 620 L*lshHeader->numFuns*( sizeof(float)*lshHeader->dataDim+2*sizeof(Uns32T)+sizeof(float)),get_page_logn());
mas01mc@292 621 else
mas01mc@292 622 return 0;
mas01mc@292 623 }
mas01mc@292 624
mas01mc@292 625 void G::serialize(char* filename, Uns32T serialFormat){
mas01mc@292 626 int dbfid;
mas01mc@292 627 char* db;
mas01mc@292 628 int dbIsNew=0;
mas01mc@292 629
mas01mc@292 630 // Check requested serialFormat
mas01mc@292 631 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
mas01mc@292 632 error("Unrecognized serial file format request: ", "serialize()");
mas01mc@296 633
mas01mc@292 634 // Test to see if file exists
mas01mc@292 635 if((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 636 // If it doesn't, then create the file (CREATE)
mas01mc@292 637 if(errno == ENOENT){
mas01mc@292 638 // Create the file
mas01mc@292 639 std::cout << "Creating new serialized LSH database:" << filename << "...";
mas01mc@292 640 std::cout.flush();
mas01mc@292 641 serial_create(filename, serialFormat);
mas01mc@292 642 dbIsNew=1;
mas01mc@292 643 }
mas01mc@292 644 else
mas01mc@292 645 // The file can't be opened
mas01mc@292 646 error("Can't open the file", filename, "open");
mas01mc@292 647
mas01mc@292 648 // Load the on-disk header into core
mas01mc@292 649 dbfid = serial_open(filename, 1); // open for write
mas01mc@292 650 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
mas01mc@292 651 serial_get_header(db); // read header
mas01mc@292 652 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@292 653
mas01mc@292 654 // Check compatibility of core and disk data structures
mas01mc@292 655 if( !serial_can_merge(serialFormat) )
mas01mc@292 656 error("Incompatible core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 657
mas01mc@292 658 // For new LSH databases write the hashfunctions
mas01mc@292 659 if(dbIsNew)
mas01mc@292 660 serialize_lsh_hashfunctions(dbfid);
mas01mc@292 661 // Write the hashtables in the requested format
mas01mc@292 662 if(serialFormat == O2_SERIAL_FILEFORMAT1)
mas01mc@292 663 serialize_lsh_hashtables_format1(dbfid, !dbIsNew);
mas01mc@336 664 else{
mas01mc@336 665 FILE* dbFile = fdopen(dbfid, "r+b");
mas01mc@336 666 if(!dbFile)
mas01mc@336 667 error("Cannot open LSH file for writing",filename);
mas01mc@336 668 serialize_lsh_hashtables_format2(dbFile, !dbIsNew);
mas01mc@336 669 fflush(dbFile);
mas01mc@336 670 }
mas01mc@292 671
mas01mc@336 672 if(!dbIsNew) {
mas01mc@292 673 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
mas01mc@292 674 //serial_get_header(db); // read header
mas01mc@293 675 cout << "maxp = " << H::maxp << endl;
mas01mc@293 676 lshHeader->maxp=H::maxp;
mas01mc@292 677 // Default to FILEFORMAT1
mas01mc@292 678 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2))
mas01mc@294 679 lshHeader->flags|=O2_SERIAL_FILEFORMAT1;
mas01mc@292 680 memcpy((char*)db, (char*)lshHeader, sizeof(SerialHeaderT));
mas01mc@292 681 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@336 682 }
mas01mc@336 683 serial_close(dbfid);
mas01mc@292 684 }
mas01mc@292 685
mas01mc@292 686 // Test to see if core structure and requested format is
mas01mc@292 687 // compatible with currently opened database
mas01mc@292 688 int G::serial_can_merge(Uns32T format){
mas01mc@292 689 SerialHeaderT* that = lshHeader;
mas01mc@292 690 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2)
mas01mc@292 691 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
mas01mc@292 692 || !( this->w == that->binWidth &&
mas01mc@296 693 this->L == that->numTables &&
mas01mc@296 694 this->N == that->numRows &&
mas01mc@296 695 this->k == that->numFuns &&
mas01mc@296 696 this->d == that->dataDim &&
mas01mc@296 697 sizeof(SerialElementT) == that->elementSize &&
mas01mc@296 698 this->radius == that->radius)){
mas01mc@292 699 serial_print_header(format);
mas01mc@292 700 return 0;
mas01mc@292 701 }
mas01mc@292 702 else
mas01mc@292 703 return 1;
mas01mc@292 704 }
mas01mc@292 705
mas01mc@292 706 // Used as an error message for serial_can_merge()
mas01mc@292 707 void G::serial_print_header(Uns32T format){
mas01mc@292 708 std::cout << "Fc:" << format << " Fs:" << lshHeader->flags << endl;
mas01mc@292 709 std::cout << "Wc:" << w << " Ls:" << lshHeader->binWidth << endl;
mas01mc@292 710 std::cout << "Lc:" << L << " Ls:" << lshHeader->numTables << endl;
mas01mc@292 711 std::cout << "Nc:" << N << " Ns:" << lshHeader->numRows << endl;
mas01mc@292 712 std::cout << "kc:" << k << " ks:" << lshHeader->numFuns << endl;
mas01mc@292 713 std::cout << "dc:" << d << " ds:" << lshHeader->dataDim << endl;
mas01mc@292 714 std::cout << "sc:" << sizeof(SerialElementT) << " ss:" << lshHeader->elementSize << endl;
mas01mc@292 715 std::cout << "rc:" << this->radius << " rs:" << lshHeader->radius << endl;
mas01mc@292 716 }
mas01mc@292 717
mas01mc@292 718 int G::serialize_lsh_hashfunctions(int fid){
mas01mc@292 719 float* pf;
mas01mc@292 720 Uns32T *pu;
mas01mc@292 721 Uns32T x,y,z;
mas01mc@292 722
mas01mc@293 723 char* db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer
mas01mc@292 724 pf = get_serial_hashfunction_base(db);
mas01mc@292 725
mas01mc@292 726 // HASH FUNCTIONS
mas01mc@292 727 // Write the random projectors A[][][]
mas01mc@292 728 #ifdef USE_U_FUNCTIONS
mas01mc@292 729 for( x = 0 ; x < H::m ; x++ )
mas01mc@292 730 for( y = 0 ; y < H::k/2 ; y++ )
mas01mc@292 731 #else
mas01mc@292 732 for( x = 0 ; x < H::L ; x++ )
mas01mc@292 733 for( y = 0 ; y < H::k ; y++ )
mas01mc@292 734 #endif
mas01mc@292 735 for( z = 0 ; z < d ; z++ )
mas01mc@293 736 *pf++ = H::A[x][y][z];
mas01mc@292 737
mas01mc@292 738 // Write the random biases b[][]
mas01mc@292 739 #ifdef USE_U_FUNCTIONS
mas01mc@292 740 for( x = 0 ; x < H::m ; x++ )
mas01mc@292 741 for( y = 0 ; y < H::k/2 ; y++ )
mas01mc@292 742 #else
mas01mc@292 743 for( x = 0 ; x < H::L ; x++ )
mas01mc@292 744 for( y = 0 ; y < H::k ; y++ )
mas01mc@292 745 #endif
mas01mc@293 746 *pf++ = H::b[x][y];
mas01mc@292 747
mas01mc@292 748 pu = (Uns32T*)pf;
mas01mc@292 749
mas01mc@292 750 // Write the Z projectors r1[][]
mas01mc@292 751 for( x = 0 ; x < H::L ; x++)
mas01mc@292 752 for( y = 0 ; y < H::k ; y++)
mas01mc@293 753 *pu++ = H::r1[x][y];
mas01mc@292 754
mas01mc@292 755 // Write the Z projectors r2[][]
mas01mc@292 756 for( x = 0 ; x < H::L ; x++)
mas01mc@292 757 for( y = 0; y < H::k ; y++)
mas01mc@293 758 *pu++ = H::r2[x][y];
mas01mc@292 759
mas01mc@292 760 serial_munmap(db, get_serial_hashtable_offset());
mas01mc@292 761 return 1;
mas01mc@292 762 }
mas01mc@292 763
mas01mc@292 764 int G::serialize_lsh_hashtables_format1(int fid, int merge){
mas01mc@292 765 SerialElementT *pe, *pt;
mas01mc@292 766 Uns32T x,y;
mas01mc@292 767
mas01mc@292 768 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT1) )
mas01mc@292 769 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 770
mas01mc@292 771 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 772 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount;
mas01mc@292 773 // Write the hash tables
mas01mc@292 774 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 775 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT1...";
mas01mc@292 776 std::cout.flush();
mas01mc@292 777 // memory map a single hash table for sequential access
mas01mc@292 778 // Align each hash table to page boundary
mas01mc@292 779 char* dbtable = serial_mmap(fid, hashTableSize, 1,
mas01mc@292 780 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
mas01mc@324 781 #ifdef __CYGWIN__
mas01mc@324 782 // No madvise in CYGWIN
mas01mc@324 783 #else
mas01mc@292 784 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 785 error("could not advise hashtable memory","","madvise");
mas01mc@324 786 #endif
mas01mc@292 787 maxColCount=0;
mas01mc@292 788 minColCount=O2_SERIAL_MAX_COLS;
mas01mc@292 789 meanColCount=0;
mas01mc@292 790 colCountN=0;
mas01mc@292 791 pt=(SerialElementT*)dbtable;
mas01mc@292 792 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 793 // Move disk pointer to beginning of row
mas01mc@292 794 pe=pt+y*lshHeader->numCols;
mas01mc@292 795
mas01mc@292 796 colCount=0;
mas01mc@292 797 if(bucket* bPtr = h[x][y])
mas01mc@292 798 if(merge)
mas01mc@292 799 #ifdef LSH_BLOCK_FULL_ROWS
mas01mc@292 800 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 801 else
mas01mc@292 802 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 803 #else
mas01mc@292 804 serial_merge_hashtable_row_format1(pe, bPtr, colCount);
mas01mc@292 805 else
mas01mc@292 806 serial_write_hashtable_row_format1(pe, bPtr, colCount);
mas01mc@292 807 #endif
mas01mc@292 808 if(colCount){
mas01mc@292 809 if(colCount<minColCount)
mas01mc@292 810 minColCount=colCount;
mas01mc@292 811 if(colCount>maxColCount)
mas01mc@292 812 maxColCount=colCount;
mas01mc@292 813 meanColCount+=colCount;
mas01mc@292 814 colCountN++;
mas01mc@292 815 }
mas01mc@292 816 }
mas01mc@292 817 if(colCountN)
mas01mc@292 818 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
mas01mc@292 819 << ", min = " << minColCount << ", max = " << maxColCount
mas01mc@292 820 << endl;
mas01mc@292 821 serial_munmap(dbtable, hashTableSize);
mas01mc@292 822 }
mas01mc@292 823
mas01mc@292 824 // We're done writing
mas01mc@292 825 return 1;
mas01mc@292 826 }
mas01mc@292 827
mas01mc@292 828 void G::serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* b, Uns32T& colCount){
mas01mc@292 829 while(b && b->t2!=IFLAG){
mas01mc@292 830 SerialElementT*pe=pr; // reset disk pointer to beginning of row
mas01mc@292 831 serial_merge_element_format1(pe, b->snext, b->t2, colCount);
mas01mc@292 832 b=b->next;
mas01mc@292 833 }
mas01mc@292 834 }
mas01mc@292 835
mas01mc@292 836 void G::serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
mas01mc@292 837 while(sb){
mas01mc@292 838 if(colCount==lshHeader->numCols){
mas01mc@292 839 std::cout << "!point-chain full " << endl;
mas01mc@292 840 return;
mas01mc@292 841 }
mas01mc@292 842 Uns32T c=0;
mas01mc@292 843 // Merge collision chains
mas01mc@292 844 while(c<lshHeader->numCols){
mas01mc@292 845 if( (pe+c)->hashValue==IFLAG){
mas01mc@292 846 (pe+c)->hashValue=t2;
mas01mc@292 847 (pe+c)->pointID=sb->pointID;
mas01mc@292 848 colCount=c+1;
mas01mc@292 849 if(c+1<lshHeader->numCols)
mas01mc@292 850 (pe+c+1)->hashValue=IFLAG;
mas01mc@292 851 break;
mas01mc@292 852 }
mas01mc@292 853 c++;
mas01mc@292 854 }
mas01mc@292 855 sb=sb->snext;
mas01mc@292 856 }
mas01mc@292 857 return;
mas01mc@292 858 }
mas01mc@292 859
mas01mc@292 860 void G::serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* b, Uns32T& colCount){
mas01mc@292 861 pe->hashValue=IFLAG;
mas01mc@292 862 while(b && b->t2!=IFLAG){
mas01mc@292 863 serial_write_element_format1(pe, b->snext, b->t2, colCount);
mas01mc@292 864 b=b->next;
mas01mc@292 865 }
mas01mc@292 866 }
mas01mc@292 867
mas01mc@292 868 void G::serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
mas01mc@292 869 while(sb){
mas01mc@292 870 if(colCount==lshHeader->numCols){
mas01mc@292 871 std::cout << "!point-chain full " << endl;
mas01mc@292 872 return;
mas01mc@292 873 }
mas01mc@292 874 pe->hashValue=t2;
mas01mc@292 875 pe->pointID=sb->pointID;
mas01mc@292 876 pe++;
mas01mc@292 877 colCount++;
mas01mc@292 878 sb=sb->snext;
mas01mc@292 879 }
mas01mc@292 880 pe->hashValue=IFLAG;
mas01mc@292 881 return;
mas01mc@292 882 }
mas01mc@292 883
mas01mc@336 884 int G::serialize_lsh_hashtables_format2(FILE* dbFile, int merge){
mas01mc@292 885 Uns32T x,y;
mas01mc@292 886
mas01mc@292 887 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
mas01mc@292 888 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
mas01mc@292 889
mas01mc@306 890 // We must pereform FORMAT2 merges in core
mas01mc@292 891 if(merge)
mas01mc@336 892 unserialize_lsh_hashtables_format2(dbFile);
mas01mc@292 893
mas01mc@292 894 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
mas01mc@336 895 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
mas01mc@336 896 fclose(dbFile);
mas01mc@336 897 error("fSeek error in serialize_lsh_hashtables_format2");
mas01mc@336 898 }
mas01mc@292 899
mas01mc@292 900 // Write the hash tables
mas01mc@292 901 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 902 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT2...";
mas01mc@292 903 std::cout.flush();
mas01mc@292 904 maxColCount=0;
mas01mc@292 905 minColCount=O2_SERIAL_MAX_COLS;
mas01mc@292 906 meanColCount=0;
mas01mc@292 907 colCountN=0;
mas01mc@292 908 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 909 colCount=0;
mas01mc@292 910 if(bucket* bPtr = h[x][y]){
mas01mc@306 911 // Check for empty row (even though row was allocated)
mas01mc@306 912 #ifdef LSH_BLOCK_FULL_ROWS
mas01mc@306 913 if(bPtr->next->t2==IFLAG){
mas01mc@336 914 fclose(dbFile);
mas01mc@306 915 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
mas01mc@306 916 }
mas01mc@306 917 #else
mas01mc@306 918 if(bPtr->t2==IFLAG){
mas01mc@336 919 fclose(dbFile);
mas01mc@306 920 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
mas01mc@306 921 }
mas01mc@306 922 #endif
mas01mc@306 923 t1 = O2_SERIAL_TOKEN_T1;
mas01mc@336 924 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 925 fclose(dbFile);
mas01mc@306 926 error("write error in serial_write_hashtable_format2() [T1]");
mas01mc@306 927 }
mas01mc@306 928 t1 = y;
mas01mc@336 929 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 930 fclose(dbFile);
mas01mc@292 931 error("write error in serial_write_hashtable_format2() [t1]");
mas01mc@292 932 }
mas01mc@292 933 #ifdef LSH_BLOCK_FULL_ROWS
mas01mc@336 934 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket
mas01mc@292 935 #else
mas01mc@336 936 serial_write_hashtable_row_format2(dbFile, bPtr, colCount);
mas01mc@292 937 #endif
mas01mc@292 938 }
mas01mc@292 939 if(colCount){
mas01mc@292 940 if(colCount<minColCount)
mas01mc@292 941 minColCount=colCount;
mas01mc@292 942 if(colCount>maxColCount)
mas01mc@292 943 maxColCount=colCount;
mas01mc@292 944 meanColCount+=colCount;
mas01mc@292 945 colCountN++;
mas01mc@292 946 }
mas01mc@292 947 }
mas01mc@292 948 // Write END of table marker
mas01mc@306 949 t1 = O2_SERIAL_TOKEN_ENDTABLE;
mas01mc@336 950 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile ) != 1 ){
mas01mc@336 951 fclose(dbFile);
mas01mc@292 952 error("write error in serial_write_hashtable_format2() [end]");
mas01mc@292 953 }
mas01mc@292 954
mas01mc@292 955 if(colCountN)
mas01mc@292 956 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
mas01mc@292 957 << ", min = " << minColCount << ", max = " << maxColCount
mas01mc@292 958 << endl;
mas01mc@292 959 }
mas01mc@292 960
mas01mc@292 961 // We're done writing
mas01mc@292 962 return 1;
mas01mc@292 963 }
mas01mc@292 964
mas01mc@336 965 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
mas01mc@292 966 while(b && b->t2!=IFLAG){
mas01mc@306 967 if(!b->snext){
mas01mc@336 968 fclose(dbFile);
mas01mc@306 969 error("Empty collision chain in serial_write_hashtable_row_format2()");
mas01mc@306 970 }
mas01mc@306 971 t2 = O2_SERIAL_TOKEN_T2;
mas01mc@336 972 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 973 fclose(dbFile);
mas01mc@292 974 error("write error in serial_write_hashtable_row_format2()");
mas01mc@292 975 }
mas01mc@292 976 t2 = b->t2;
mas01mc@336 977 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
mas01mc@336 978 fclose(dbFile);
mas01mc@292 979 error("write error in serial_write_hashtable_row_format2()");
mas01mc@292 980 }
mas01mc@336 981 serial_write_element_format2(dbFile, b->snext, colCount);
mas01mc@292 982 b=b->next;
mas01mc@292 983 }
mas01mc@292 984 }
mas01mc@292 985
mas01mc@336 986 void G::serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount){
mas01mc@292 987 while(sb){
mas01mc@336 988 if(fwrite(&sb->pointID, sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 989 fclose(dbFile);
mas01mc@292 990 error("Write error in serial_write_element_format2()");
mas01mc@292 991 }
mas01mc@292 992 colCount++;
mas01mc@292 993 sb=sb->snext;
mas01mc@292 994 }
mas01mc@292 995 }
mas01mc@292 996
mas01mc@292 997
mas01mc@292 998 int G::serial_create(char* filename, Uns32T FMT){
mas01mc@292 999 return serial_create(filename, w, L, N, C, k, d, FMT);
mas01mc@292 1000 }
mas01mc@292 1001
mas01mc@292 1002
mas01mc@292 1003 int G::serial_create(char* filename, float binWidth, Uns32T numTables, Uns32T numRows, Uns32T numCols,
mas01mc@292 1004 Uns32T numFuns, Uns32T dim, Uns32T FMT){
mas01mc@292 1005
mas01mc@292 1006 if(numTables > O2_SERIAL_MAX_TABLES || numRows > O2_SERIAL_MAX_ROWS
mas01mc@292 1007 || numCols > O2_SERIAL_MAX_COLS || numFuns > O2_SERIAL_MAX_FUNS
mas01mc@292 1008 || dim>O2_SERIAL_MAX_DIM){
mas01mc@292 1009 error("LSH parameters out of bounds for serialization");
mas01mc@292 1010 }
mas01mc@292 1011
mas01mc@292 1012 int dbfid;
mas01mc@292 1013 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0)
mas01mc@292 1014 error("Can't create serial file", filename, "open");
mas01mc@292 1015 get_lock(dbfid, 1);
mas01mc@292 1016
mas01mc@292 1017 // Make header first to get size of serialized database
mas01mc@296 1018 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);
mas01mc@296 1019
mas01mc@296 1020 cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
mas01mc@296 1021 if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
mas01mc@296 1022 error("Maximum size of LSH file exceded: > 4000MB");
mas01mc@296 1023
mas01mc@292 1024 // go to the location corresponding to the last byte
mas01mc@292 1025 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
mas01mc@292 1026 error("lseek error in db file", "", "lseek");
mas01mc@292 1027
mas01mc@292 1028 // write a dummy byte at the last location
mas01mc@292 1029 if (write (dbfid, "", 1) != 1)
mas01mc@292 1030 error("write error", "", "write");
mas01mc@292 1031
mas01mc@293 1032 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
mas01mc@296 1033
mas01mc@292 1034 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
mas01mc@296 1035
mas01mc@292 1036 serial_munmap(db, O2_SERIAL_HEADER_SIZE);
mas01mc@296 1037
mas01mc@292 1038 close(dbfid);
mas01mc@292 1039
mas01mc@292 1040 std::cout << "done initializing tables." << endl;
mas01mc@292 1041
mas01mc@292 1042 return 1;
mas01mc@292 1043 }
mas01mc@292 1044
mas01mc@292 1045 char* G::serial_mmap(int dbfid, Uns32T memSize, Uns32T forWrite, off_t offset){
mas01mc@293 1046 char* db;
mas01mc@292 1047 if(forWrite){
mas01mc@292 1048 if ((db = (char*) mmap(0, memSize, PROT_READ | PROT_WRITE,
mas01mc@292 1049 MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
mas01mc@292 1050 error("mmap error in request for writable serialized database", "", "mmap");
mas01mc@292 1051 }
mas01mc@292 1052 else if ((db = (char*) mmap(0, memSize, PROT_READ, MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
mas01mc@292 1053 error("mmap error in read-only serialized database", "", "mmap");
mas01mc@292 1054
mas01mc@292 1055 return db;
mas01mc@292 1056 }
mas01mc@292 1057
mas01mc@292 1058 SerialHeaderT* G::serial_get_header(char* db){
mas01mc@292 1059 lshHeader = new SerialHeaderT();
mas01mc@292 1060 memcpy((char*)lshHeader, db, sizeof(SerialHeaderT));
mas01mc@292 1061
mas01mc@292 1062 if(lshHeader->lshMagic!=O2_SERIAL_MAGIC)
mas01mc@292 1063 error("Not an LSH database file");
mas01mc@292 1064
mas01mc@292 1065 return lshHeader;
mas01mc@292 1066 }
mas01mc@292 1067
mas01mc@292 1068 void G::serial_munmap(char* db, Uns32T N){
mas01mc@292 1069 munmap(db, N);
mas01mc@292 1070 }
mas01mc@292 1071
mas01mc@292 1072 int G::serial_open(char* filename, int writeFlag){
mas01mc@292 1073 int dbfid;
mas01mc@292 1074 if(writeFlag){
mas01mc@292 1075 if ((dbfid = open (filename, O_RDWR)) < 0)
mas01mc@292 1076 error("Can't open serial file for read/write", filename, "open");
mas01mc@292 1077 get_lock(dbfid, writeFlag);
mas01mc@292 1078 }
mas01mc@292 1079 else{
mas01mc@292 1080 if ((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 1081 error("Can't open serial file for read", filename, "open");
mas01mc@292 1082 get_lock(dbfid, 0);
mas01mc@292 1083 }
mas01mc@292 1084
mas01mc@292 1085 return dbfid;
mas01mc@292 1086 }
mas01mc@292 1087
mas01mc@292 1088 void G::serial_close(int dbfid){
mas01mc@292 1089
mas01mc@292 1090 release_lock(dbfid);
mas01mc@292 1091 close(dbfid);
mas01mc@292 1092 }
mas01mc@292 1093
mas01mc@292 1094 int G::unserialize_lsh_header(char* filename){
mas01mc@292 1095
mas01mc@292 1096 int dbfid;
mas01mc@292 1097 char* db;
mas01mc@292 1098 // Test to see if file exists
mas01mc@292 1099 if((dbfid = open (filename, O_RDONLY)) < 0)
mas01mc@292 1100 error("Can't open the file", filename, "open");
mas01mc@292 1101 close(dbfid);
mas01mc@292 1102 dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1103 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1104 serial_get_header(db); // read header
mas01mc@292 1105 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
mas01mc@292 1106
mas01mc@292 1107 // Unserialize header parameters
mas01mc@292 1108 H::L = lshHeader->numTables;
mas01mc@292 1109 H::m = (Uns32T)( (1.0 + sqrt(1 + 8.0*(int)H::L)) / 2.0);
mas01mc@292 1110 H::N = lshHeader->numRows;
mas01mc@292 1111 H::C = lshHeader->numCols;
mas01mc@292 1112 H::k = lshHeader->numFuns;
mas01mc@292 1113 H::d = lshHeader->dataDim;
mas01mc@293 1114 H::w = lshHeader->binWidth;
mas01mc@293 1115 H::radius = lshHeader->radius;
mas01mc@293 1116 H::maxp = lshHeader->maxp;
mas01mc@296 1117 H::pointCount = lshHeader->pointCount;
mas01mc@292 1118
mas01mc@292 1119 return dbfid;
mas01mc@292 1120 }
mas01mc@292 1121
mas01mc@292 1122 // unserialize the LSH parameters
mas01mc@292 1123 // we leave the LSH tree on disk as a flat file
mas01mc@292 1124 // it is this flat file that we search by memory mapping
mas01mc@292 1125 void G::unserialize_lsh_functions(int dbfid){
mas01mc@292 1126 Uns32T j, kk;
mas01mc@292 1127 float* pf;
mas01mc@292 1128 Uns32T* pu;
mas01mc@292 1129
mas01mc@292 1130 // Load the hash functions into core
mas01mc@292 1131 char* db = serial_mmap(dbfid, get_serial_hashtable_offset(), 0);// get database pointer again
mas01mc@292 1132
mas01mc@292 1133 pf = get_serial_hashfunction_base(db);
mas01mc@292 1134
mas01mc@292 1135 #ifdef USE_U_FUNCTIONS
mas01mc@292 1136 for( j = 0 ; j < H::m ; j++ ){ // L functions gj(v)
mas01mc@292 1137 for( kk = 0 ; kk < H::k/2 ; kk++ ){ // Normally distributed hash functions
mas01mc@292 1138 #else
mas01mc@292 1139 for( j = 0 ; j < H::L ; j++ ){ // L functions gj(v)
mas01mc@292 1140 for( kk = 0 ; kk < H::k ; kk++ ){ // Normally distributed hash functions
mas01mc@292 1141 #endif
mas01mc@292 1142 for(Uns32T i = 0 ; i < H::d ; i++ )
mas01mc@293 1143 H::A[j][kk][i] = *pf++; // Normally distributed random vectors
mas01mc@292 1144 }
mas01mc@292 1145 }
mas01mc@292 1146 #ifdef USE_U_FUNCTIONS
mas01mc@292 1147 for( j = 0 ; j < H::m ; j++ ) // biases b
mas01mc@292 1148 for( kk = 0 ; kk < H::k/2 ; kk++ )
mas01mc@292 1149 #else
mas01mc@292 1150 for( j = 0 ; j < H::L ; j++ ) // biases b
mas01mc@292 1151 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1152 #endif
mas01mc@293 1153 H::b[j][kk] = *pf++;
mas01mc@292 1154
mas01mc@292 1155 pu = (Uns32T*)pf;
mas01mc@292 1156 for( j = 0 ; j < H::L ; j++ ) // Z projectors r1
mas01mc@292 1157 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1158 H::r1[j][kk] = *pu++;
mas01mc@292 1159
mas01mc@292 1160 for( j = 0 ; j < H::L ; j++ ) // Z projectors r2
mas01mc@292 1161 for( kk = 0 ; kk < H::k ; kk++ )
mas01mc@292 1162 H::r2[j][kk] = *pu++;
mas01mc@292 1163
mas01mc@293 1164 serial_munmap(db, get_serial_hashtable_offset());
mas01mc@292 1165 }
mas01mc@292 1166
mas01mc@292 1167 void G::unserialize_lsh_hashtables_format1(int fid){
mas01mc@292 1168 SerialElementT *pe, *pt;
mas01mc@292 1169 Uns32T x,y;
mas01mc@292 1170 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1171 // Read the hash tables into core
mas01mc@292 1172 for( x = 0 ; x < H::L ; x++ ){
mas01mc@292 1173 // memory map a single hash table
mas01mc@292 1174 // Align each hash table to page boundary
mas01mc@292 1175 char* dbtable = serial_mmap(fid, hashTableSize, 0,
mas01mc@292 1176 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
mas01mc@324 1177 #ifdef __CYGWIN__
mas01mc@324 1178 // No madvise in CYGWIN
mas01mc@324 1179 #else
mas01mc@292 1180 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 1181 error("could not advise hashtable memory","","madvise");
mas01mc@324 1182 #endif
mas01mc@292 1183 pt=(SerialElementT*)dbtable;
mas01mc@292 1184 for( y = 0 ; y < H::N ; y++ ){
mas01mc@292 1185 // Move disk pointer to beginning of row
mas01mc@292 1186 pe=pt+y*lshHeader->numCols;
mas01mc@292 1187 unserialize_hashtable_row_format1(pe, h[x]+y);
mas01mc@292 1188 #ifdef __LSH_DUMP_CORE_TABLES__
mas01mc@292 1189 printf("S[%d,%d]", x, y);
mas01mc@292 1190 serial_bucket_dump(pe);
mas01mc@292 1191 printf("C[%d,%d]", x, y);
mas01mc@292 1192 dump_hashtable_row(h[x][y]);
mas01mc@292 1193 #endif
mas01mc@292 1194 }
mas01mc@292 1195 serial_munmap(dbtable, hashTableSize);
mas01mc@292 1196 }
mas01mc@292 1197 }
mas01mc@292 1198
mas01mc@292 1199 void G::unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b){
mas01mc@292 1200 Uns32T colCount = 0;
mas01mc@292 1201 while(colCount!=lshHeader->numCols && pe->hashValue !=IFLAG){
mas01mc@292 1202 H::p = pe->pointID; // current point ID
mas01mc@292 1203 t2 = pe->hashValue;
mas01mc@292 1204 bucket_insert_point(b);
mas01mc@292 1205 pe++;
mas01mc@292 1206 colCount++;
mas01mc@292 1207 }
mas01mc@292 1208 }
mas01mc@292 1209
mas01mc@336 1210 void G::unserialize_lsh_hashtables_format2(FILE* dbFile){
mas01mc@292 1211 Uns32T x=0,y=0;
mas01mc@292 1212
mas01mc@292 1213 // Seek to hashtable base offset
mas01mc@336 1214 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
mas01mc@336 1215 fclose(dbFile);
mas01mc@336 1216 error("fSeek error in unserialize_lsh_hashtables_format2");
mas01mc@292 1217 }
mas01mc@292 1218
mas01mc@292 1219 // Read the hash tables into core (structure is given in header)
mas01mc@292 1220 while( x < H::L){
mas01mc@336 1221 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1222 fclose(dbFile);
mas01mc@292 1223 error("Read error","unserialize_lsh_hashtables_format2()");
mas01mc@292 1224 }
mas01mc@306 1225 if(H::t1==O2_SERIAL_TOKEN_ENDTABLE)
mas01mc@292 1226 x++; // End of table
mas01mc@292 1227 else
mas01mc@292 1228 while(y < H::N){
mas01mc@306 1229 // Read a row and move file pointer to beginning of next row or _bittable
mas01mc@306 1230 if(!(H::t1==O2_SERIAL_TOKEN_T1)){
mas01mc@336 1231 fclose(dbFile);
mas01mc@306 1232 error("State matchine error T1","unserialize_lsh_hashtables_format2()");
mas01mc@292 1233 }
mas01mc@336 1234 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1235 fclose(dbFile);
mas01mc@306 1236 error("Read error: t1","unserialize_lsh_hashtables_format2()");
mas01mc@306 1237 }
mas01mc@306 1238 y = H::t1;
mas01mc@292 1239 if(y>=H::N){
mas01mc@336 1240 fclose(dbFile);
mas01mc@292 1241 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
mas01mc@292 1242 }
mas01mc@336 1243 Uns32T token = unserialize_hashtable_row_format2(dbFile, h[x]+y);
mas01mc@292 1244
mas01mc@292 1245 #ifdef __LSH_DUMP_CORE_TABLES__
mas01mc@292 1246 printf("C[%d,%d]", x, y);
mas01mc@292 1247 dump_hashtable_row(h[x][y]);
mas01mc@292 1248 #endif
mas01mc@292 1249 // Check that token is valid
mas01mc@306 1250 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
mas01mc@336 1251 fclose(dbFile);
mas01mc@292 1252 error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()");
mas01mc@292 1253 }
mas01mc@292 1254 // Check for end of table flag
mas01mc@306 1255 if(token==O2_SERIAL_TOKEN_ENDTABLE){
mas01mc@292 1256 x++;
mas01mc@292 1257 break;
mas01mc@292 1258 }
mas01mc@292 1259 // Check for new row flag
mas01mc@306 1260 if(token==O2_SERIAL_TOKEN_T1)
mas01mc@292 1261 H::t1 = token;
mas01mc@292 1262 }
mas01mc@292 1263 }
mas01mc@306 1264 }
mas01mc@292 1265
mas01mc@336 1266 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b){
mas01mc@292 1267 bool pointFound = false;
mas01mc@336 1268 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1269 fclose(dbFile);
mas01mc@292 1270 error("Read error T2 token","unserialize_hashtable_row_format2");
mas01mc@292 1271 }
mas01mc@306 1272 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
mas01mc@336 1273 fclose(dbFile);
mas01mc@292 1274 error("State machine error: expected E or T2");
mas01mc@292 1275 }
mas01mc@306 1276 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
mas01mc@292 1277 pointFound=false;
mas01mc@292 1278 // Check for T2 token
mas01mc@306 1279 if(H::t2!=O2_SERIAL_TOKEN_T2){
mas01mc@336 1280 fclose(dbFile);
mas01mc@292 1281 error("State machine error T2 token", "unserialize_hashtable_row_format2()");
mas01mc@306 1282 }
mas01mc@292 1283 // Read t2 value
mas01mc@336 1284 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1285 fclose(dbFile);
mas01mc@292 1286 error("Read error t2","unserialize_hashtable_row_format2");
mas01mc@292 1287 }
mas01mc@336 1288 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1289 fclose(dbFile);
mas01mc@292 1290 error("Read error H::p","unserialize_hashtable_row_format2");
mas01mc@292 1291 }
mas01mc@306 1292 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
mas01mc@292 1293 pointFound=true;
mas01mc@292 1294 bucket_insert_point(b);
mas01mc@336 1295 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
mas01mc@336 1296 fclose(dbFile);
mas01mc@292 1297 error("Read error H::p","unserialize_hashtable_row_format2");
mas01mc@292 1298 }
mas01mc@292 1299 }
mas01mc@292 1300 if(!pointFound)
mas01mc@292 1301 error("State machine error: point", "unserialize_hashtable_row_format2()");
mas01mc@306 1302 H::t2 = H::p; // Copy last found token to t2
mas01mc@292 1303 }
mas01mc@292 1304 return H::t2; // holds current token
mas01mc@292 1305 }
mas01mc@292 1306
mas01mc@292 1307 void G::dump_hashtable_row(bucket* p){
mas01mc@292 1308 while(p && p->t2!=IFLAG){
mas01mc@292 1309 sbucket* sbp = p->snext;
mas01mc@292 1310 while(sbp){
mas01mc@292 1311 printf("(%0X,%u)", p->t2, sbp->pointID);
mas01mc@292 1312 fflush(stdout);
mas01mc@292 1313 sbp=sbp->snext;
mas01mc@292 1314 }
mas01mc@292 1315 p=p->next;
mas01mc@292 1316 }
mas01mc@292 1317 printf("\n");
mas01mc@292 1318 }
mas01mc@292 1319
mas01mc@292 1320
mas01mc@292 1321 // G::serial_retrieve_point( ... )
mas01mc@292 1322 // retrieves (pointID) from a serialized LSH database
mas01mc@292 1323 //
mas01mc@292 1324 // inputs:
mas01mc@292 1325 // filename - file name of serialized LSH database
mas01mc@292 1326 // vv - query point set
mas01mc@292 1327 //
mas01mc@292 1328 // outputs:
mas01mc@292 1329 // inserts retrieved points into add_point() callback method
mas01mc@292 1330 void G::serial_retrieve_point_set(char* filename, vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller)
mas01mc@292 1331 {
mas01mc@292 1332 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1333 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1334 serial_get_header(dbheader); // read header
mas01mc@292 1335 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1336
mas01mc@292 1337 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
mas01mc@336 1338 serial_close(dbfid);
mas01mc@292 1339 error("serial_retrieve_point_set is for SERIAL_FILEFORMAT1 only");
mas01mc@292 1340 }
mas01mc@292 1341
mas01mc@292 1342 // size of each hash table
mas01mc@292 1343 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1344 calling_instance = caller; // class instance variable used in ...bucket_chain_point()
mas01mc@292 1345 add_point_callback = add_point;
mas01mc@292 1346
mas01mc@292 1347 for(Uns32T j=0; j<L; j++){
mas01mc@292 1348 // memory map a single hash table for random access
mas01mc@292 1349 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1350 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1351 #ifdef __CYGWIN__
mas01mc@324 1352 // No madvise in CYGWIN
mas01mc@324 1353 #else
mas01mc@292 1354 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
mas01mc@292 1355 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1356 #endif
mas01mc@292 1357 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@292 1358 for(Uns32T qpos=0; qpos<vv.size(); qpos++){
mas01mc@293 1359 H::compute_hash_functions(vv[qpos]);
mas01mc@293 1360 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
mas01mc@292 1361 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
mas01mc@292 1362 }
mas01mc@292 1363 serial_munmap(db, hashTableSize); // drop hashtable mmap
mas01mc@292 1364 }
mas01mc@292 1365 serial_close(dbfid);
mas01mc@292 1366 }
mas01mc@292 1367
mas01mc@292 1368 void G::serial_retrieve_point(char* filename, vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
mas01mc@292 1369 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1370 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1371 serial_get_header(dbheader); // read header
mas01mc@292 1372 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1373
mas01mc@292 1374 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
mas01mc@336 1375 serial_close(dbfid);
mas01mc@292 1376 error("serial_retrieve_point is for SERIAL_FILEFORMAT1 only");
mas01mc@292 1377 }
mas01mc@292 1378
mas01mc@292 1379 // size of each hash table
mas01mc@292 1380 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1381 calling_instance = caller;
mas01mc@292 1382 add_point_callback = add_point;
mas01mc@293 1383 H::compute_hash_functions(v);
mas01mc@292 1384 for(Uns32T j=0; j<L; j++){
mas01mc@292 1385 // memory map a single hash table for random access
mas01mc@292 1386 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1387 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1388 #ifdef __CYGWIN__
mas01mc@324 1389 // No madvise in CYGWIN
mas01mc@324 1390 #else
mas01mc@292 1391 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
mas01mc@292 1392 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1393 #endif
mas01mc@292 1394 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@293 1395 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
mas01mc@292 1396 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
mas01mc@292 1397 serial_munmap(db, hashTableSize); // drop hashtable mmap
mas01mc@292 1398 }
mas01mc@292 1399 serial_close(dbfid);
mas01mc@292 1400 }
mas01mc@292 1401
mas01mc@292 1402 void G::serial_dump_tables(char* filename){
mas01mc@292 1403 int dbfid = serial_open(filename, 0); // open for read
mas01mc@292 1404 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
mas01mc@292 1405 serial_get_header(dbheader); // read header
mas01mc@292 1406 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
mas01mc@292 1407 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
mas01mc@292 1408 for(Uns32T j=0; j<L; j++){
mas01mc@292 1409 // memory map a single hash table for random access
mas01mc@292 1410 char* db = serial_mmap(dbfid, hashTableSize, 0,
mas01mc@292 1411 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
mas01mc@324 1412 #ifdef __CYGWIN__
mas01mc@324 1413 // No madvise in CYGWIN
mas01mc@324 1414 #else
mas01mc@292 1415 if(madvise(db, hashTableSize, MADV_SEQUENTIAL)<0)
mas01mc@292 1416 error("could not advise local hashtable memory","","madvise");
mas01mc@324 1417 #endif
mas01mc@292 1418 SerialElementT* pe = (SerialElementT*)db ;
mas01mc@292 1419 printf("*********** TABLE %d ***************\n", j);
mas01mc@292 1420 fflush(stdout);
mas01mc@292 1421 int count=0;
mas01mc@292 1422 do{
mas01mc@292 1423 printf("[%d,%d]", j, count++);
mas01mc@292 1424 fflush(stdout);
mas01mc@292 1425 serial_bucket_dump(pe);
mas01mc@292 1426 pe+=lshHeader->numCols;
mas01mc@292 1427 }while(pe<(SerialElementT*)db+lshHeader->numRows*lshHeader->numCols);
mas01mc@292 1428 }
mas01mc@292 1429
mas01mc@292 1430 }
mas01mc@292 1431
mas01mc@292 1432 void G::serial_bucket_dump(SerialElementT* pe){
mas01mc@292 1433 SerialElementT* pend = pe+lshHeader->numCols;
mas01mc@292 1434 while( !(pe->hashValue==IFLAG || pe==pend ) ){
mas01mc@292 1435 printf("(%0X,%u)",pe->hashValue,pe->pointID);
mas01mc@292 1436 pe++;
mas01mc@292 1437 }
mas01mc@292 1438 printf("\n");
mas01mc@292 1439 fflush(stdout);
mas01mc@292 1440 }
mas01mc@292 1441
mas01mc@292 1442 void G::serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos){
mas01mc@292 1443 SerialElementT* pend = pe+lshHeader->numCols;
mas01mc@292 1444 while( !(pe->hashValue==IFLAG || pe==pend ) ){
mas01mc@292 1445 if(pe->hashValue==t2){ // new match
mas01mc@292 1446 add_point_callback(calling_instance, pe->pointID, qpos, radius);
mas01mc@292 1447 }
mas01mc@292 1448 pe++;
mas01mc@292 1449 }
mas01mc@292 1450 }
mas01mc@292 1451
mas01mc@292 1452 void G::bucket_chain_point(bucket* p, Uns32T qpos){
mas01mc@292 1453 if(!p || p->t2==IFLAG)
mas01mc@292 1454 return;
mas01mc@292 1455 if(p->t2==t2){ // match
mas01mc@292 1456 sbucket_chain_point(p->snext, qpos); // add to reporter
mas01mc@292 1457 }
mas01mc@292 1458 if(p->next){
mas01mc@292 1459 bucket_chain_point(p->next, qpos); // recurse
mas01mc@292 1460 }
mas01mc@292 1461 }
mas01mc@292 1462
mas01mc@292 1463 void G::sbucket_chain_point(sbucket* p, Uns32T qpos){
mas01mc@292 1464 add_point_callback(calling_instance, p->pointID, qpos, radius);
mas01mc@292 1465 if(p->snext){
mas01mc@292 1466 sbucket_chain_point(p->snext, qpos);
mas01mc@292 1467 }
mas01mc@292 1468 }
mas01mc@292 1469
mas01mc@292 1470 void G::get_lock(int fd, bool exclusive) {
mas01mc@292 1471 struct flock lock;
mas01mc@292 1472 int status;
mas01mc@292 1473 lock.l_type = exclusive ? F_WRLCK : F_RDLCK;
mas01mc@292 1474 lock.l_whence = SEEK_SET;
mas01mc@292 1475 lock.l_start = 0;
mas01mc@292 1476 lock.l_len = 0; /* "the whole file" */
mas01mc@292 1477 retry:
mas01mc@292 1478 do {
mas01mc@292 1479 status = fcntl(fd, F_SETLKW, &lock);
mas01mc@292 1480 } while (status != 0 && errno == EINTR);
mas01mc@292 1481 if (status) {
mas01mc@292 1482 if (errno == EAGAIN) {
mas01mc@292 1483 sleep(1);
mas01mc@292 1484 goto retry;
mas01mc@292 1485 } else {
mas01mc@292 1486 error("fcntl lock error", "", "fcntl");
mas01mc@292 1487 }
mas01mc@292 1488 }
mas01mc@292 1489 }
mas01mc@292 1490
mas01mc@292 1491 void G::release_lock(int fd) {
mas01mc@292 1492 struct flock lock;
mas01mc@292 1493 int status;
mas01mc@292 1494
mas01mc@292 1495 lock.l_type = F_UNLCK;
mas01mc@292 1496 lock.l_whence = SEEK_SET;
mas01mc@292 1497 lock.l_start = 0;
mas01mc@292 1498 lock.l_len = 0;
mas01mc@292 1499
mas01mc@292 1500 status = fcntl(fd, F_SETLKW, &lock);
mas01mc@292 1501
mas01mc@292 1502 if (status)
mas01mc@292 1503 error("fcntl unlock error", "", "fcntl");
mas01mc@292 1504 }