comparison lshlib.cpp @ 340:a6edbe97fddf

Added LSH_CORE_ARRAY structure for hashtables instead of linked lists. Maintained Backwards Compatibiliity with indexes build for linked list format. Added tests for indexing and merging. Tested backwards compatibility OK.\n\n The purpose of the LSH_CORE_ARRAY data structure is greater space efficiency and L1/2 cache usage. Essential for multiple indexes with multiple hashtables in RAM
author mas01mc
date Wed, 10 Sep 2008 18:55:16 +0000
parents fe4d5b763086
children fdeb8db97678
comparison
equal deleted inserted replaced
339:da901c62e569 340:a6edbe97fddf
1 #include "lshlib.h" 1 #include "lshlib.h"
2 2
3 //#define __LSH_DUMP_CORE_TABLES__ 3 //#define LSH_DUMP_CORE_TABLES
4 //#define USE_U_FUNCTIONS 4 //#define USE_U_FUNCTIONS
5 #define LSH_BLOCK_FULL_ROWS 5
6 // Use backward-compatible CORE ARRAY lsh index
7 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists
8 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads
9
10 // Critical path logic
11 #if defined LSH_CORE_ARRAY && !defined LSH_LIST_HEAD_COUNTERS
12 #define LSH_LIST_HEAD_COUNTERS
13 #endif
14
15 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head
16
6 17
7 void err(char*s){cout << s << endl;exit(2);} 18 void err(char*s){cout << s << endl;exit(2);}
8 19
9 Uns32T get_page_logn(){ 20 Uns32T get_page_logn(){
10 int pagesz = (int)sysconf(_SC_PAGESIZE); 21 int pagesz = (int)sysconf(_SC_PAGESIZE);
191 } 202 }
192 203
193 // Destruct hash tables 204 // Destruct hash tables
194 H::~H(){ 205 H::~H(){
195 Uns32T i,j,kk; 206 Uns32T i,j,kk;
207 bucket** pp;
196 #ifdef USE_U_FUNCTIONS 208 #ifdef USE_U_FUNCTIONS
197 for( j = 0 ; j < H::m ; j++ ){ 209 for( j = 0 ; j < H::m ; j++ ){
198 for( kk = 0 ; kk < H::k/2 ; kk++ ) 210 for( kk = 0 ; kk < H::k/2 ; kk++ )
199 delete[] A[j][kk]; 211 delete[] A[j][kk];
200 delete[] A[j]; 212 delete[] A[j];
219 delete[] g[j]; 231 delete[] g[j];
220 delete[] g; 232 delete[] g;
221 for( j=0 ; j < H::L ; j++ ){ 233 for( j=0 ; j < H::L ; j++ ){
222 delete[] H::r1[ j ]; 234 delete[] H::r1[ j ];
223 delete[] H::r2[ j ]; 235 delete[] H::r2[ j ];
224 for(i = 0; i< H::N ; i++) 236 for(i = 0; i< H::N ; i++){
237 pp = 0;
238 #ifdef LSH_CORE_ARRAY
239 if(H::h[ j ][ i ])
240 if(H::h[ j ][ i ]->t2 & LSH_CORE_ARRAY_BIT){
241 pp = get_pointer_to_bucket_linked_list(H::h[ j ][ i ]);
242 if(*pp){
243 (*pp)->snext=0; // Head of list uses snext as a non-pointer value
244 delete *pp; // Now the destructor can do its work properly
245 }
246 free(H::h[ j ][ i ]->next);
247 H::h[ j ][ i ]->next = 0; // Zero next pointer
248 H::h[ j ][ i ]->snext = 0; // Zero head-of-list pointer as above
249 }
250 #endif
225 delete H::h[ j ][ i ]; 251 delete H::h[ j ][ i ];
252 }
226 delete[] H::h[ j ]; 253 delete[] H::h[ j ];
227 } 254 }
228 delete[] H::r1; 255 delete[] H::r1;
229 delete[] H::r2; 256 delete[] H::r2;
230 delete[] H::h; 257 delete[] H::h;
331 return h; 358 return h;
332 } 359 }
333 360
334 Uns32T H::bucket_insert_point(bucket **pp){ 361 Uns32T H::bucket_insert_point(bucket **pp){
335 collisionCount = 0; 362 collisionCount = 0;
363 #ifdef LSH_LIST_HEAD_COUNTERS
336 if(!*pp){ 364 if(!*pp){
337 *pp = new bucket(); 365 *pp = new bucket();
338 #ifdef LSH_BLOCK_FULL_ROWS
339 (*pp)->t2 = 0; // Use t2 as a collision counter for the row 366 (*pp)->t2 = 0; // Use t2 as a collision counter for the row
340 (*pp)->next = new bucket(); 367 (*pp)->next = new bucket();
341 #endif 368 }
342 } 369 // The list head holds point collision count
343 #ifdef LSH_BLOCK_FULL_ROWS 370 if( (*pp)->t2 & LSH_CORE_ARRAY_BIT )
344 collisionCount = (*pp)->t2; 371 bucket_insert_point(get_pointer_to_bucket_linked_list(*pp)); // recurse
345 if(collisionCount < H::C){ // Block if row is full 372 else{
346 (*pp)->t2++; // Increment collision counter 373 collisionCount = (*pp)->t2;
347 pointCount++; 374 if(collisionCount < H::C){ // Block if row is full
348 collisionCount++; 375 (*pp)->t2++; // Increment collision counter (numPoints in row)
349 __bucket_insert_point((*pp)->next); // First bucket holds collision count 376 pointCount++;
350 } 377 collisionCount++;
351 #else 378 // Locate the bucket linked list
379 __bucket_insert_point( (*pp)->next );
380 }
381 }
382 #else // NOT USING LSH_LIST_HEAD_COUNTERS
383 if(!*pp)
384 *pp = new bucket();
352 pointCount++; 385 pointCount++;
353 __bucket_insert_point(*pp); // No collision count storage 386 __bucket_insert_point(*pp); // No collision count storage
354 #endif 387 #endif
355 return collisionCount; 388 return collisionCount;
356 } 389 }
357 390
391 // insert points into hashtable row collision chain
358 void H::__bucket_insert_point(bucket* p){ 392 void H::__bucket_insert_point(bucket* p){
359 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2? 393 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2?
360 p->t2 = H::t2; 394 p->t2 = H::t2;
361 bucketCount++; // Record start of new point-locale collision chain 395 bucketCount++; // Record start of new point-locale collision chain
362 p->snext = new sbucket(); 396 p->snext = new sbucket();
368 __sbucket_insert_point(p->snext); 402 __sbucket_insert_point(p->snext);
369 return; 403 return;
370 } 404 }
371 405
372 if(p->next){ 406 if(p->next){
373 __bucket_insert_point(p->next); 407 // Construct list in t2 order
374 } 408 if(H::t2 < p->next->t2){
375 409 bucket* tmp = new bucket();
376 else{ 410 tmp->next = p->next;
411 p->next = tmp;
412 __bucket_insert_point(tmp);
413 }
414 else
415 __bucket_insert_point(p->next);
416 }
417 else {
377 p->next = new bucket(); 418 p->next = new bucket();
378 __bucket_insert_point(p->next); 419 __bucket_insert_point(p->next);
379 } 420 }
380 421 }
381 } 422
382 423 // insert points into point-locale collision chain
383 void H::__sbucket_insert_point(sbucket* p){ 424 void H::__sbucket_insert_point(sbucket* p){
384 if(p->pointID==IFLAG){ 425 if(p->pointID==IFLAG){
385 p->pointID = H::p; 426 p->pointID = H::p;
386 return; 427 return;
387 } 428 }
388 429
389 // Search for pointID 430 // Search for pointID
390 if(p->snext){ 431 if(p->snext){
391 __sbucket_insert_point(p->snext); 432 __sbucket_insert_point(p->snext);
392 } 433 }
393 else{ 434 else{
394 // Make new point collision bucket at end of list 435 // Make new point collision bucket at end of list
395 p->snext = new sbucket(); 436 p->snext = new sbucket();
396 __sbucket_insert_point(p->snext); 437 __sbucket_insert_point(p->snext);
397 } 438 }
398 } 439 }
399 440
400 inline bucket** H::get_bucket(int j){ 441 inline bucket** H::get_bucket(int j){
401 return *(h+j); 442 return *(h+j);
443 }
444
445 // Find the linked-list pointer at the end of the CORE_ARRAY
446 bucket** H::get_pointer_to_bucket_linked_list(bucket* rowPtr){
447 Uns32T numBuckets = (Uns32T) rowPtr->snext; // Cast pointer to unsigned int
448 Uns32T numPoints = rowPtr->t2 & 0x7FFFFFFF; // Value is stored in low 31 bits of t2 field
449 bucket** listPtr = reinterpret_cast<bucket**> (reinterpret_cast<unsigned int*>(rowPtr->next)+numPoints+numBuckets+1);
450 return listPtr;
402 } 451 }
403 452
404 // Interface to Locality Sensitive Hashing G 453 // Interface to Locality Sensitive Hashing G
405 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr): 454 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr):
406 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures 455 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures
477 add_point_callback = add_point; 526 add_point_callback = add_point;
478 H::compute_hash_functions( v ); 527 H::compute_hash_functions( v );
479 for(Uns32T j = 0 ; j < H::L ; j++ ){ 528 for(Uns32T j = 0 ; j < H::L ; j++ ){
480 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 529 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
481 if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) 530 if( bucket* bPtr = *(get_bucket(j) + get_t1()) )
482 #ifdef LSH_BLOCK_FULL_ROWS 531 #ifdef LSH_LIST_HEAD_COUNTERS
483 bucket_chain_point( bPtr->next, qpos); 532 if(bPtr->t2&LSH_CORE_ARRAY_BIT)
533 retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos);
534 else
535 bucket_chain_point( bPtr->next, qpos);
484 #else 536 #else
485 bucket_chain_point( bPtr , qpos); 537 bucket_chain_point( bPtr , qpos);
486 #endif 538 #endif
487 } 539 }
488 } 540 }
794 pe=pt+y*lshHeader->numCols; 846 pe=pt+y*lshHeader->numCols;
795 847
796 colCount=0; 848 colCount=0;
797 if(bucket* bPtr = h[x][y]) 849 if(bucket* bPtr = h[x][y])
798 if(merge) 850 if(merge)
799 #ifdef LSH_BLOCK_FULL_ROWS 851 #ifdef LSH_LIST_HEAD_COUNTERS
800 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket 852 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
801 else 853 else
802 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket 854 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
803 #else 855 #else
804 serial_merge_hashtable_row_format1(pe, bPtr, colCount); 856 serial_merge_hashtable_row_format1(pe, bPtr, colCount);
885 Uns32T x,y; 937 Uns32T x,y;
886 938
887 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) ) 939 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
888 error("Cannot merge core and serial LSH, data structure dimensions mismatch."); 940 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
889 941
890 // We must pereform FORMAT2 merges in core 942 // We must pereform FORMAT2 merges in core FORMAT1 (dynamic list structure)
891 if(merge) 943 if(merge)
892 unserialize_lsh_hashtables_format2(dbFile); 944 unserialize_lsh_hashtables_format2(dbFile, merge);
893 945
894 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1; 946 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
895 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ 947 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
896 fclose(dbFile); 948 fclose(dbFile);
897 error("fSeek error in serialize_lsh_hashtables_format2"); 949 error("fSeek error in serialize_lsh_hashtables_format2");
907 colCountN=0; 959 colCountN=0;
908 for( y = 0 ; y < H::N ; y++ ){ 960 for( y = 0 ; y < H::N ; y++ ){
909 colCount=0; 961 colCount=0;
910 if(bucket* bPtr = h[x][y]){ 962 if(bucket* bPtr = h[x][y]){
911 // Check for empty row (even though row was allocated) 963 // Check for empty row (even though row was allocated)
912 #ifdef LSH_BLOCK_FULL_ROWS 964 #ifdef LSH_LIST_HEAD_COUNTERS
913 if(bPtr->next->t2==IFLAG){ 965 if(bPtr->next->t2==IFLAG){
914 fclose(dbFile); 966 fclose(dbFile);
915 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()"); 967 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
916 } 968 }
917 #else 969 #else
919 fclose(dbFile); 971 fclose(dbFile);
920 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()"); 972 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
921 } 973 }
922 #endif 974 #endif
923 t1 = O2_SERIAL_TOKEN_T1; 975 t1 = O2_SERIAL_TOKEN_T1;
924 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){ 976 WRITE_UNS32(&t1, "[T1]");
925 fclose(dbFile);
926 error("write error in serial_write_hashtable_format2() [T1]");
927 }
928 t1 = y; 977 t1 = y;
929 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){ 978 WRITE_UNS32(&t1, "[t1]");
930 fclose(dbFile); 979 #ifdef LSH_CORE_ARRAY
931 error("write error in serial_write_hashtable_format2() [t1]"); 980 t1 = count_buckets_and_points_hashtable_row(bPtr);
932 } 981 WRITE_UNS32(&t1,"[count]"); // write numElements
933 #ifdef LSH_BLOCK_FULL_ROWS 982 #endif
983 #ifdef LSH_LIST_HEAD_COUNTERS
934 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket 984 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket
935 #else 985 #else
936 serial_write_hashtable_row_format2(dbFile, bPtr, colCount); 986 serial_write_hashtable_row_format2(dbFile, bPtr, colCount);
937 #endif 987 #endif
938 } 988 }
939 if(colCount){ 989 if(colCount){
940 if(colCount<minColCount) 990 if(colCount<minColCount)
941 minColCount=colCount; 991 minColCount=colCount;
945 colCountN++; 995 colCountN++;
946 } 996 }
947 } 997 }
948 // Write END of table marker 998 // Write END of table marker
949 t1 = O2_SERIAL_TOKEN_ENDTABLE; 999 t1 = O2_SERIAL_TOKEN_ENDTABLE;
950 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile ) != 1 ){ 1000 WRITE_UNS32(&t1,"[end]");
951 fclose(dbFile);
952 error("write error in serial_write_hashtable_format2() [end]");
953 }
954
955 if(colCountN) 1001 if(colCountN)
956 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN 1002 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
957 << ", min = " << minColCount << ", max = " << maxColCount 1003 << ", min = " << minColCount << ", max = " << maxColCount
958 << endl; 1004 << endl;
959 } 1005 }
960
961 // We're done writing 1006 // We're done writing
962 return 1; 1007 return 1;
1008 }
1009
1010 Uns32T G::count_buckets_and_points_hashtable_row(bucket* bPtr){
1011 Uns32T total_count = 0;
1012 bucket* p = 0;
1013
1014 // count points
1015 #ifdef LSH_LIST_HEAD_COUNTERS
1016 total_count = bPtr->t2; // points already counted
1017 p = bPtr->next;
1018 #else
1019 total_count = count_points_hashtable_row(bPtr);
1020 p = bPtr;
1021 #endif
1022
1023 // count buckets
1024 do{
1025 total_count++;
1026 }while((p=p->next));
1027
1028 return total_count;
1029 }
1030
1031 Uns32T G::count_points_hashtable_row(bucket* bPtr){
1032 Uns32T point_count = 0;
1033 bucket* p = bPtr;
1034 sbucket* s = 0;
1035 while(p){
1036 s = p->snext;
1037 while(s){
1038 point_count++;
1039 s=s->snext;
1040 }
1041 p=p->next;
1042 }
1043 return point_count;
963 } 1044 }
964 1045
965 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){ 1046 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
966 while(b && b->t2!=IFLAG){ 1047 while(b && b->t2!=IFLAG){
967 if(!b->snext){ 1048 if(!b->snext){
1183 pt=(SerialElementT*)dbtable; 1264 pt=(SerialElementT*)dbtable;
1184 for( y = 0 ; y < H::N ; y++ ){ 1265 for( y = 0 ; y < H::N ; y++ ){
1185 // Move disk pointer to beginning of row 1266 // Move disk pointer to beginning of row
1186 pe=pt+y*lshHeader->numCols; 1267 pe=pt+y*lshHeader->numCols;
1187 unserialize_hashtable_row_format1(pe, h[x]+y); 1268 unserialize_hashtable_row_format1(pe, h[x]+y);
1188 #ifdef __LSH_DUMP_CORE_TABLES__ 1269 #ifdef LSH_DUMP_CORE_TABLES
1189 printf("S[%d,%d]", x, y); 1270 printf("S[%d,%d]", x, y);
1190 serial_bucket_dump(pe); 1271 serial_bucket_dump(pe);
1191 printf("C[%d,%d]", x, y); 1272 printf("C[%d,%d]", x, y);
1192 dump_hashtable_row(h[x][y]); 1273 dump_hashtable_row(h[x][y]);
1193 #endif 1274 #endif
1205 pe++; 1286 pe++;
1206 colCount++; 1287 colCount++;
1207 } 1288 }
1208 } 1289 }
1209 1290
1210 void G::unserialize_lsh_hashtables_format2(FILE* dbFile){ 1291 void G::unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge){
1211 Uns32T x=0,y=0; 1292 Uns32T x=0,y=0;
1212 1293
1213 // Seek to hashtable base offset 1294 // Seek to hashtable base offset
1214 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ 1295 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
1215 fclose(dbFile); 1296 fclose(dbFile);
1238 y = H::t1; 1319 y = H::t1;
1239 if(y>=H::N){ 1320 if(y>=H::N){
1240 fclose(dbFile); 1321 fclose(dbFile);
1241 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()"); 1322 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
1242 } 1323 }
1243 Uns32T token = unserialize_hashtable_row_format2(dbFile, h[x]+y); 1324 Uns32T token = 0;
1325 #ifdef LSH_CORE_ARRAY
1326 Uns32T numElements;
1327 if(fread(&numElements, sizeof(Uns32T), 1, dbFile) != 1){
1328 fclose(dbFile);
1329 error("Read error: numElements","unserialize_lsh_hashtables_format2()");
1330 }
1244 1331
1245 #ifdef __LSH_DUMP_CORE_TABLES__ 1332 // BACKWARD COMPATIBILITY: check to see if T2 or END token was read
1333 if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){
1334 forMerge=true; // Force use of dynamic linked list core format
1335 token = numElements;
1336 }
1337
1338 if(forMerge)
1339 // Use linked list CORE format
1340 token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token);
1341 else
1342 // Use ARRAY CORE format with numElements counter
1343 token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements);
1344 #else
1345 token = unserialize_hashtable_row_format2(dbFile, h[x]+y);
1346 #endif
1347
1348 #ifdef LSH_DUMP_CORE_TABLES
1246 printf("C[%d,%d]", x, y); 1349 printf("C[%d,%d]", x, y);
1247 dump_hashtable_row(h[x][y]); 1350 dump_hashtable_row(h[x][y]);
1248 #endif 1351 #endif
1249 // Check that token is valid 1352 // Check that token is valid
1250 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){ 1353 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
1261 H::t1 = token; 1364 H::t1 = token;
1262 } 1365 }
1263 } 1366 }
1264 } 1367 }
1265 1368
1266 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b){ 1369 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token){
1267 bool pointFound = false; 1370 bool pointFound = false;
1268 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){ 1371
1269 fclose(dbFile); 1372 if(token)
1270 error("Read error T2 token","unserialize_hashtable_row_format2"); 1373 H::t2 = token;
1271 } 1374 else if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
1375 fclose(dbFile);
1376 error("Read error T2 token","unserialize_hashtable_row_format2");
1377 }
1378
1272 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){ 1379 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
1273 fclose(dbFile); 1380 fclose(dbFile);
1274 error("State machine error: expected E or T2"); 1381 error("State machine error: expected E or T2");
1275 } 1382 }
1383
1276 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){ 1384 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
1277 pointFound=false; 1385 pointFound=false;
1278 // Check for T2 token 1386 // Check for T2 token
1279 if(H::t2!=O2_SERIAL_TOKEN_T2){ 1387 if(H::t2!=O2_SERIAL_TOKEN_T2){
1280 fclose(dbFile); 1388 fclose(dbFile);
1302 H::t2 = H::p; // Copy last found token to t2 1410 H::t2 = H::p; // Copy last found token to t2
1303 } 1411 }
1304 return H::t2; // holds current token 1412 return H::t2; // holds current token
1305 } 1413 }
1306 1414
1415 // Unserialize format2 hashtable row to a CORE_ARRAY pointed to
1416 // by the hashtable row pointer: rowPtr
1417 //
1418 // numElements is the total number of t2 buckets plus points
1419 // memory required is numElements+1+sizeof(hashtable ptr)
1420 //
1421 // numElements = numPoints + numBuckets
1422 //
1423 // During inserts (merges) new hashtable entries are appened at rowPtr+numPoints+numBuckets+1
1424 //
1425 // ASSUME: that LSH_LIST_HEAD_COUNTERS is set so that the first hashtable bucket is used to count
1426 // point and bucket entries
1427 //
1428 // We store the values of numPoints and numBuckets in separate fields of the first bucket
1429 // rowPtr->t2 // numPoints
1430 // (Uns32T)(rowPtr->snext) // numBuckets
1431 //
1432 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*))
1433 // To get to the fist bucket, we use
1434 //
1435
1436 #define READ_UNS32T(VAL,TOKENSTR) if(fread(VAL, sizeof(Uns32T), 1, dbFile) != 1){\
1437 fclose(dbFile);error("Read error unserialize_hashtable_format2",TOKENSTR);}
1438
1439 #define TEST_TOKEN(TEST, TESTSTR) if(TEST){fclose(dbFile);error("State machine error: ", TESTSTR);}
1440
1441 #define SKIP_BITS_LEFT_SHIFT_MSB (30)
1442
1443 #define SKIP_BITS_RIGHT_SHIFT_MSB (28)
1444 #define SKIP_BITS_RIGHT_SHIFT_LSB (30)
1445
1446 #define MAX_POINTS_IN_BUCKET_CORE_ARRAY (16)
1447 #define LSH_CORE_ARRAY_END_ROW_TOKEN (0xFFFFFFFD)
1448
1449 // Encode the skip bits. Zero if only one point, MAX 8 (plus first == 9)
1450 #define ENCODE_POINT_SKIP_BITS TEST_TOKEN(!numPointsThisBucket, "no points found");\
1451 if(numPointsThisBucket==1){\
1452 secondPtr=ap++;\
1453 *secondPtr=0;\
1454 numPoints++;\
1455 }\
1456 if(numPointsThisBucket>1){\
1457 *firstPtr |= ( (numPointsThisBucket-1) & 0x3 ) << SKIP_BITS_LEFT_SHIFT_MSB;\
1458 *secondPtr |= ( ( (numPointsThisBucket-1) & 0xC) >> 2 ) << SKIP_BITS_LEFT_SHIFT_MSB;}
1459
1460 Uns32T G::unserialize_hashtable_row_to_array(FILE* dbFile, bucket** rowPP, Uns32T numElements){
1461 Uns32T numPointsThisBucket = 0;
1462 Uns32T numBuckets = 0;
1463 Uns32T numPoints = 0;
1464 Uns32T* firstPtr = 0;
1465 Uns32T* secondPtr = 0;
1466
1467 // Initialize new row
1468 if(!*rowPP){
1469 *rowPP = new bucket();
1470 #ifdef LSH_LIST_HEAD_COUNTERS
1471 (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row
1472 (*rowPP)->next = 0;
1473 #endif
1474 }
1475 bucket* rowPtr = *rowPP;
1476
1477 READ_UNS32T(&(H::t2),"t2");
1478 TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2");
1479 // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements
1480 // So over-allocate by a factor of two and realloc later to actual numElements
1481 CR_ASSERT(rowPtr->next = (bucket*) malloc((2*numElements+1)*sizeof(Uns32T)+sizeof(bucket**)));
1482 Uns32T* ap = reinterpret_cast<Uns32T*>(rowPtr->next); // Cast pointer to Uns32T* for array format
1483 while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){
1484 numPointsThisBucket = 0;// reset bucket point counter
1485 secondPtr = 0; // reset second-point pointer
1486 TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");
1487 READ_UNS32T(&(H::t2), "Read error t2");
1488 *ap++ = H::t2; // Insert t2 value into array
1489 numBuckets++;
1490 READ_UNS32T(&(H::p), "Read error H::p");
1491 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
1492 if(numPointsThisBucket==MAX_POINTS_IN_BUCKET_CORE_ARRAY){
1493 ENCODE_POINT_SKIP_BITS;
1494 *ap++ = H::t2; // Extra element
1495 numBuckets++; // record this as a new bucket
1496 numPointsThisBucket=0; // reset bucket point counter
1497 secondPtr = 0; // reset second-point pointer
1498 }
1499 if( ++numPointsThisBucket == 1 )
1500 firstPtr = ap; // store pointer to first point to insert skip bits later on
1501 else if( numPointsThisBucket == 2 )
1502 secondPtr = ap; // store pointer to first point to insert skip bits later on
1503 numPoints++;
1504 *ap++ = H::p;
1505 READ_UNS32T(&(H::p), "Read error H::p");
1506 }
1507 ENCODE_POINT_SKIP_BITS;
1508 H::t2 = H::p; // Copy last found token to t2
1509 }
1510 // Reallocate the row to its actual size
1511 CR_ASSERT(rowPtr->next = (bucket*) realloc(rowPtr->next, (numBuckets+numPoints+1)*sizeof(Uns32T)+sizeof(bucket**)));
1512 // Record the sizes at the head of the row
1513 rowPtr->snext = (sbucket*) numBuckets;
1514 rowPtr->t2 = numPoints;
1515 // Place end of row marker
1516 *ap++ = LSH_CORE_ARRAY_END_ROW_TOKEN;
1517 // Set the LSH_CORE_ARRAY_BIT to identify data structure for insertion and retrieval
1518 rowPtr->t2 |= LSH_CORE_ARRAY_BIT;
1519 // Allocate a new dynamic list head at the end of the array
1520 bucket** listPtr = reinterpret_cast<bucket**> (ap);
1521 *listPtr = 0;
1522 // Return current token
1523 return H::t2; // return H::t2 which holds current token [E or T1]
1524 }
1525
1526
1527
1528 // *p is a pointer to the beginning of a hashtable row array
1529 // The array consists of t2 hash keys and one or more point identifiers p for each hash key
1530 // Retrieval is performed by generating a hash key query_t2 for query point q
1531 // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry
1532 // point for retrieve_from_core_hashtable_array
1533 #define SKIP_BITS (0xC0000000)
1534 void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){
1535 Uns32T skip;
1536 Uns32T t2;
1537 Uns32T p1;
1538 Uns32T p2;
1539
1540 CR_ASSERT(p);
1541
1542 do{
1543 t2 = *p++;
1544 if( t2 > H::t2 )
1545 return;
1546 p1 = *p++;
1547 p2 = *p++;
1548 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
1549 if( t2 == H::t2 ){
1550 add_point_callback(calling_instance, p1 ^ (p1 & SKIP_BITS), qpos, radius);
1551 if(skip--){
1552 add_point_callback(calling_instance, p2 ^ (p2 & SKIP_BITS), qpos, radius);
1553 while(skip-- )
1554 add_point_callback(calling_instance, *p++, qpos, radius);
1555 }
1556 }
1557 else
1558 if(*p != LSH_CORE_ARRAY_END_ROW_TOKEN)
1559 p = p + skip;
1560 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
1561 }
1562
1307 void G::dump_hashtable_row(bucket* p){ 1563 void G::dump_hashtable_row(bucket* p){
1308 while(p && p->t2!=IFLAG){ 1564 while(p && p->t2!=IFLAG){
1309 sbucket* sbp = p->snext; 1565 sbucket* sbp = p->snext;
1310 while(sbp){ 1566 while(sbp){
1311 printf("(%0X,%u)", p->t2, sbp->pointID); 1567 printf("(%0X,%u)", p->t2, sbp->pointID);