Mercurial > hg > audiodb
comparison lshlib.cpp @ 340:a6edbe97fddf
Added LSH_CORE_ARRAY structure for hashtables instead of linked lists. Maintained Backwards Compatibiliity with indexes build for linked list format. Added tests for indexing and merging. Tested backwards compatibility OK.\n\n The purpose of the LSH_CORE_ARRAY data structure is greater space efficiency and L1/2 cache usage. Essential for multiple indexes with multiple hashtables in RAM
author | mas01mc |
---|---|
date | Wed, 10 Sep 2008 18:55:16 +0000 |
parents | fe4d5b763086 |
children | fdeb8db97678 |
comparison
equal
deleted
inserted
replaced
339:da901c62e569 | 340:a6edbe97fddf |
---|---|
1 #include "lshlib.h" | 1 #include "lshlib.h" |
2 | 2 |
3 //#define __LSH_DUMP_CORE_TABLES__ | 3 //#define LSH_DUMP_CORE_TABLES |
4 //#define USE_U_FUNCTIONS | 4 //#define USE_U_FUNCTIONS |
5 #define LSH_BLOCK_FULL_ROWS | 5 |
6 // Use backward-compatible CORE ARRAY lsh index | |
7 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists | |
8 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads | |
9 | |
10 // Critical path logic | |
11 #if defined LSH_CORE_ARRAY && !defined LSH_LIST_HEAD_COUNTERS | |
12 #define LSH_LIST_HEAD_COUNTERS | |
13 #endif | |
14 | |
15 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head | |
16 | |
6 | 17 |
7 void err(char*s){cout << s << endl;exit(2);} | 18 void err(char*s){cout << s << endl;exit(2);} |
8 | 19 |
9 Uns32T get_page_logn(){ | 20 Uns32T get_page_logn(){ |
10 int pagesz = (int)sysconf(_SC_PAGESIZE); | 21 int pagesz = (int)sysconf(_SC_PAGESIZE); |
191 } | 202 } |
192 | 203 |
193 // Destruct hash tables | 204 // Destruct hash tables |
194 H::~H(){ | 205 H::~H(){ |
195 Uns32T i,j,kk; | 206 Uns32T i,j,kk; |
207 bucket** pp; | |
196 #ifdef USE_U_FUNCTIONS | 208 #ifdef USE_U_FUNCTIONS |
197 for( j = 0 ; j < H::m ; j++ ){ | 209 for( j = 0 ; j < H::m ; j++ ){ |
198 for( kk = 0 ; kk < H::k/2 ; kk++ ) | 210 for( kk = 0 ; kk < H::k/2 ; kk++ ) |
199 delete[] A[j][kk]; | 211 delete[] A[j][kk]; |
200 delete[] A[j]; | 212 delete[] A[j]; |
219 delete[] g[j]; | 231 delete[] g[j]; |
220 delete[] g; | 232 delete[] g; |
221 for( j=0 ; j < H::L ; j++ ){ | 233 for( j=0 ; j < H::L ; j++ ){ |
222 delete[] H::r1[ j ]; | 234 delete[] H::r1[ j ]; |
223 delete[] H::r2[ j ]; | 235 delete[] H::r2[ j ]; |
224 for(i = 0; i< H::N ; i++) | 236 for(i = 0; i< H::N ; i++){ |
237 pp = 0; | |
238 #ifdef LSH_CORE_ARRAY | |
239 if(H::h[ j ][ i ]) | |
240 if(H::h[ j ][ i ]->t2 & LSH_CORE_ARRAY_BIT){ | |
241 pp = get_pointer_to_bucket_linked_list(H::h[ j ][ i ]); | |
242 if(*pp){ | |
243 (*pp)->snext=0; // Head of list uses snext as a non-pointer value | |
244 delete *pp; // Now the destructor can do its work properly | |
245 } | |
246 free(H::h[ j ][ i ]->next); | |
247 H::h[ j ][ i ]->next = 0; // Zero next pointer | |
248 H::h[ j ][ i ]->snext = 0; // Zero head-of-list pointer as above | |
249 } | |
250 #endif | |
225 delete H::h[ j ][ i ]; | 251 delete H::h[ j ][ i ]; |
252 } | |
226 delete[] H::h[ j ]; | 253 delete[] H::h[ j ]; |
227 } | 254 } |
228 delete[] H::r1; | 255 delete[] H::r1; |
229 delete[] H::r2; | 256 delete[] H::r2; |
230 delete[] H::h; | 257 delete[] H::h; |
331 return h; | 358 return h; |
332 } | 359 } |
333 | 360 |
334 Uns32T H::bucket_insert_point(bucket **pp){ | 361 Uns32T H::bucket_insert_point(bucket **pp){ |
335 collisionCount = 0; | 362 collisionCount = 0; |
363 #ifdef LSH_LIST_HEAD_COUNTERS | |
336 if(!*pp){ | 364 if(!*pp){ |
337 *pp = new bucket(); | 365 *pp = new bucket(); |
338 #ifdef LSH_BLOCK_FULL_ROWS | |
339 (*pp)->t2 = 0; // Use t2 as a collision counter for the row | 366 (*pp)->t2 = 0; // Use t2 as a collision counter for the row |
340 (*pp)->next = new bucket(); | 367 (*pp)->next = new bucket(); |
341 #endif | 368 } |
342 } | 369 // The list head holds point collision count |
343 #ifdef LSH_BLOCK_FULL_ROWS | 370 if( (*pp)->t2 & LSH_CORE_ARRAY_BIT ) |
344 collisionCount = (*pp)->t2; | 371 bucket_insert_point(get_pointer_to_bucket_linked_list(*pp)); // recurse |
345 if(collisionCount < H::C){ // Block if row is full | 372 else{ |
346 (*pp)->t2++; // Increment collision counter | 373 collisionCount = (*pp)->t2; |
347 pointCount++; | 374 if(collisionCount < H::C){ // Block if row is full |
348 collisionCount++; | 375 (*pp)->t2++; // Increment collision counter (numPoints in row) |
349 __bucket_insert_point((*pp)->next); // First bucket holds collision count | 376 pointCount++; |
350 } | 377 collisionCount++; |
351 #else | 378 // Locate the bucket linked list |
379 __bucket_insert_point( (*pp)->next ); | |
380 } | |
381 } | |
382 #else // NOT USING LSH_LIST_HEAD_COUNTERS | |
383 if(!*pp) | |
384 *pp = new bucket(); | |
352 pointCount++; | 385 pointCount++; |
353 __bucket_insert_point(*pp); // No collision count storage | 386 __bucket_insert_point(*pp); // No collision count storage |
354 #endif | 387 #endif |
355 return collisionCount; | 388 return collisionCount; |
356 } | 389 } |
357 | 390 |
391 // insert points into hashtable row collision chain | |
358 void H::__bucket_insert_point(bucket* p){ | 392 void H::__bucket_insert_point(bucket* p){ |
359 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2? | 393 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2? |
360 p->t2 = H::t2; | 394 p->t2 = H::t2; |
361 bucketCount++; // Record start of new point-locale collision chain | 395 bucketCount++; // Record start of new point-locale collision chain |
362 p->snext = new sbucket(); | 396 p->snext = new sbucket(); |
368 __sbucket_insert_point(p->snext); | 402 __sbucket_insert_point(p->snext); |
369 return; | 403 return; |
370 } | 404 } |
371 | 405 |
372 if(p->next){ | 406 if(p->next){ |
373 __bucket_insert_point(p->next); | 407 // Construct list in t2 order |
374 } | 408 if(H::t2 < p->next->t2){ |
375 | 409 bucket* tmp = new bucket(); |
376 else{ | 410 tmp->next = p->next; |
411 p->next = tmp; | |
412 __bucket_insert_point(tmp); | |
413 } | |
414 else | |
415 __bucket_insert_point(p->next); | |
416 } | |
417 else { | |
377 p->next = new bucket(); | 418 p->next = new bucket(); |
378 __bucket_insert_point(p->next); | 419 __bucket_insert_point(p->next); |
379 } | 420 } |
380 | 421 } |
381 } | 422 |
382 | 423 // insert points into point-locale collision chain |
383 void H::__sbucket_insert_point(sbucket* p){ | 424 void H::__sbucket_insert_point(sbucket* p){ |
384 if(p->pointID==IFLAG){ | 425 if(p->pointID==IFLAG){ |
385 p->pointID = H::p; | 426 p->pointID = H::p; |
386 return; | 427 return; |
387 } | 428 } |
388 | 429 |
389 // Search for pointID | 430 // Search for pointID |
390 if(p->snext){ | 431 if(p->snext){ |
391 __sbucket_insert_point(p->snext); | 432 __sbucket_insert_point(p->snext); |
392 } | 433 } |
393 else{ | 434 else{ |
394 // Make new point collision bucket at end of list | 435 // Make new point collision bucket at end of list |
395 p->snext = new sbucket(); | 436 p->snext = new sbucket(); |
396 __sbucket_insert_point(p->snext); | 437 __sbucket_insert_point(p->snext); |
397 } | 438 } |
398 } | 439 } |
399 | 440 |
400 inline bucket** H::get_bucket(int j){ | 441 inline bucket** H::get_bucket(int j){ |
401 return *(h+j); | 442 return *(h+j); |
443 } | |
444 | |
445 // Find the linked-list pointer at the end of the CORE_ARRAY | |
446 bucket** H::get_pointer_to_bucket_linked_list(bucket* rowPtr){ | |
447 Uns32T numBuckets = (Uns32T) rowPtr->snext; // Cast pointer to unsigned int | |
448 Uns32T numPoints = rowPtr->t2 & 0x7FFFFFFF; // Value is stored in low 31 bits of t2 field | |
449 bucket** listPtr = reinterpret_cast<bucket**> (reinterpret_cast<unsigned int*>(rowPtr->next)+numPoints+numBuckets+1); | |
450 return listPtr; | |
402 } | 451 } |
403 | 452 |
404 // Interface to Locality Sensitive Hashing G | 453 // Interface to Locality Sensitive Hashing G |
405 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr): | 454 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr): |
406 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures | 455 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures |
477 add_point_callback = add_point; | 526 add_point_callback = add_point; |
478 H::compute_hash_functions( v ); | 527 H::compute_hash_functions( v ); |
479 for(Uns32T j = 0 ; j < H::L ; j++ ){ | 528 for(Uns32T j = 0 ; j < H::L ; j++ ){ |
480 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); | 529 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); |
481 if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) | 530 if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) |
482 #ifdef LSH_BLOCK_FULL_ROWS | 531 #ifdef LSH_LIST_HEAD_COUNTERS |
483 bucket_chain_point( bPtr->next, qpos); | 532 if(bPtr->t2&LSH_CORE_ARRAY_BIT) |
533 retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos); | |
534 else | |
535 bucket_chain_point( bPtr->next, qpos); | |
484 #else | 536 #else |
485 bucket_chain_point( bPtr , qpos); | 537 bucket_chain_point( bPtr , qpos); |
486 #endif | 538 #endif |
487 } | 539 } |
488 } | 540 } |
794 pe=pt+y*lshHeader->numCols; | 846 pe=pt+y*lshHeader->numCols; |
795 | 847 |
796 colCount=0; | 848 colCount=0; |
797 if(bucket* bPtr = h[x][y]) | 849 if(bucket* bPtr = h[x][y]) |
798 if(merge) | 850 if(merge) |
799 #ifdef LSH_BLOCK_FULL_ROWS | 851 #ifdef LSH_LIST_HEAD_COUNTERS |
800 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket | 852 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket |
801 else | 853 else |
802 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket | 854 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket |
803 #else | 855 #else |
804 serial_merge_hashtable_row_format1(pe, bPtr, colCount); | 856 serial_merge_hashtable_row_format1(pe, bPtr, colCount); |
885 Uns32T x,y; | 937 Uns32T x,y; |
886 | 938 |
887 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) ) | 939 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) ) |
888 error("Cannot merge core and serial LSH, data structure dimensions mismatch."); | 940 error("Cannot merge core and serial LSH, data structure dimensions mismatch."); |
889 | 941 |
890 // We must pereform FORMAT2 merges in core | 942 // We must pereform FORMAT2 merges in core FORMAT1 (dynamic list structure) |
891 if(merge) | 943 if(merge) |
892 unserialize_lsh_hashtables_format2(dbFile); | 944 unserialize_lsh_hashtables_format2(dbFile, merge); |
893 | 945 |
894 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1; | 946 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1; |
895 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ | 947 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ |
896 fclose(dbFile); | 948 fclose(dbFile); |
897 error("fSeek error in serialize_lsh_hashtables_format2"); | 949 error("fSeek error in serialize_lsh_hashtables_format2"); |
907 colCountN=0; | 959 colCountN=0; |
908 for( y = 0 ; y < H::N ; y++ ){ | 960 for( y = 0 ; y < H::N ; y++ ){ |
909 colCount=0; | 961 colCount=0; |
910 if(bucket* bPtr = h[x][y]){ | 962 if(bucket* bPtr = h[x][y]){ |
911 // Check for empty row (even though row was allocated) | 963 // Check for empty row (even though row was allocated) |
912 #ifdef LSH_BLOCK_FULL_ROWS | 964 #ifdef LSH_LIST_HEAD_COUNTERS |
913 if(bPtr->next->t2==IFLAG){ | 965 if(bPtr->next->t2==IFLAG){ |
914 fclose(dbFile); | 966 fclose(dbFile); |
915 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()"); | 967 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()"); |
916 } | 968 } |
917 #else | 969 #else |
919 fclose(dbFile); | 971 fclose(dbFile); |
920 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()"); | 972 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()"); |
921 } | 973 } |
922 #endif | 974 #endif |
923 t1 = O2_SERIAL_TOKEN_T1; | 975 t1 = O2_SERIAL_TOKEN_T1; |
924 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){ | 976 WRITE_UNS32(&t1, "[T1]"); |
925 fclose(dbFile); | |
926 error("write error in serial_write_hashtable_format2() [T1]"); | |
927 } | |
928 t1 = y; | 977 t1 = y; |
929 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile) != 1 ){ | 978 WRITE_UNS32(&t1, "[t1]"); |
930 fclose(dbFile); | 979 #ifdef LSH_CORE_ARRAY |
931 error("write error in serial_write_hashtable_format2() [t1]"); | 980 t1 = count_buckets_and_points_hashtable_row(bPtr); |
932 } | 981 WRITE_UNS32(&t1,"[count]"); // write numElements |
933 #ifdef LSH_BLOCK_FULL_ROWS | 982 #endif |
983 #ifdef LSH_LIST_HEAD_COUNTERS | |
934 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket | 984 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket |
935 #else | 985 #else |
936 serial_write_hashtable_row_format2(dbFile, bPtr, colCount); | 986 serial_write_hashtable_row_format2(dbFile, bPtr, colCount); |
937 #endif | 987 #endif |
938 } | 988 } |
939 if(colCount){ | 989 if(colCount){ |
940 if(colCount<minColCount) | 990 if(colCount<minColCount) |
941 minColCount=colCount; | 991 minColCount=colCount; |
945 colCountN++; | 995 colCountN++; |
946 } | 996 } |
947 } | 997 } |
948 // Write END of table marker | 998 // Write END of table marker |
949 t1 = O2_SERIAL_TOKEN_ENDTABLE; | 999 t1 = O2_SERIAL_TOKEN_ENDTABLE; |
950 if( fwrite(&t1, sizeof(Uns32T), 1, dbFile ) != 1 ){ | 1000 WRITE_UNS32(&t1,"[end]"); |
951 fclose(dbFile); | |
952 error("write error in serial_write_hashtable_format2() [end]"); | |
953 } | |
954 | |
955 if(colCountN) | 1001 if(colCountN) |
956 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN | 1002 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN |
957 << ", min = " << minColCount << ", max = " << maxColCount | 1003 << ", min = " << minColCount << ", max = " << maxColCount |
958 << endl; | 1004 << endl; |
959 } | 1005 } |
960 | |
961 // We're done writing | 1006 // We're done writing |
962 return 1; | 1007 return 1; |
1008 } | |
1009 | |
1010 Uns32T G::count_buckets_and_points_hashtable_row(bucket* bPtr){ | |
1011 Uns32T total_count = 0; | |
1012 bucket* p = 0; | |
1013 | |
1014 // count points | |
1015 #ifdef LSH_LIST_HEAD_COUNTERS | |
1016 total_count = bPtr->t2; // points already counted | |
1017 p = bPtr->next; | |
1018 #else | |
1019 total_count = count_points_hashtable_row(bPtr); | |
1020 p = bPtr; | |
1021 #endif | |
1022 | |
1023 // count buckets | |
1024 do{ | |
1025 total_count++; | |
1026 }while((p=p->next)); | |
1027 | |
1028 return total_count; | |
1029 } | |
1030 | |
1031 Uns32T G::count_points_hashtable_row(bucket* bPtr){ | |
1032 Uns32T point_count = 0; | |
1033 bucket* p = bPtr; | |
1034 sbucket* s = 0; | |
1035 while(p){ | |
1036 s = p->snext; | |
1037 while(s){ | |
1038 point_count++; | |
1039 s=s->snext; | |
1040 } | |
1041 p=p->next; | |
1042 } | |
1043 return point_count; | |
963 } | 1044 } |
964 | 1045 |
965 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){ | 1046 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){ |
966 while(b && b->t2!=IFLAG){ | 1047 while(b && b->t2!=IFLAG){ |
967 if(!b->snext){ | 1048 if(!b->snext){ |
1183 pt=(SerialElementT*)dbtable; | 1264 pt=(SerialElementT*)dbtable; |
1184 for( y = 0 ; y < H::N ; y++ ){ | 1265 for( y = 0 ; y < H::N ; y++ ){ |
1185 // Move disk pointer to beginning of row | 1266 // Move disk pointer to beginning of row |
1186 pe=pt+y*lshHeader->numCols; | 1267 pe=pt+y*lshHeader->numCols; |
1187 unserialize_hashtable_row_format1(pe, h[x]+y); | 1268 unserialize_hashtable_row_format1(pe, h[x]+y); |
1188 #ifdef __LSH_DUMP_CORE_TABLES__ | 1269 #ifdef LSH_DUMP_CORE_TABLES |
1189 printf("S[%d,%d]", x, y); | 1270 printf("S[%d,%d]", x, y); |
1190 serial_bucket_dump(pe); | 1271 serial_bucket_dump(pe); |
1191 printf("C[%d,%d]", x, y); | 1272 printf("C[%d,%d]", x, y); |
1192 dump_hashtable_row(h[x][y]); | 1273 dump_hashtable_row(h[x][y]); |
1193 #endif | 1274 #endif |
1205 pe++; | 1286 pe++; |
1206 colCount++; | 1287 colCount++; |
1207 } | 1288 } |
1208 } | 1289 } |
1209 | 1290 |
1210 void G::unserialize_lsh_hashtables_format2(FILE* dbFile){ | 1291 void G::unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge){ |
1211 Uns32T x=0,y=0; | 1292 Uns32T x=0,y=0; |
1212 | 1293 |
1213 // Seek to hashtable base offset | 1294 // Seek to hashtable base offset |
1214 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ | 1295 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){ |
1215 fclose(dbFile); | 1296 fclose(dbFile); |
1238 y = H::t1; | 1319 y = H::t1; |
1239 if(y>=H::N){ | 1320 if(y>=H::N){ |
1240 fclose(dbFile); | 1321 fclose(dbFile); |
1241 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()"); | 1322 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()"); |
1242 } | 1323 } |
1243 Uns32T token = unserialize_hashtable_row_format2(dbFile, h[x]+y); | 1324 Uns32T token = 0; |
1325 #ifdef LSH_CORE_ARRAY | |
1326 Uns32T numElements; | |
1327 if(fread(&numElements, sizeof(Uns32T), 1, dbFile) != 1){ | |
1328 fclose(dbFile); | |
1329 error("Read error: numElements","unserialize_lsh_hashtables_format2()"); | |
1330 } | |
1244 | 1331 |
1245 #ifdef __LSH_DUMP_CORE_TABLES__ | 1332 // BACKWARD COMPATIBILITY: check to see if T2 or END token was read |
1333 if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){ | |
1334 forMerge=true; // Force use of dynamic linked list core format | |
1335 token = numElements; | |
1336 } | |
1337 | |
1338 if(forMerge) | |
1339 // Use linked list CORE format | |
1340 token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token); | |
1341 else | |
1342 // Use ARRAY CORE format with numElements counter | |
1343 token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements); | |
1344 #else | |
1345 token = unserialize_hashtable_row_format2(dbFile, h[x]+y); | |
1346 #endif | |
1347 | |
1348 #ifdef LSH_DUMP_CORE_TABLES | |
1246 printf("C[%d,%d]", x, y); | 1349 printf("C[%d,%d]", x, y); |
1247 dump_hashtable_row(h[x][y]); | 1350 dump_hashtable_row(h[x][y]); |
1248 #endif | 1351 #endif |
1249 // Check that token is valid | 1352 // Check that token is valid |
1250 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){ | 1353 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){ |
1261 H::t1 = token; | 1364 H::t1 = token; |
1262 } | 1365 } |
1263 } | 1366 } |
1264 } | 1367 } |
1265 | 1368 |
1266 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b){ | 1369 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token){ |
1267 bool pointFound = false; | 1370 bool pointFound = false; |
1268 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){ | 1371 |
1269 fclose(dbFile); | 1372 if(token) |
1270 error("Read error T2 token","unserialize_hashtable_row_format2"); | 1373 H::t2 = token; |
1271 } | 1374 else if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){ |
1375 fclose(dbFile); | |
1376 error("Read error T2 token","unserialize_hashtable_row_format2"); | |
1377 } | |
1378 | |
1272 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){ | 1379 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){ |
1273 fclose(dbFile); | 1380 fclose(dbFile); |
1274 error("State machine error: expected E or T2"); | 1381 error("State machine error: expected E or T2"); |
1275 } | 1382 } |
1383 | |
1276 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){ | 1384 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){ |
1277 pointFound=false; | 1385 pointFound=false; |
1278 // Check for T2 token | 1386 // Check for T2 token |
1279 if(H::t2!=O2_SERIAL_TOKEN_T2){ | 1387 if(H::t2!=O2_SERIAL_TOKEN_T2){ |
1280 fclose(dbFile); | 1388 fclose(dbFile); |
1302 H::t2 = H::p; // Copy last found token to t2 | 1410 H::t2 = H::p; // Copy last found token to t2 |
1303 } | 1411 } |
1304 return H::t2; // holds current token | 1412 return H::t2; // holds current token |
1305 } | 1413 } |
1306 | 1414 |
1415 // Unserialize format2 hashtable row to a CORE_ARRAY pointed to | |
1416 // by the hashtable row pointer: rowPtr | |
1417 // | |
1418 // numElements is the total number of t2 buckets plus points | |
1419 // memory required is numElements+1+sizeof(hashtable ptr) | |
1420 // | |
1421 // numElements = numPoints + numBuckets | |
1422 // | |
1423 // During inserts (merges) new hashtable entries are appened at rowPtr+numPoints+numBuckets+1 | |
1424 // | |
1425 // ASSUME: that LSH_LIST_HEAD_COUNTERS is set so that the first hashtable bucket is used to count | |
1426 // point and bucket entries | |
1427 // | |
1428 // We store the values of numPoints and numBuckets in separate fields of the first bucket | |
1429 // rowPtr->t2 // numPoints | |
1430 // (Uns32T)(rowPtr->snext) // numBuckets | |
1431 // | |
1432 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*)) | |
1433 // To get to the fist bucket, we use | |
1434 // | |
1435 | |
1436 #define READ_UNS32T(VAL,TOKENSTR) if(fread(VAL, sizeof(Uns32T), 1, dbFile) != 1){\ | |
1437 fclose(dbFile);error("Read error unserialize_hashtable_format2",TOKENSTR);} | |
1438 | |
1439 #define TEST_TOKEN(TEST, TESTSTR) if(TEST){fclose(dbFile);error("State machine error: ", TESTSTR);} | |
1440 | |
1441 #define SKIP_BITS_LEFT_SHIFT_MSB (30) | |
1442 | |
1443 #define SKIP_BITS_RIGHT_SHIFT_MSB (28) | |
1444 #define SKIP_BITS_RIGHT_SHIFT_LSB (30) | |
1445 | |
1446 #define MAX_POINTS_IN_BUCKET_CORE_ARRAY (16) | |
1447 #define LSH_CORE_ARRAY_END_ROW_TOKEN (0xFFFFFFFD) | |
1448 | |
1449 // Encode the skip bits. Zero if only one point, MAX 8 (plus first == 9) | |
1450 #define ENCODE_POINT_SKIP_BITS TEST_TOKEN(!numPointsThisBucket, "no points found");\ | |
1451 if(numPointsThisBucket==1){\ | |
1452 secondPtr=ap++;\ | |
1453 *secondPtr=0;\ | |
1454 numPoints++;\ | |
1455 }\ | |
1456 if(numPointsThisBucket>1){\ | |
1457 *firstPtr |= ( (numPointsThisBucket-1) & 0x3 ) << SKIP_BITS_LEFT_SHIFT_MSB;\ | |
1458 *secondPtr |= ( ( (numPointsThisBucket-1) & 0xC) >> 2 ) << SKIP_BITS_LEFT_SHIFT_MSB;} | |
1459 | |
1460 Uns32T G::unserialize_hashtable_row_to_array(FILE* dbFile, bucket** rowPP, Uns32T numElements){ | |
1461 Uns32T numPointsThisBucket = 0; | |
1462 Uns32T numBuckets = 0; | |
1463 Uns32T numPoints = 0; | |
1464 Uns32T* firstPtr = 0; | |
1465 Uns32T* secondPtr = 0; | |
1466 | |
1467 // Initialize new row | |
1468 if(!*rowPP){ | |
1469 *rowPP = new bucket(); | |
1470 #ifdef LSH_LIST_HEAD_COUNTERS | |
1471 (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row | |
1472 (*rowPP)->next = 0; | |
1473 #endif | |
1474 } | |
1475 bucket* rowPtr = *rowPP; | |
1476 | |
1477 READ_UNS32T(&(H::t2),"t2"); | |
1478 TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2"); | |
1479 // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements | |
1480 // So over-allocate by a factor of two and realloc later to actual numElements | |
1481 CR_ASSERT(rowPtr->next = (bucket*) malloc((2*numElements+1)*sizeof(Uns32T)+sizeof(bucket**))); | |
1482 Uns32T* ap = reinterpret_cast<Uns32T*>(rowPtr->next); // Cast pointer to Uns32T* for array format | |
1483 while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){ | |
1484 numPointsThisBucket = 0;// reset bucket point counter | |
1485 secondPtr = 0; // reset second-point pointer | |
1486 TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2"); | |
1487 READ_UNS32T(&(H::t2), "Read error t2"); | |
1488 *ap++ = H::t2; // Insert t2 value into array | |
1489 numBuckets++; | |
1490 READ_UNS32T(&(H::p), "Read error H::p"); | |
1491 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){ | |
1492 if(numPointsThisBucket==MAX_POINTS_IN_BUCKET_CORE_ARRAY){ | |
1493 ENCODE_POINT_SKIP_BITS; | |
1494 *ap++ = H::t2; // Extra element | |
1495 numBuckets++; // record this as a new bucket | |
1496 numPointsThisBucket=0; // reset bucket point counter | |
1497 secondPtr = 0; // reset second-point pointer | |
1498 } | |
1499 if( ++numPointsThisBucket == 1 ) | |
1500 firstPtr = ap; // store pointer to first point to insert skip bits later on | |
1501 else if( numPointsThisBucket == 2 ) | |
1502 secondPtr = ap; // store pointer to first point to insert skip bits later on | |
1503 numPoints++; | |
1504 *ap++ = H::p; | |
1505 READ_UNS32T(&(H::p), "Read error H::p"); | |
1506 } | |
1507 ENCODE_POINT_SKIP_BITS; | |
1508 H::t2 = H::p; // Copy last found token to t2 | |
1509 } | |
1510 // Reallocate the row to its actual size | |
1511 CR_ASSERT(rowPtr->next = (bucket*) realloc(rowPtr->next, (numBuckets+numPoints+1)*sizeof(Uns32T)+sizeof(bucket**))); | |
1512 // Record the sizes at the head of the row | |
1513 rowPtr->snext = (sbucket*) numBuckets; | |
1514 rowPtr->t2 = numPoints; | |
1515 // Place end of row marker | |
1516 *ap++ = LSH_CORE_ARRAY_END_ROW_TOKEN; | |
1517 // Set the LSH_CORE_ARRAY_BIT to identify data structure for insertion and retrieval | |
1518 rowPtr->t2 |= LSH_CORE_ARRAY_BIT; | |
1519 // Allocate a new dynamic list head at the end of the array | |
1520 bucket** listPtr = reinterpret_cast<bucket**> (ap); | |
1521 *listPtr = 0; | |
1522 // Return current token | |
1523 return H::t2; // return H::t2 which holds current token [E or T1] | |
1524 } | |
1525 | |
1526 | |
1527 | |
1528 // *p is a pointer to the beginning of a hashtable row array | |
1529 // The array consists of t2 hash keys and one or more point identifiers p for each hash key | |
1530 // Retrieval is performed by generating a hash key query_t2 for query point q | |
1531 // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry | |
1532 // point for retrieve_from_core_hashtable_array | |
1533 #define SKIP_BITS (0xC0000000) | |
1534 void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){ | |
1535 Uns32T skip; | |
1536 Uns32T t2; | |
1537 Uns32T p1; | |
1538 Uns32T p2; | |
1539 | |
1540 CR_ASSERT(p); | |
1541 | |
1542 do{ | |
1543 t2 = *p++; | |
1544 if( t2 > H::t2 ) | |
1545 return; | |
1546 p1 = *p++; | |
1547 p2 = *p++; | |
1548 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB); | |
1549 if( t2 == H::t2 ){ | |
1550 add_point_callback(calling_instance, p1 ^ (p1 & SKIP_BITS), qpos, radius); | |
1551 if(skip--){ | |
1552 add_point_callback(calling_instance, p2 ^ (p2 & SKIP_BITS), qpos, radius); | |
1553 while(skip-- ) | |
1554 add_point_callback(calling_instance, *p++, qpos, radius); | |
1555 } | |
1556 } | |
1557 else | |
1558 if(*p != LSH_CORE_ARRAY_END_ROW_TOKEN) | |
1559 p = p + skip; | |
1560 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN ); | |
1561 } | |
1562 | |
1307 void G::dump_hashtable_row(bucket* p){ | 1563 void G::dump_hashtable_row(bucket* p){ |
1308 while(p && p->t2!=IFLAG){ | 1564 while(p && p->t2!=IFLAG){ |
1309 sbucket* sbp = p->snext; | 1565 sbucket* sbp = p->snext; |
1310 while(sbp){ | 1566 while(sbp){ |
1311 printf("(%0X,%u)", p->t2, sbp->pointID); | 1567 printf("(%0X,%u)", p->t2, sbp->pointID); |