comparison lshlib.cpp @ 306:921ba500a024

changed FORMAT2 index serialization so that token bits don't scribble over point index for >=32768 tracks
author mas01mc
date Tue, 05 Aug 2008 22:40:38 +0000
parents b10ad7b6427f
children 896679d8cc39
comparison
equal deleted inserted replaced
305:8cec6eb40526 306:921ba500a024
546 // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) ******* 546 // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) *******
547 // 547 //
548 // State machine controlled by regular expression. 548 // State machine controlled by regular expression.
549 // legend: 549 // legend:
550 // 550 //
551 // O2_SERIAL_FLAGS_T1_BIT = 0x80000000U 551 // O2_SERIAL_TOKEN_T1 = 0xFFFFFFFCU
552 // O2_SERIAL_FLAGS_T2_BIT = 0x40000000U 552 // O2_SERIAL_TOKEN_T2 = 0xFFFFFFFDU
553 // O2_SERIAL_FLAGS_END_BIT = 0x20000000U 553 // O2_SERIAL_TOKEN_ENDTABLE = 0xFFFFFFFEU
554 // 554 //
555 // T1(t1) - T1 hash token containing t1 hash key with O2_SERIAL_FLAGS_T1_BIT set (t1 range 0..2^29-1) 555 // T1 - T1 hash token
556 // T2 - T2 hash token with O2_SERIAL_FLAGS_T2_BIT set 556 // t1 - t1 hash key (t1 range 0..2^29-1)
557 // T2 - T2 token
557 // t2 - t2 hash key (range 1..2^32-6) 558 // t2 - t2 hash key (range 1..2^32-6)
558 // p - point identifier (range 0..2^32-1) 559 // p - point identifier (range 0..2^32-1)
559 // E - end hash table token with O2_SERIAL_FLAGS_END_BIT set 560 // E - end hash table token
560 // {...} required arguments 561 // {...} required arguments
561 // [...] optional arguments 562 // [...] optional arguments
562 // * - match zero or more occurences 563 // * - match zero or more occurences
563 // + - match one or more occurences 564 // + - match one or more occurences
564 // {...}^L - repeat argument L times 565 // {...}^L - repeat argument L times
565 // 566 //
566 // FORMAT2 Regular expression: 567 // FORMAT2 Regular expression:
567 // { [T1(t1) T2 t2 p+ [T2 t2 p+]* ]* E. }^L 568 // { [T1 t1 [T2 t2 p+]+ ]* E }^L
568 // 569 //
569 570
570 // Serial header constructors 571 // Serial header constructors
571 SerialHeader::SerialHeader(){;} 572 SerialHeader::SerialHeader(){;}
572 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc): 573 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
871 Uns32T x,y; 872 Uns32T x,y;
872 873
873 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) ) 874 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
874 error("Cannot merge core and serial LSH, data structure dimensions mismatch."); 875 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
875 876
876 // We must pereform FORMAT1 merges in core 877 // We must pereform FORMAT2 merges in core
877 if(merge) 878 if(merge)
878 unserialize_lsh_hashtables_format2(fid); 879 unserialize_lsh_hashtables_format2(fid);
879 880
880 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1; 881 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
881 lseek(fid, get_serial_hashtable_offset(), SEEK_SET); 882 lseek(fid, get_serial_hashtable_offset(), SEEK_SET);
889 meanColCount=0; 890 meanColCount=0;
890 colCountN=0; 891 colCountN=0;
891 for( y = 0 ; y < H::N ; y++ ){ 892 for( y = 0 ; y < H::N ; y++ ){
892 colCount=0; 893 colCount=0;
893 if(bucket* bPtr = h[x][y]){ 894 if(bucket* bPtr = h[x][y]){
894 t1 = y | O2_SERIAL_FLAGS_T1_BIT; 895 // Check for empty row (even though row was allocated)
896 #ifdef LSH_BLOCK_FULL_ROWS
897 if(bPtr->next->t2==IFLAG){
898 close(fid);
899 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
900 }
901 #else
902 if(bPtr->t2==IFLAG){
903 close(fid);
904 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
905 }
906 #endif
907 t1 = O2_SERIAL_TOKEN_T1;
908 if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){
909 close(fid);
910 error("write error in serial_write_hashtable_format2() [T1]");
911 }
912 t1 = y;
895 if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){ 913 if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){
896 close(fid); 914 close(fid);
897 error("write error in serial_write_hashtable_format2() [t1]"); 915 error("write error in serial_write_hashtable_format2() [t1]");
898 } 916 }
899 #ifdef LSH_BLOCK_FULL_ROWS 917 #ifdef LSH_BLOCK_FULL_ROWS
910 meanColCount+=colCount; 928 meanColCount+=colCount;
911 colCountN++; 929 colCountN++;
912 } 930 }
913 } 931 }
914 // Write END of table marker 932 // Write END of table marker
915 t1 = O2_SERIAL_FLAGS_END_BIT; 933 t1 = O2_SERIAL_TOKEN_ENDTABLE;
916 if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){ 934 if( write(fid, &t1, sizeof(Uns32T)) != sizeof(Uns32T) ){
917 close(fid); 935 close(fid);
918 error("write error in serial_write_hashtable_format2() [end]"); 936 error("write error in serial_write_hashtable_format2() [end]");
919 } 937 }
920 938
928 return 1; 946 return 1;
929 } 947 }
930 948
931 void G::serial_write_hashtable_row_format2(int fid, bucket* b, Uns32T& colCount){ 949 void G::serial_write_hashtable_row_format2(int fid, bucket* b, Uns32T& colCount){
932 while(b && b->t2!=IFLAG){ 950 while(b && b->t2!=IFLAG){
933 t2 = O2_SERIAL_FLAGS_T2_BIT; 951 if(!b->snext){
952 close(fid);
953 error("Empty collision chain in serial_write_hashtable_row_format2()");
954 }
955 t2 = O2_SERIAL_TOKEN_T2;
934 if( write(fid, &t2, sizeof(Uns32T)) != sizeof(Uns32T) ){ 956 if( write(fid, &t2, sizeof(Uns32T)) != sizeof(Uns32T) ){
935 close(fid); 957 close(fid);
936 error("write error in serial_write_hashtable_row_format2()"); 958 error("write error in serial_write_hashtable_row_format2()");
937 } 959 }
938 t2 = b->t2; 960 t2 = b->t2;
1178 while( x < H::L){ 1200 while( x < H::L){
1179 if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){ 1201 if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){
1180 close(fid); 1202 close(fid);
1181 error("Read error","unserialize_lsh_hashtables_format2()"); 1203 error("Read error","unserialize_lsh_hashtables_format2()");
1182 } 1204 }
1183 if(H::t1&O2_SERIAL_FLAGS_END_BIT) 1205 if(H::t1==O2_SERIAL_TOKEN_ENDTABLE)
1184 x++; // End of table 1206 x++; // End of table
1185 else 1207 else
1186 while(y < H::N){ 1208 while(y < H::N){
1187 // Read a row and move file pointer to beginning of next row or table 1209 // Read a row and move file pointer to beginning of next row or _bittable
1188 if(!(H::t1&O2_SERIAL_FLAGS_T1_BIT)){ 1210 if(!(H::t1==O2_SERIAL_TOKEN_T1)){
1189 close(fid); 1211 close(fid);
1190 error("State matchine error t1","unserialize_lsh_hashtables_format2()"); 1212 error("State matchine error T1","unserialize_lsh_hashtables_format2()");
1191 } 1213 }
1192 y = H::t1 ^ O2_SERIAL_FLAGS_T1_BIT; 1214 if(read(fid, &(H::t1), sizeof(Uns32T))!=sizeof(Uns32T)){
1215 close(fid);
1216 error("Read error: t1","unserialize_lsh_hashtables_format2()");
1217 }
1218 y = H::t1;
1193 if(y>=H::N){ 1219 if(y>=H::N){
1194 close(fid); 1220 close(fid);
1195 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()"); 1221 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
1196 } 1222 }
1197 Uns32T token = unserialize_hashtable_row_format2(fid, h[x]+y); 1223 Uns32T token = unserialize_hashtable_row_format2(fid, h[x]+y);
1199 #ifdef __LSH_DUMP_CORE_TABLES__ 1225 #ifdef __LSH_DUMP_CORE_TABLES__
1200 printf("C[%d,%d]", x, y); 1226 printf("C[%d,%d]", x, y);
1201 dump_hashtable_row(h[x][y]); 1227 dump_hashtable_row(h[x][y]);
1202 #endif 1228 #endif
1203 // Check that token is valid 1229 // Check that token is valid
1204 if( !(token&O2_SERIAL_FLAGS_T1_BIT || token&O2_SERIAL_FLAGS_END_BIT) ){ 1230 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
1205 close(fid); 1231 close(fid);
1206 error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()"); 1232 error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()");
1207 } 1233 }
1208 // Check for end of table flag 1234 // Check for end of table flag
1209 if(token&O2_SERIAL_FLAGS_END_BIT){ 1235 if(token==O2_SERIAL_TOKEN_ENDTABLE){
1210 x++; 1236 x++;
1211 break; 1237 break;
1212 } 1238 }
1213 // Check for new row flag 1239 // Check for new row flag
1214 if(token&O2_SERIAL_FLAGS_T1_BIT) 1240 if(token==O2_SERIAL_TOKEN_T1)
1215 H::t1 = token; 1241 H::t1 = token;
1216 } 1242 }
1217 } 1243 }
1218 } 1244 }
1219 1245
1220 Uns32T G::unserialize_hashtable_row_format2(int fid, bucket** b){ 1246 Uns32T G::unserialize_hashtable_row_format2(int fid, bucket** b){
1221 bool pointFound = false; 1247 bool pointFound = false;
1222 if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){ 1248 if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){
1223 close(fid); 1249 close(fid);
1224 error("Read error T2 token","unserialize_hashtable_row_format2"); 1250 error("Read error T2 token","unserialize_hashtable_row_format2");
1225 } 1251 }
1226 if( !(H::t2==O2_SERIAL_FLAGS_END_BIT || H::t2==O2_SERIAL_FLAGS_T2_BIT)){ 1252 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
1227 close(fid); 1253 close(fid);
1228 error("State machine error: expected E or T2"); 1254 error("State machine error: expected E or T2");
1229 } 1255 }
1230 while(!(H::t2==O2_SERIAL_FLAGS_END_BIT || H::t2&O2_SERIAL_FLAGS_T1_BIT)){ 1256 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
1231 pointFound=false; 1257 pointFound=false;
1232 // Check for T2 token 1258 // Check for T2 token
1233 if(H::t2!=O2_SERIAL_FLAGS_T2_BIT) 1259 if(H::t2!=O2_SERIAL_TOKEN_T2){
1260 close(fid);
1234 error("State machine error T2 token", "unserialize_hashtable_row_format2()"); 1261 error("State machine error T2 token", "unserialize_hashtable_row_format2()");
1262 }
1235 // Read t2 value 1263 // Read t2 value
1236 if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){ 1264 if(read(fid, &(H::t2), sizeof(Uns32T)) != sizeof(Uns32T)){
1237 close(fid); 1265 close(fid);
1238 error("Read error t2","unserialize_hashtable_row_format2"); 1266 error("Read error t2","unserialize_hashtable_row_format2");
1239 } 1267 }
1240 if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){ 1268 if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){
1241 close(fid); 1269 close(fid);
1242 error("Read error H::p","unserialize_hashtable_row_format2"); 1270 error("Read error H::p","unserialize_hashtable_row_format2");
1243 } 1271 }
1244 while(!(H::p==O2_SERIAL_FLAGS_END_BIT || H::p&O2_SERIAL_FLAGS_T1_BIT || H::p==O2_SERIAL_FLAGS_T2_BIT )){ 1272 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
1245 pointFound=true; 1273 pointFound=true;
1246 bucket_insert_point(b); 1274 bucket_insert_point(b);
1247 if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){ 1275 if(read(fid, &(H::p), sizeof(Uns32T)) != sizeof(Uns32T)){
1248 close(fid); 1276 close(fid);
1249 error("Read error H::p","unserialize_hashtable_row_format2"); 1277 error("Read error H::p","unserialize_hashtable_row_format2");
1250 } 1278 }
1251 } 1279 }
1252 H::t2 = H::p; // Copy last found token to t2
1253 if(!pointFound) 1280 if(!pointFound)
1254 error("State machine error: point", "unserialize_hashtable_row_format2()"); 1281 error("State machine error: point", "unserialize_hashtable_row_format2()");
1282 H::t2 = H::p; // Copy last found token to t2
1255 } 1283 }
1256 return H::t2; // holds current token 1284 return H::t2; // holds current token
1257 } 1285 }
1258 1286
1259 void G::dump_hashtable_row(bucket* p){ 1287 void G::dump_hashtable_row(bucket* p){