comparison lshlib.cpp @ 296:f922c234462f

fixed file size allocation for FORMAT2 files. Made LSH index size() in bytes an unsigned long long. Changed the name of lsh_inCore flag to lsh_on_disk (to reverse the sense of the 'flag').
author mas01mc
date Fri, 01 Aug 2008 15:04:31 +0000
parents 071a108580a4
children b10ad7b6427f
comparison
equal deleted inserted replaced
295:9347d74a2578 296:f922c234462f
1 #include "lshlib.h" 1 #include "lshlib.h"
2 2
3 //#define __LSH_DUMP_CORE_TABLES__ 3 //#define __LSH_DUMP_CORE_TABLES__
4 //#define USE_U_FUNCTIONS 4 //#define USE_U_FUNCTIONS
5 //#define LSH_BLOCK_FULL_ROWS 5 #define LSH_BLOCK_FULL_ROWS
6 6
7 void err(char*s){cout << s << endl;exit(2);} 7 void err(char*s){cout << s << endl;exit(2);}
8 8
9 Uns32T get_page_logn(){ 9 Uns32T get_page_logn(){
10 int pagesz = (int)sysconf(_SC_PAGESIZE); 10 int pagesz = (int)sysconf(_SC_PAGESIZE);
51 } 51 }
52 if(use_u_functions && k%2){ 52 if(use_u_functions && k%2){
53 k++; // make sure k is even 53 k++; // make sure k is even
54 cout << "warning: setting k even" << endl; 54 cout << "warning: setting k even" << endl;
55 } 55 }
56
57 cout << "file size: ~" << (((unsigned long long)L*N*C*sizeof(SerialElementT))/1000000UL) << "MB" << endl;
58 if(((unsigned long long)L*N*C*sizeof(SerialElementT))>4000000000UL)
59 error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB");
60 else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL)
61 cout << "warning: hash tables exceed 1000MB." << endl;
62 56
63 // We have the necessary parameters, so construct hashfunction datastructures 57 // We have the necessary parameters, so construct hashfunction datastructures
64 initialize_lsh_functions(); 58 initialize_lsh_functions();
65 } 59 }
66 60
336 } 330 }
337 return h; 331 return h;
338 } 332 }
339 333
340 Uns32T H::bucket_insert_point(bucket **pp){ 334 Uns32T H::bucket_insert_point(bucket **pp){
341 Uns32T collisionCount = 0; 335 collisionCount = 0;
342 if(!*pp){ 336 if(!*pp){
343 *pp = new bucket(); 337 *pp = new bucket();
344 #ifdef LSH_BLOCK_FULL_ROWS 338 #ifdef LSH_BLOCK_FULL_ROWS
345 (*pp)->t2 = 0; // Use t2 as a collision counter for the row 339 (*pp)->t2 = 0; // Use t2 as a collision counter for the row
346 (*pp)->next = new bucket(); 340 (*pp)->next = new bucket();
353 pointCount++; 347 pointCount++;
354 collisionCount++; 348 collisionCount++;
355 __bucket_insert_point((*pp)->next); // First bucket holds collision count 349 __bucket_insert_point((*pp)->next); // First bucket holds collision count
356 } 350 }
357 #else 351 #else
358 pointCount++; 352 pointCount++;
359 __bucket_insert_point(*pp); // No collision count storage 353 __bucket_insert_point(*pp); // No collision count storage
360 #endif 354 #endif
361 return collisionCount; 355 return collisionCount;
362 } 356 }
363 357
364 void H::__bucket_insert_point(bucket* p){ 358 void H::__bucket_insert_point(bucket* p){
450 444
451 // single point insertion; inserted values are hash value and pointID 445 // single point insertion; inserted values are hash value and pointID
452 Uns32T G::insert_point(vector<float>& v, Uns32T pp){ 446 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
453 Uns32T collisionCount = 0; 447 Uns32T collisionCount = 0;
454 H::p = pp; 448 H::p = pp;
455 if(pp>H::maxp) 449 if(pp<=H::maxp)
456 H::maxp=pp; // Store highest pointID in database 450 error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
451 H::maxp=pp; // Store highest pointID in database
457 H::compute_hash_functions( v ); 452 H::compute_hash_functions( v );
458 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion 453 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
459 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); 454 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
460 collisionCount += bucket_insert_point( *(h + j) + t1 ); 455 collisionCount += bucket_insert_point( *(h + j) + t1 );
461 } 456 }
572 // { [T1(t1) T2 t2 p+ [T2 t2 p+]* ]* E. }^L 567 // { [T1(t1) T2 t2 p+ [T2 t2 p+]* ]* E. }^L
573 // 568 //
574 569
575 // Serial header constructors 570 // Serial header constructors
576 SerialHeader::SerialHeader(){;} 571 SerialHeader::SerialHeader(){;}
577 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT): 572 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
578 lshMagic(O2_SERIAL_MAGIC), 573 lshMagic(O2_SERIAL_MAGIC),
579 binWidth(W), 574 binWidth(W),
580 numTables(L), 575 numTables(L),
581 numRows(N), 576 numRows(N),
582 numCols(C), 577 numCols(C),
583 elementSize(O2_SERIAL_ELEMENT_SIZE), 578 elementSize(O2_SERIAL_ELEMENT_SIZE),
584 version(O2_SERIAL_VERSION), 579 version(O2_SERIAL_VERSION),
585 size(L * align_up(N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables 580 size(0), // we are deprecating this value
586 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
587 L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
588 flags(FMT), 581 flags(FMT),
589 dataDim(d), 582 dataDim(d),
590 numFuns(k), 583 numFuns(k),
591 radius(r), 584 radius(r),
592 maxp(p){;} // header 585 maxp(p),
586 size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables
587 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
588 (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
589 pointCount(pc){
590
591 if(FMT==O2_SERIAL_FILEFORMAT2)
592 size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE
593 + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T)
594 +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
595 } // header
593 596
594 float* G::get_serial_hashfunction_base(char* db){ 597 float* G::get_serial_hashfunction_base(char* db){
595 if(db&&lshHeader) 598 if(db&&lshHeader)
596 return (float*)(db+O2_SERIAL_HEADER_SIZE); 599 return (float*)(db+O2_SERIAL_HEADER_SIZE);
597 else return NULL; 600 else return NULL;
618 int dbIsNew=0; 621 int dbIsNew=0;
619 622
620 // Check requested serialFormat 623 // Check requested serialFormat
621 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2)) 624 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
622 error("Unrecognized serial file format request: ", "serialize()"); 625 error("Unrecognized serial file format request: ", "serialize()");
623 626
624 // Test to see if file exists 627 // Test to see if file exists
625 if((dbfid = open (filename, O_RDONLY)) < 0) 628 if((dbfid = open (filename, O_RDONLY)) < 0)
626 // If it doesn't, then create the file (CREATE) 629 // If it doesn't, then create the file (CREATE)
627 if(errno == ENOENT){ 630 if(errno == ENOENT){
628 // Create the file 631 // Create the file
674 int G::serial_can_merge(Uns32T format){ 677 int G::serial_can_merge(Uns32T format){
675 SerialHeaderT* that = lshHeader; 678 SerialHeaderT* that = lshHeader;
676 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) 679 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2)
677 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2) 680 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
678 || !( this->w == that->binWidth && 681 || !( this->w == that->binWidth &&
679 this->L == that->numTables && 682 this->L == that->numTables &&
680 this->N == that->numRows && 683 this->N == that->numRows &&
681 this->k == that->numFuns && 684 this->k == that->numFuns &&
682 this->d == that->dataDim && 685 this->d == that->dataDim &&
683 sizeof(SerialElementT) == that->elementSize && 686 sizeof(SerialElementT) == that->elementSize &&
684 this->radius == that->radius)){ 687 this->radius == that->radius)){
685 serial_print_header(format); 688 serial_print_header(format);
686 return 0; 689 return 0;
687 } 690 }
688 else 691 else
689 return 1; 692 return 1;
972 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) 975 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0)
973 error("Can't create serial file", filename, "open"); 976 error("Can't create serial file", filename, "open");
974 get_lock(dbfid, 1); 977 get_lock(dbfid, 1);
975 978
976 // Make header first to get size of serialized database 979 // Make header first to get size of serialized database
977 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT); 980 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);
978 981
982 cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
983 if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
984 error("Maximum size of LSH file exceded: > 4000MB");
985
979 // go to the location corresponding to the last byte 986 // go to the location corresponding to the last byte
980 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1) 987 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
981 error("lseek error in db file", "", "lseek"); 988 error("lseek error in db file", "", "lseek");
982 989
983 // write a dummy byte at the last location 990 // write a dummy byte at the last location
984 if (write (dbfid, "", 1) != 1) 991 if (write (dbfid, "", 1) != 1)
985 error("write error", "", "write"); 992 error("write error", "", "write");
986 993
987 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); 994 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
988 995
989 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE); 996 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
990 997
991 serial_munmap(db, O2_SERIAL_HEADER_SIZE); 998 serial_munmap(db, O2_SERIAL_HEADER_SIZE);
992 999
993 close(dbfid); 1000 close(dbfid);
994 1001
995 std::cout << "done initializing tables." << endl; 1002 std::cout << "done initializing tables." << endl;
996 1003
997 return 1; 1004 return 1;
1067 H::k = lshHeader->numFuns; 1074 H::k = lshHeader->numFuns;
1068 H::d = lshHeader->dataDim; 1075 H::d = lshHeader->dataDim;
1069 H::w = lshHeader->binWidth; 1076 H::w = lshHeader->binWidth;
1070 H::radius = lshHeader->radius; 1077 H::radius = lshHeader->radius;
1071 H::maxp = lshHeader->maxp; 1078 H::maxp = lshHeader->maxp;
1079 H::pointCount = lshHeader->pointCount;
1072 1080
1073 return dbfid; 1081 return dbfid;
1074 } 1082 }
1075 1083
1076 // unserialize the LSH parameters 1084 // unserialize the LSH parameters