Mercurial > hg > audiodb
comparison lshlib.cpp @ 296:f922c234462f
fixed file size allocation for FORMAT2 files. Made LSH index size() in bytes an unsigned long long. Changed the name of lsh_inCore flag to lsh_on_disk (to reverse the sense of the 'flag').
author | mas01mc |
---|---|
date | Fri, 01 Aug 2008 15:04:31 +0000 |
parents | 071a108580a4 |
children | b10ad7b6427f |
comparison
equal
deleted
inserted
replaced
295:9347d74a2578 | 296:f922c234462f |
---|---|
1 #include "lshlib.h" | 1 #include "lshlib.h" |
2 | 2 |
3 //#define __LSH_DUMP_CORE_TABLES__ | 3 //#define __LSH_DUMP_CORE_TABLES__ |
4 //#define USE_U_FUNCTIONS | 4 //#define USE_U_FUNCTIONS |
5 //#define LSH_BLOCK_FULL_ROWS | 5 #define LSH_BLOCK_FULL_ROWS |
6 | 6 |
7 void err(char*s){cout << s << endl;exit(2);} | 7 void err(char*s){cout << s << endl;exit(2);} |
8 | 8 |
9 Uns32T get_page_logn(){ | 9 Uns32T get_page_logn(){ |
10 int pagesz = (int)sysconf(_SC_PAGESIZE); | 10 int pagesz = (int)sysconf(_SC_PAGESIZE); |
51 } | 51 } |
52 if(use_u_functions && k%2){ | 52 if(use_u_functions && k%2){ |
53 k++; // make sure k is even | 53 k++; // make sure k is even |
54 cout << "warning: setting k even" << endl; | 54 cout << "warning: setting k even" << endl; |
55 } | 55 } |
56 | |
57 cout << "file size: ~" << (((unsigned long long)L*N*C*sizeof(SerialElementT))/1000000UL) << "MB" << endl; | |
58 if(((unsigned long long)L*N*C*sizeof(SerialElementT))>4000000000UL) | |
59 error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB"); | |
60 else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL) | |
61 cout << "warning: hash tables exceed 1000MB." << endl; | |
62 | 56 |
63 // We have the necessary parameters, so construct hashfunction datastructures | 57 // We have the necessary parameters, so construct hashfunction datastructures |
64 initialize_lsh_functions(); | 58 initialize_lsh_functions(); |
65 } | 59 } |
66 | 60 |
336 } | 330 } |
337 return h; | 331 return h; |
338 } | 332 } |
339 | 333 |
340 Uns32T H::bucket_insert_point(bucket **pp){ | 334 Uns32T H::bucket_insert_point(bucket **pp){ |
341 Uns32T collisionCount = 0; | 335 collisionCount = 0; |
342 if(!*pp){ | 336 if(!*pp){ |
343 *pp = new bucket(); | 337 *pp = new bucket(); |
344 #ifdef LSH_BLOCK_FULL_ROWS | 338 #ifdef LSH_BLOCK_FULL_ROWS |
345 (*pp)->t2 = 0; // Use t2 as a collision counter for the row | 339 (*pp)->t2 = 0; // Use t2 as a collision counter for the row |
346 (*pp)->next = new bucket(); | 340 (*pp)->next = new bucket(); |
353 pointCount++; | 347 pointCount++; |
354 collisionCount++; | 348 collisionCount++; |
355 __bucket_insert_point((*pp)->next); // First bucket holds collision count | 349 __bucket_insert_point((*pp)->next); // First bucket holds collision count |
356 } | 350 } |
357 #else | 351 #else |
358 pointCount++; | 352 pointCount++; |
359 __bucket_insert_point(*pp); // No collision count storage | 353 __bucket_insert_point(*pp); // No collision count storage |
360 #endif | 354 #endif |
361 return collisionCount; | 355 return collisionCount; |
362 } | 356 } |
363 | 357 |
364 void H::__bucket_insert_point(bucket* p){ | 358 void H::__bucket_insert_point(bucket* p){ |
450 | 444 |
451 // single point insertion; inserted values are hash value and pointID | 445 // single point insertion; inserted values are hash value and pointID |
452 Uns32T G::insert_point(vector<float>& v, Uns32T pp){ | 446 Uns32T G::insert_point(vector<float>& v, Uns32T pp){ |
453 Uns32T collisionCount = 0; | 447 Uns32T collisionCount = 0; |
454 H::p = pp; | 448 H::p = pp; |
455 if(pp>H::maxp) | 449 if(pp<=H::maxp) |
456 H::maxp=pp; // Store highest pointID in database | 450 error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)"); |
451 H::maxp=pp; // Store highest pointID in database | |
457 H::compute_hash_functions( v ); | 452 H::compute_hash_functions( v ); |
458 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion | 453 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion |
459 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); | 454 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); |
460 collisionCount += bucket_insert_point( *(h + j) + t1 ); | 455 collisionCount += bucket_insert_point( *(h + j) + t1 ); |
461 } | 456 } |
572 // { [T1(t1) T2 t2 p+ [T2 t2 p+]* ]* E. }^L | 567 // { [T1(t1) T2 t2 p+ [T2 t2 p+]* ]* E. }^L |
573 // | 568 // |
574 | 569 |
575 // Serial header constructors | 570 // Serial header constructors |
576 SerialHeader::SerialHeader(){;} | 571 SerialHeader::SerialHeader(){;} |
577 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT): | 572 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc): |
578 lshMagic(O2_SERIAL_MAGIC), | 573 lshMagic(O2_SERIAL_MAGIC), |
579 binWidth(W), | 574 binWidth(W), |
580 numTables(L), | 575 numTables(L), |
581 numRows(N), | 576 numRows(N), |
582 numCols(C), | 577 numCols(C), |
583 elementSize(O2_SERIAL_ELEMENT_SIZE), | 578 elementSize(O2_SERIAL_ELEMENT_SIZE), |
584 version(O2_SERIAL_VERSION), | 579 version(O2_SERIAL_VERSION), |
585 size(L * align_up(N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables | 580 size(0), // we are deprecating this value |
586 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions | |
587 L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())), | |
588 flags(FMT), | 581 flags(FMT), |
589 dataDim(d), | 582 dataDim(d), |
590 numFuns(k), | 583 numFuns(k), |
591 radius(r), | 584 radius(r), |
592 maxp(p){;} // header | 585 maxp(p), |
586 size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables | |
587 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions | |
588 (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())), | |
589 pointCount(pc){ | |
590 | |
591 if(FMT==O2_SERIAL_FILEFORMAT2) | |
592 size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE | |
593 + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T) | |
594 +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn()); | |
595 } // header | |
593 | 596 |
594 float* G::get_serial_hashfunction_base(char* db){ | 597 float* G::get_serial_hashfunction_base(char* db){ |
595 if(db&&lshHeader) | 598 if(db&&lshHeader) |
596 return (float*)(db+O2_SERIAL_HEADER_SIZE); | 599 return (float*)(db+O2_SERIAL_HEADER_SIZE); |
597 else return NULL; | 600 else return NULL; |
618 int dbIsNew=0; | 621 int dbIsNew=0; |
619 | 622 |
620 // Check requested serialFormat | 623 // Check requested serialFormat |
621 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2)) | 624 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2)) |
622 error("Unrecognized serial file format request: ", "serialize()"); | 625 error("Unrecognized serial file format request: ", "serialize()"); |
623 | 626 |
624 // Test to see if file exists | 627 // Test to see if file exists |
625 if((dbfid = open (filename, O_RDONLY)) < 0) | 628 if((dbfid = open (filename, O_RDONLY)) < 0) |
626 // If it doesn't, then create the file (CREATE) | 629 // If it doesn't, then create the file (CREATE) |
627 if(errno == ENOENT){ | 630 if(errno == ENOENT){ |
628 // Create the file | 631 // Create the file |
674 int G::serial_can_merge(Uns32T format){ | 677 int G::serial_can_merge(Uns32T format){ |
675 SerialHeaderT* that = lshHeader; | 678 SerialHeaderT* that = lshHeader; |
676 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) | 679 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) |
677 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2) | 680 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2) |
678 || !( this->w == that->binWidth && | 681 || !( this->w == that->binWidth && |
679 this->L == that->numTables && | 682 this->L == that->numTables && |
680 this->N == that->numRows && | 683 this->N == that->numRows && |
681 this->k == that->numFuns && | 684 this->k == that->numFuns && |
682 this->d == that->dataDim && | 685 this->d == that->dataDim && |
683 sizeof(SerialElementT) == that->elementSize && | 686 sizeof(SerialElementT) == that->elementSize && |
684 this->radius == that->radius)){ | 687 this->radius == that->radius)){ |
685 serial_print_header(format); | 688 serial_print_header(format); |
686 return 0; | 689 return 0; |
687 } | 690 } |
688 else | 691 else |
689 return 1; | 692 return 1; |
972 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) | 975 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) |
973 error("Can't create serial file", filename, "open"); | 976 error("Can't create serial file", filename, "open"); |
974 get_lock(dbfid, 1); | 977 get_lock(dbfid, 1); |
975 | 978 |
976 // Make header first to get size of serialized database | 979 // Make header first to get size of serialized database |
977 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT); | 980 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount); |
978 | 981 |
982 cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl; | |
983 if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE) | |
984 error("Maximum size of LSH file exceded: > 4000MB"); | |
985 | |
979 // go to the location corresponding to the last byte | 986 // go to the location corresponding to the last byte |
980 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1) | 987 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1) |
981 error("lseek error in db file", "", "lseek"); | 988 error("lseek error in db file", "", "lseek"); |
982 | 989 |
983 // write a dummy byte at the last location | 990 // write a dummy byte at the last location |
984 if (write (dbfid, "", 1) != 1) | 991 if (write (dbfid, "", 1) != 1) |
985 error("write error", "", "write"); | 992 error("write error", "", "write"); |
986 | 993 |
987 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); | 994 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); |
988 | 995 |
989 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE); | 996 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE); |
990 | 997 |
991 serial_munmap(db, O2_SERIAL_HEADER_SIZE); | 998 serial_munmap(db, O2_SERIAL_HEADER_SIZE); |
992 | 999 |
993 close(dbfid); | 1000 close(dbfid); |
994 | 1001 |
995 std::cout << "done initializing tables." << endl; | 1002 std::cout << "done initializing tables." << endl; |
996 | 1003 |
997 return 1; | 1004 return 1; |
1067 H::k = lshHeader->numFuns; | 1074 H::k = lshHeader->numFuns; |
1068 H::d = lshHeader->dataDim; | 1075 H::d = lshHeader->dataDim; |
1069 H::w = lshHeader->binWidth; | 1076 H::w = lshHeader->binWidth; |
1070 H::radius = lshHeader->radius; | 1077 H::radius = lshHeader->radius; |
1071 H::maxp = lshHeader->maxp; | 1078 H::maxp = lshHeader->maxp; |
1079 H::pointCount = lshHeader->pointCount; | |
1072 | 1080 |
1073 return dbfid; | 1081 return dbfid; |
1074 } | 1082 } |
1075 | 1083 |
1076 // unserialize the LSH parameters | 1084 // unserialize the LSH parameters |