Mercurial > hg > audiodb
changeset 296:f922c234462f
fixed file size allocation for FORMAT2 files. Made LSH index size() in bytes an unsigned long long. Changed the name of lsh_inCore flag to lsh_on_disk (to reverse the sense of the 'flag').
author | mas01mc |
---|---|
date | Fri, 01 Aug 2008 15:04:31 +0000 |
parents | 9347d74a2578 |
children | 7907b50d0995 |
files | audioDB.cpp gengetopt.in index.cpp lshlib.cpp lshlib.h |
diffstat | 5 files changed, 54 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- a/audioDB.cpp Thu Jul 31 19:26:04 2008 +0000 +++ b/audioDB.cpp Fri Aug 01 15:04:31 2008 +0000 @@ -375,7 +375,7 @@ dbName=args_info.database_arg; // Whether to store LSH hash tables for query in core (FORMAT2) - lsh_in_core = args_info.lsh_inCore_flag; + lsh_in_core = args_info.lsh_on_disk_flag; // This flag is set to 0 if on_disk requested lsh_param_w = args_info.lsh_w_arg; if(!(lsh_param_w>0 && lsh_param_w<=O2_SERIAL_MAX_BINWIDTH)) @@ -397,7 +397,9 @@ if(!(lsh_param_b>0 && lsh_param_b<=O2_SERIAL_MAX_TRACKBATCH)) error("Indexing parameter b out of range (1 <= b <= 10000)"); - lsh_param_ncols = args_info.lsh_ncols_arg; + lsh_param_ncols = args_info.lsh_ncols_arg; + if(lsh_in_core) // We don't want to block rows with FORMAT2 indexing + lsh_param_ncols = O2_SERIAL_MAX_COLS; if( !(lsh_param_ncols>0 && lsh_param_ncols<=O2_SERIAL_MAX_COLS)) error("Indexing parameter ncols out of range (1 <= ncols <= 1000"); @@ -464,8 +466,8 @@ error("queryPoint out of range: 0 <= queryPoint <= 10000"); } - // Whether to pre-load LSH hash tables for query - lsh_in_core = args_info.lsh_inCore_flag; + // Whether to pre-load LSH hash tables for query (default on, if flag set then off) + lsh_in_core = args_info.lsh_on_disk_flag; // Whether to perform exact evaluation of points returned by LSH lsh_exact = args_info.lsh_exact_flag;
--- a/gengetopt.in Thu Jul 31 19:26:04 2008 +0000 +++ b/gengetopt.in Fri Aug 01 15:04:31 2008 +0000 @@ -69,7 +69,7 @@ option "lsh_b" - "number of tracks per indexing iteration" int typestr="size" default="500" dependon="INDEX" optional option "lsh_ncols" - "number of columns (collisions) to allocate in LSH serialization" int typestr="size" default="250" dependon="INDEX" optional option "lsh_exact" - "use exact evaluation of points retrieved by LSH." flag off dependon="QUERY" optional -option "lsh_inCore" - "LSH hash tables resident in core (INDEX/QUERY)" flag on optional +option "lsh_on_disk" - "Construct LSH hash tables for on-disk query (INDEX/QUERY)" flag on optional option "lsh_use_u_functions" - "use m independent hash functions combinatorically to approximate L independent hash functions." flag off optional section "Normalization control parameters" sectiondesc="These parameters control the normalization of feaures at query time\n"
--- a/index.cpp Thu Jul 31 19:26:04 2008 +0000 +++ b/index.cpp Fri Aug 01 15:04:31 2008 +0000 @@ -122,6 +122,7 @@ printf("INDEX: lsh_k %d\n", lsh_param_k); printf("INDEX: lsh_m %d\n", lsh_param_m); printf("INDEX: lsh_N %d\n", lsh_param_N); + printf("INDEX: lsh_C %d\n", lsh_param_ncols); printf("INDEX: lsh_b %d\n", lsh_param_b); printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); fflush(stdout);
--- a/lshlib.cpp Thu Jul 31 19:26:04 2008 +0000 +++ b/lshlib.cpp Fri Aug 01 15:04:31 2008 +0000 @@ -2,7 +2,7 @@ //#define __LSH_DUMP_CORE_TABLES__ //#define USE_U_FUNCTIONS -//#define LSH_BLOCK_FULL_ROWS +#define LSH_BLOCK_FULL_ROWS void err(char*s){cout << s << endl;exit(2);} @@ -53,12 +53,6 @@ k++; // make sure k is even cout << "warning: setting k even" << endl; } - - cout << "file size: ~" << (((unsigned long long)L*N*C*sizeof(SerialElementT))/1000000UL) << "MB" << endl; - if(((unsigned long long)L*N*C*sizeof(SerialElementT))>4000000000UL) - error("Maximum size of LSH file exceded: 12*L*N*C > 4000MB"); - else if(((unsigned long long)N*C*sizeof(SerialElementT))>1000000000UL) - cout << "warning: hash tables exceed 1000MB." << endl; // We have the necessary parameters, so construct hashfunction datastructures initialize_lsh_functions(); @@ -338,7 +332,7 @@ } Uns32T H::bucket_insert_point(bucket **pp){ - Uns32T collisionCount = 0; + collisionCount = 0; if(!*pp){ *pp = new bucket(); #ifdef LSH_BLOCK_FULL_ROWS @@ -355,8 +349,8 @@ __bucket_insert_point((*pp)->next); // First bucket holds collision count } #else - pointCount++; - __bucket_insert_point(*pp); // No collision count storage + pointCount++; + __bucket_insert_point(*pp); // No collision count storage #endif return collisionCount; } @@ -452,8 +446,9 @@ Uns32T G::insert_point(vector<float>& v, Uns32T pp){ Uns32T collisionCount = 0; H::p = pp; - if(pp>H::maxp) - H::maxp=pp; // Store highest pointID in database + if(pp<=H::maxp) + error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)"); + H::maxp=pp; // Store highest pointID in database H::compute_hash_functions( v ); for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) ); @@ -574,22 +569,30 @@ // Serial header constructors SerialHeader::SerialHeader(){;} -SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT): +SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc): lshMagic(O2_SERIAL_MAGIC), binWidth(W), numTables(L), numRows(N), numCols(C), elementSize(O2_SERIAL_ELEMENT_SIZE), - version(O2_SERIAL_VERSION), - size(L * align_up(N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables - + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions - L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())), + version(O2_SERIAL_VERSION), + size(0), // we are deprecating this value flags(FMT), dataDim(d), numFuns(k), radius(r), - maxp(p){;} // header + maxp(p), + size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables + + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions + (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())), + pointCount(pc){ + + if(FMT==O2_SERIAL_FILEFORMAT2) + size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE + + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T) + +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn()); +} // header float* G::get_serial_hashfunction_base(char* db){ if(db&&lshHeader) @@ -620,7 +623,7 @@ // Check requested serialFormat if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2)) error("Unrecognized serial file format request: ", "serialize()"); - + // Test to see if file exists if((dbfid = open (filename, O_RDONLY)) < 0) // If it doesn't, then create the file (CREATE) @@ -676,12 +679,12 @@ if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2) || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2) || !( this->w == that->binWidth && - this->L == that->numTables && - this->N == that->numRows && - this->k == that->numFuns && - this->d == that->dataDim && - sizeof(SerialElementT) == that->elementSize && - this->radius == that->radius)){ + this->L == that->numTables && + this->N == that->numRows && + this->k == that->numFuns && + this->d == that->dataDim && + sizeof(SerialElementT) == that->elementSize && + this->radius == that->radius)){ serial_print_header(format); return 0; } @@ -974,8 +977,12 @@ get_lock(dbfid, 1); // Make header first to get size of serialized database - lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT); - + lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount); + + cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl; + if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE) + error("Maximum size of LSH file exceded: > 4000MB"); + // go to the location corresponding to the last byte if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1) error("lseek error in db file", "", "lseek"); @@ -985,11 +992,11 @@ error("write error", "", "write"); char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1); - + memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE); - + serial_munmap(db, O2_SERIAL_HEADER_SIZE); - + close(dbfid); std::cout << "done initializing tables." << endl; @@ -1069,6 +1076,7 @@ H::w = lshHeader->binWidth; H::radius = lshHeader->radius; H::maxp = lshHeader->maxp; + H::pointCount = lshHeader->pointCount; return dbfid; }
--- a/lshlib.h Thu Jul 31 19:26:04 2008 +0000 +++ b/lshlib.h Fri Aug 01 15:04:31 2008 +0000 @@ -59,10 +59,11 @@ #define O2_SERIAL_ELEMENT_SIZE sizeof(SerialElementT) #define O2_SERIAL_MAX_TABLES (200) #define O2_SERIAL_MAX_ROWS (1000000) -#define O2_SERIAL_MAX_COLS (1000) +#define O2_SERIAL_MAX_COLS (100000) #define O2_SERIAL_MAX_DIM (2000) #define O2_SERIAL_MAX_FUNS (100) #define O2_SERIAL_MAX_BINWIDTH (200) +#define O2_SERIAL_MAXFILESIZE (4000000000UL) // Flags for Serial Header #define O2_SERIAL_FILEFORMAT1 (0x1U) // Optimize for on-disk search @@ -118,12 +119,11 @@ Uns32T numFuns; // number of independent hash functions float radius; // 32-bit floating point radius Uns32T maxp; // number of unique IDs in the database - Uns32T value_14; // spare value - Uns32T value_15; // spare value - Uns32T value_16; // spare value + unsigned long long size_long; // long version of size + Uns32T pointCount; // number of points in the database SerialHeader(); - SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float radius, Uns32T p, Uns32T FMT); + SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float radius, Uns32T p, Uns32T FMT, Uns32T pointCount); float get_binWidth(){return binWidth;} Uns32T get_numTables(){return numTables;} @@ -132,10 +132,11 @@ Uns32T get_elementSize(){return elementSize;} Uns32T get_version(){return version;} Uns32T get_flags(){return flags;} - Uns32T get_size(){return size;} + unsigned long long get_size(){return size_long;} Uns32T get_dataDim(){return dataDim;} Uns32T get_numFuns(){return numFuns;} Uns32T get_maxp(){return maxp;} + Uns32T get_pointCount(){return pointCount;} }; #define IFLAG 0xFFFFFFFF @@ -200,6 +201,7 @@ Uns32T maxp; // highest pointID stored in database Uns32T bucketCount; // count of number of point buckets allocated Uns32T pointCount; // count of number of points inserted + Uns32T collisionCount; // number of points collided in a hash-table row Uns32T t1; // first hash table key Uns32T t2; // second hash table key