# HG changeset patch # User mas01mc # Date 1219354113 0 # Node ID c93be2f3a674b710ecdf803f9606a34f27985d3f # Parent d2c56d4f841ebf3ba797e1478234b82ed3c43427 Merge of branches/large_adb -r 514:524 onto the trunk. No conflicts. Added LARGE_ADB support. Turn on with --ntracks 20001 or greater. Use --adb_feature_root to locate feature files at QUERY time. A bug fix in LSH indexing that was incorrectly thresholding large numbers of shingles. diff -r d2c56d4f841e -r c93be2f3a674 audioDB.cpp --- a/audioDB.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/audioDB.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -1,19 +1,21 @@ #include "audioDB.h" LSH* SERVER_LSH_INDEX_SINGLETON; +char* SERVER_ADB_ROOT; +char* SERVER_ADB_FEATURE_ROOT; PointPair::PointPair(Uns32T a, Uns32T b, Uns32T c):trackID(a),qpos(b),spos(c){}; bool operator<(const PointPair& a, const PointPair& b){ - return ( (a.qpos(const PointPair& a, const PointPair& b){ - return ( (a.qpos>b.qpos) || - ((a.qpos==b.qpos) && - ( (a.trackID>b.trackID)) || ((a.trackID==b.trackID)&&(a.spos>b.spos)) ) ); + return ( (a.trackID>b.trackID) || + ( (a.trackID==b.trackID) && + ( (a.spos>b.spos) || ( (a.spos==b.spos) && (a.qpos > b.qpos) )) ) ); } bool operator==(const PointPair& a, const PointPair& b){ @@ -34,6 +36,10 @@ error("No command found"); } + // Perform database prefix substitution + if(adb_root) + prefix_name((char** const)&dbName, adb_root); + if(O2_ACTION(COM_SERVER)) startServer(); @@ -86,6 +92,9 @@ try { isServer = 1; // FIXME: Hack processArgs(argc, argv); + // Perform database prefix substitution + if(adb_root) + prefix_name((char** const)&dbName, adb_root); assert(O2_ACTION(COM_QUERY)); query(dbName, inFile, adbQueryResponse); } catch(char *err) { @@ -99,6 +108,9 @@ try { isServer = 1; // FIXME: Hack processArgs(argc, argv); + // Perform database prefix substitution + if(adb_root) + prefix_name((char** const)&dbName, adb_root); assert(O2_ACTION(COM_STATUS)); status(dbName, adbStatusResponse); } catch(char *err) { @@ -125,6 +137,12 @@ munmap(powerTable, powerTableLength); if(l2normTable) munmap(l2normTable, l2normTableLength); + if(featureFileNameTable) + munmap(featureFileNameTable, fileTableLength); + if(timesFileNameTable) + munmap(timesFileNameTable, fileTableLength); + if(powerFileNameTable) + munmap(powerFileNameTable, fileTableLength); if(trackOffsetTable) delete trackOffsetTable; if(reporter) @@ -237,6 +255,20 @@ relative_threshold = args_info.relative_threshold_arg; } + if (args_info.adb_root_given){ + adb_root = args_info.adb_root_arg; + } + + if (args_info.adb_feature_root_given){ + adb_feature_root = args_info.adb_feature_root_arg; + } + + // perform dbName path prefix SERVER-side subsitution + if(SERVER_ADB_ROOT && !adb_root) + adb_root = SERVER_ADB_ROOT; + if(SERVER_ADB_FEATURE_ROOT && !adb_feature_root) + adb_feature_root = SERVER_ADB_FEATURE_ROOT; + if(args_info.SERVER_given){ command=COM_SERVER; port=args_info.SERVER_arg; @@ -527,15 +559,23 @@ std::cout << "data dim:" << dbH->dim <dim>0){ std::cout << "total vectors:" << dbH->length/(sizeof(double)*dbH->dim)<timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << std::endl; + if(dbH->flags & O2_FLAG_LARGE_ADB) + std::cout << "vectors available:" << O2_MAX_VECTORS - (dbH->length / (sizeof(double)*dbH->dim)) << std::endl; + else + std::cout << "vectors available:" << (dbH->timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << std::endl; } - std::cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl; - std::cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" << - (100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl; + if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ + std::cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl; + std::cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" << + (100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl; + } std::cout << "flags:" << " l2norm[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_L2NORM) << "] minmax[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_MINMAX) << "] power[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_POWER) - << "] times[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_TIMES) << "]" << endl; + << "] times[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_TIMES) + << "] largeADB[" << DISPLAY_FLAG(dbH->flags&O2_FLAG_LARGE_ADB) + << "]" << endl; + std::cout << "null count: " << nullCount << " small sequence count " << dudCount-nullCount << std::endl; } else { adbStatusResponse->result.numFiles = dbH->numFiles; @@ -550,7 +590,7 @@ void audioDB::l2norm(const char* dbName) { forWrite = true; initTables(dbName, 0); - if(dbH->length>0){ + if( !(dbH->flags & O2_FLAG_LARGE_ADB ) && (dbH->length>0) ){ /* FIXME: should probably be uint64_t */ unsigned numVectors = dbH->length/(sizeof(double)*dbH->dim); CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); @@ -563,8 +603,8 @@ void audioDB::power_flag(const char *dbName) { forWrite = true; - initTables(dbName, 0); - if (dbH->length > 0) { + initTables(dbName, 0); + if( !(dbH->flags & O2_FLAG_LARGE_ADB ) && (dbH->length>0) ){ error("cannot turn on power storage for non-empty database", dbName); } dbH->flags |= O2_FLAG_POWER; @@ -583,7 +623,7 @@ assert(l2normTable); - if( !append && (dbH->flags & O2_FLAG_L2NORM) ) + if( !(dbH->flags & O2_FLAG_LARGE_ADB) && !append && (dbH->flags & O2_FLAG_L2NORM) ) error("Database is already L2 normed", "automatic norm on insert is enabled"); VERB_LOG(2, "norming %u vectors...", n); @@ -624,5 +664,7 @@ // so it is a good place to set any global state variables int main(const unsigned argc, char* const argv[]){ SERVER_LSH_INDEX_SINGLETON = 0; // Initialize global variables + SERVER_ADB_ROOT = 0; // Server-side database root prefix + SERVER_ADB_FEATURE_ROOT = 0; // Server-side features root prefix audioDB(argc, argv); } diff -r d2c56d4f841e -r c93be2f3a674 audioDB.h --- a/audioDB.h Tue Aug 12 14:25:51 2008 +0000 +++ b/audioDB.h Thu Aug 21 21:28:33 2008 +0000 @@ -80,16 +80,20 @@ #define O2_DEFAULTDBSIZE (2000000000) // 2GB table size // Bit masks for packing (trackID,pointID) into 32-bit unsigned int -#define LSH_N_POINT_BITS 14 -#define LSH_TRACK_MASK 0xFFFFC000U // 2^18 = 262144 tracks -#define LSH_POINT_MASK 0x00003FFFU // 2^14 = 16384 points per track +// This can be controlled at compile time +#define O2_DEFAULT_LSH_N_POINT_BITS 14 + +// Override the default point bit width for large database support +#ifndef LSH_N_POINT_BITS +#define LSH_N_POINT_BITS O2_DEFAULT_LSH_N_POINT_BITS +#endif // LIMIT PARAMETERS #define O2_DEFAULT_DATASIZE (1355U) // in MB #define O2_DEFAULT_NTRACKS (20000U) #define O2_DEFAULT_DATADIM (9U) #define O2_REALTYPE (double) -#define O2_MAXFILES (20000U) +#define O2_MAXFILES (1000000U) #define O2_MAXFILESTR (256U) #define O2_FILETABLE_ENTRY_SIZE (O2_MAXFILESTR) #define O2_TRACKTABLE_ENTRY_SIZE (sizeof(unsigned)) @@ -98,17 +102,21 @@ #define O2_MAXDIM (2000U) #define O2_MAXNN (1000000U) #define O2_MAXSEQLEN (8000U) // maximum feature vectors in a sequence -#define O2_MAXTRACKS (10000U) // maximum number of tracks -#define O2_MAXTRACKLEN ((LSH_POINT_MASK+1)) // maximum shingles in a track +#define O2_MAXTRACKS (1000000U) // maximum number of tracks +#define O2_MAXTRACKLEN (1<numFiles*O2_FILETABLE_ENTRY_SIZE, STR, strlen(STR)); + +#define SAFE_DELETE(PTR) delete PTR; PTR=0; +#define SAFE_DELETE_ARRAY(PTR) delete[] PTR; PTR=0; + extern LSH* SERVER_LSH_INDEX_SINGLETON; +extern char* SERVER_ADB_ROOT; +extern char* SERVER_ADB_FEATURE_ROOT; typedef struct dbTableHeader { uint32_t magic; @@ -192,8 +210,10 @@ std::ifstream *timesFile; const char *powerFileName; std::ifstream *powerFile; + const char* adb_root; + const char* adb_feature_root; + int powerfd; - int dbfid; int lshfid; bool forWrite; @@ -205,15 +225,19 @@ gsl_rng *rng; - char *fileTable; + char* fileTable; unsigned* trackTable; - off_t *trackOffsetTable; + off_t* trackOffsetTable; double* dataBuf; double* inBuf; double* l2normTable; double* timesTable; double* powerTable; + char* featureFileNameTable; + char* timesFileNameTable; + char* powerFileNameTable; + size_t fileTableLength; size_t trackTableLength; off_t dataBufLength; @@ -269,7 +293,7 @@ void initialize_arrays(int track, unsigned int numVectors, double *query, double *data_buffer, double **D, double **DD); void delete_arrays(int track, unsigned int numVectors, double **D, double **DD); - void read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p); + void read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p); void set_up_query(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned int *nvp); void set_up_query_from_key(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned *nvp, Uns32T queryIndex); void set_up_db(double **snp, double **vsnp, double **spp, double **vspp, double **mddp, unsigned int *dvp); @@ -278,7 +302,7 @@ double dot_product_points(double* q, double* p, Uns32T L); void initRNG(); void initDBHeader(const char *dbName); - void initInputFile(const char *inFile); + void initInputFile(const char *inFile, bool loadData = true); void initTables(const char* dbName, const char* inFile = 0); void initTablesFromKey(const char* dbName, const Uns32T queryIndex); void unitNorm(double* X, unsigned d, unsigned n, double* qNorm); @@ -286,6 +310,8 @@ void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata); void insertPowerData(unsigned n, int powerfd, double *powerdata); unsigned getKeyPos(char* key); + void prefix_name(char** const name, const char* prefix); + public: audioDB(const unsigned argc, char* const argv[]); audioDB(const unsigned argc, char* const argv[], adb__queryResponse *adbQueryResponse); @@ -301,6 +327,7 @@ void insert_data_vectors(off_t offset, void *buffer, size_t size); void insert(const char* dbName, const char* inFile); void batchinsert(const char* dbName, const char* inFile); + void batchinsert_large_adb(const char* dbName, const char* inFile); void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); void status(const char* dbName, adb__statusResponse *adbStatusResponse=0); unsigned random_track(unsigned *propTable, unsigned total); @@ -322,6 +349,8 @@ Uns32T lsh_param_N; // Number of rows per hash table Uns32T lsh_param_b; // Batch size, in number of tracks, per indexing iteration Uns32T lsh_param_ncols; // Maximum number of collision in a hash-table row + Uns32T lsh_n_point_bits; // How many bits to use to encode point ID within a track + // LSH vector<> containers for one in-core copy of a set of feature vectors vector::iterator vi; // feature vector iterator @@ -342,13 +371,14 @@ char* index_get_name(const char*dbName, double radius, Uns32T sequenceLength); static void index_add_point_approximate(void* instance, Uns32T pointID, Uns32T qpos, float dist); // static point reporter callback method static void index_add_point_exact(void* instance, Uns32T pointID, Uns32T qpos, float dist); // static point reporter callback method - static Uns32T index_to_trackID(Uns32T lshID); // Convert lsh point index to audioDB trackID - static Uns32T index_to_trackPos(Uns32T lshID); // Convert lsh point index to audioDB trackPos (spos) - static Uns32T index_from_trackInfo(Uns32T, Uns32T); // Convert audioDB trackID and trackPos to an lsh point index + static Uns32T index_to_trackID(Uns32T lshID, Uns32T nPntBits); // Convert lsh point index to audioDB trackID + static Uns32T index_to_trackPos(Uns32T lshID, Uns32T nPntBits); // Convert lsh point index to audioDB trackPos (spos) + static Uns32T index_from_trackInfo(Uns32T trackID, Uns32T pntID, Uns32T nPntBits); // Convert audioDB trackID and trackPos to an lsh point index void initialize_exact_evalutation_queue(); void index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos); LSH* index_allocate(char* indexName, bool load_hashTables); - + void init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp); + // Web Services void startServer(); void ws_status(const char*dbName, char* hostport); @@ -370,7 +400,9 @@ timesFile(0), \ powerFileName(0), \ powerFile(0), \ - powerfd(0), \ + adb_root(0), \ + adb_feature_root(0), \ + powerfd(0), \ dbfid(0), \ lshfid(0), \ forWrite(false), \ @@ -386,6 +418,9 @@ l2normTable(0), \ timesTable(0), \ powerTable(0), \ + featureFileNameTable(0), \ + timesFileNameTable(0), \ + powerFileNameTable(0), \ fileTableLength(0), \ trackTableLength(0), \ dataBufLength(0), \ @@ -431,5 +466,6 @@ lsh_param_N(0), \ lsh_param_b(0), \ lsh_param_ncols(0), \ + lsh_n_point_bits(0), \ vv(0) #endif diff -r d2c56d4f841e -r c93be2f3a674 common.cpp --- a/common.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/common.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -126,10 +126,18 @@ } else { fileTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE); trackTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_TRACKTABLE_ENTRY_SIZE); - dataBufLength = ALIGN_PAGE_UP(dbH->length); - timesTableLength = ALIGN_PAGE_UP(2*(dbH->length / dbH->dim)); - powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); - l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + dataBufLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE); + timesTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE); + powerTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLE_ENTRY_SIZE); + l2normTableLength = 0; + } + else{ + dataBufLength = ALIGN_PAGE_UP(dbH->length); + timesTableLength = ALIGN_PAGE_UP(2*(dbH->length / dbH->dim)); + powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + } } CHECKED_MMAP(char *, fileTable, dbH->fileTableOffset, fileTableLength); CHECKED_MMAP(unsigned *, trackTable, dbH->trackTableOffset, trackTableLength); @@ -143,9 +151,18 @@ * * CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); */ - CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength); - CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength); - CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength); + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + CHECKED_MMAP(char *, featureFileNameTable, dbH->dataOffset, fileTableLength); + if( dbH->flags & O2_FLAG_TIMES ) + CHECKED_MMAP(char *, timesFileNameTable, dbH->timesTableOffset, fileTableLength); + if( dbH->flags & O2_FLAG_POWER ) + CHECKED_MMAP(char *, powerFileNameTable, dbH->powerTableOffset, fileTableLength); + } + else{ + CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength); + CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength); + CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength); + } } // build track offset table @@ -154,10 +171,15 @@ for(Uns32T k = 0; k < dbH->numFiles; k++){ trackOffsetTable[k] = cumTrack; cumTrack += trackTable[k] * dbH->dim; - } + } + + // Assign correct number of point bits per track in LSH indexing / retrieval + lsh_n_point_bits = dbH->flags >> 28; + if( !lsh_n_point_bits ) + lsh_n_point_bits = O2_DEFAULT_LSH_N_POINT_BITS; } -void audioDB::initInputFile (const char *inFile) { +void audioDB::initInputFile (const char *inFile, bool loadData) { if (inFile) { if ((infid = open(inFile, O_RDONLY)) < 0) { error("can't open input file for reading", inFile, "open"); @@ -189,7 +211,7 @@ } } - if ((indata = (char *) mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, infid, 0)) == (caddr_t) -1) { + if (loadData && ((indata = (char *) mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, infid, 0)) == (caddr_t) -1)) { error("mmap error for input", inFile, "mmap"); } } @@ -208,3 +230,21 @@ initInputFile(inFile); } +// If name is relative path, side effect name with prefix/name +// Do not free original pointer +void audioDB::prefix_name(char** const name, const char* prefix){ + // No prefix if prefix is empty + if(!prefix) + return; + // Allocate new memory, keep old memory + assert(name && *name); + if (strlen(*name) + strlen(prefix) + 1 > O2_MAXFILESTR) + error("error: path prefix + filename too long",prefix); + // Do not prefix absolute path+filename + if(**name=='/') + return; + // OK to prefix relative path+filename + char* prefixedName = (char*) malloc(O2_MAXFILESTR); + sprintf(prefixedName, "%s/%s", prefix, *name); + *name = prefixedName; // side effect new name to old name +} diff -r d2c56d4f841e -r c93be2f3a674 create.cpp --- a/create.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/create.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -2,6 +2,7 @@ /* Make a new database. +IF size(featuredata) < O2_LARGE_ADB_SIZE The database consists of: * a header (see dbTableHeader struct definition); @@ -12,6 +13,16 @@ * timesTable: (start,end) time points for each feature vector; * powerTable: associated power for each feature vector; * l2normTable: squared l2norms for each feature vector. + +ELSE the database consists of: + + * a header (see dbTableHeader struct definition); + * keyTable: list of keys of tracks + * trackTable: sizes of tracks + * featureTable: list of feature file names + * timesTable: list of times file names + * powerTable: list of power file names + */ void audioDB::create(const char* dbName){ @@ -41,10 +52,31 @@ off_t databytes = ((off_t) datasize) * 1024 * 1024; off_t auxbytes = databytes / datadim; - dbH->timesTableOffset = ALIGN_PAGE_UP(dbH->dataOffset + databytes); - dbH->powerTableOffset = ALIGN_PAGE_UP(dbH->timesTableOffset + 2*auxbytes); - dbH->l2normTableOffset = ALIGN_PAGE_UP(dbH->powerTableOffset + auxbytes); - dbH->dbSize = ALIGN_PAGE_UP(dbH->l2normTableOffset + auxbytes); + // For backward-compatibility, Record the point-encoding parameter for LSH indexing in the adb header + // If this value is 0 then it will be set to 14 + +#if O2_LSH_N_POINT_BITS > 15 +#error "AudioDB Compile ERROR: consistency check of O2_LSH_POINT_BITS failed (>15)" +#endif + + dbH->flags |= LSH_N_POINT_BITS << 28; + + // If database will fit in a single file the vectors are copied into the AudioDB instance + // Else all the vectors are left on the FileSystem and we use the dataOffset as storage + // for the location of the features, powers and times files (assuming that arbitrary keys are used for the fileTable) + if(ntrackstimesTableOffset = ALIGN_PAGE_UP(dbH->dataOffset + databytes); + dbH->powerTableOffset = ALIGN_PAGE_UP(dbH->timesTableOffset + 2*auxbytes); + dbH->l2normTableOffset = ALIGN_PAGE_UP(dbH->powerTableOffset + auxbytes); + dbH->dbSize = ALIGN_PAGE_UP(dbH->l2normTableOffset + auxbytes); + } + else{ // Create LARGE_ADB, features and powers kept on filesystem + dbH->flags |= O2_FLAG_LARGE_ADB; + dbH->timesTableOffset = ALIGN_PAGE_UP(dbH->dataOffset + O2_FILETABLE_ENTRY_SIZE*ntracks); + dbH->powerTableOffset = ALIGN_PAGE_UP(dbH->timesTableOffset + O2_FILETABLE_ENTRY_SIZE*ntracks); + dbH->l2normTableOffset = ALIGN_PAGE_UP(dbH->powerTableOffset + O2_FILETABLE_ENTRY_SIZE*ntracks); + dbH->dbSize = dbH->l2normTableOffset; + } write(dbfid, dbH, O2_HEADERSIZE); diff -r d2c56d4f841e -r c93be2f3a674 dump.cpp --- a/dump.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/dump.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -5,6 +5,10 @@ initTables(dbName, 0); } + if(dbH->flags & O2_FLAG_LARGE_ADB){ + error("error: dump not supported for LARGE_ADB"); + } + if((mkdir(output, S_IRWXU|S_IRWXG|S_IRWXO)) < 0) { error("error making output directory", output, "mkdir"); } diff -r d2c56d4f841e -r c93be2f3a674 gengetopt.in --- a/gengetopt.in Tue Aug 12 14:25:51 2008 +0000 +++ b/gengetopt.in Thu Aug 21 21:28:33 2008 +0000 @@ -8,6 +8,7 @@ section "Database Operations" sectiondesc="All database operations require a database argument." option "database" d "database file required by Database commands." string typestr="filename" optional +option "adb_root" - "path prefix for database" string typestr="path" dependon="database" optional section "Database Creation" sectiondesc="Creating a new database file." @@ -23,7 +24,7 @@ option "output" - "output directory" string dependon="DUMP" default="audioDB.dump" optional option "L2NORM" L "unit norm vectors and norm all future inserts." dependon="database" optional option "POWER" P "turn on power flag for database." dependon="database" optional -option "INDEX" X "build an index for -d database at -R radius" dependon="database" dependon="radius" optional + section "Database Information" sectiondesc="Information about databases." option "STATUS" S "output database information to stdout." dependon="database" optional @@ -33,7 +34,7 @@ section "Database Insertion" sectiondesc="The following commands insert feature files, with optional keys and timestamps.\n" option "INSERT" I "add feature vectors to an existing database." dependon="features" optional -option "UPDATE" U "replace inserted vectors associated with key with new input vectors." dependon="features" dependon="key" dependon="database" optional hidden +option "adb_feature_root" - "path prefix for feature files, times files and power files" string typestr="path" optional option "features" f "binary series of vectors file {int sz:ieee double[][sz]:eof}." string typestr="filename" dependon="database" optional option "times" t "list of time points (ascii) for feature vectors." string typestr="filename" dependon="features" optional option "power" w "binary power feature file." string typestr="filename" dependon="database" optional @@ -62,6 +63,7 @@ section "Locality-sensitive hashing (LSH) parameters" sectiondesc="These parameters control LSH indexing and retrieval\n" +option "INDEX" X "build an index for -d database at -R radius and -l sequenceLength" dependon="database" dependon="radius" optional option "lsh_w" - "width of LSH hash-function bins. " double default="4.0" dependon="INDEX" optional hidden option "lsh_k" - "even number of independent hash functions to employ with LSH" int typestr="size" default="8" dependon="INDEX" optional option "lsh_m" - "number of hash tables is m(m-1)/2" int typestr="size" default="5" dependon="INDEX" optional @@ -79,9 +81,10 @@ section "Web Services" sectiondesc="These commands enable the database process to establish a connection via the internet and operate as separate client and server processes.\n" option "SERVER" s "run as standalone web service on named port." int typestr="port" default="14475" optional +option "load_index" - "make web service with memory-resident hashtables" flag off dependon="radius" optional option "client" c "run as a client using named host service." string typestr="hostname:port" optional -option "load_index" - "make web service with memory-resident hashtables" flag off dependon="radius" optional + text " -Copyright (c) 2007 Michael Casey, Christophe Rhodes +Copyright (c) 2007-2008 Michael Casey, Christophe Rhodes Goldsmiths, University of London" diff -r d2c56d4f841e -r c93be2f3a674 index.cpp --- a/index.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/index.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -8,22 +8,27 @@ // // Author: Michael Casey // Date: 23 June 2008 +// +// 19th August 2008 - added O2_FLAG_LARGE_ADB support #include "audioDB.h" #include "ReporterBase.h" /************************* LSH point index to audioDB conversion *****************/ -Uns32T audioDB::index_to_trackID(Uns32T lshID){ - return lshID>>LSH_N_POINT_BITS; +Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){ + assert(nPntBits); + return lshID>>nPntBits; } -Uns32T audioDB::index_to_trackPos(Uns32T lshID){ - return lshID&LSH_POINT_MASK; +Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){ + assert(nPntBits); + return lshID&((1<get_indexName(), indexName, MAXSTR)==0) ) @@ -78,19 +87,20 @@ // Prepare the AudioDB database for read access and allocate auxillary memory void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { + if (!(dbH->flags & O2_FLAG_POWER)) { + error("INDEXed database must be power-enabled", dbName); + } + + double *snpp = *snp, *sppp = 0; + *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector - - double *snpp = *snp, *sppp = 0; - memcpy(*snp, l2normTable, *dvp * sizeof(double)); - - if (!(dbH->flags & O2_FLAG_POWER)) { - error("database not power-enabled", dbName); - } *spp = new double[*dvp]; // song powertable pointer sppp = *spp; + memcpy(*snp, l2normTable, *dvp * sizeof(double)); memcpy(*spp, powerTable, *dvp * sizeof(double)); - + + for(Uns32T i = 0; i < dbH->numFiles; i++){ if(trackTable[i] >= sequenceLength) { sequence_sum(snpp, trackTable[i], sequenceLength); @@ -102,10 +112,10 @@ snpp += trackTable[i]; sppp += trackTable[i]; } - + *vsnp = *snp; *vspp = *spp; - + // Move the feature vector read pointer to start of fetures in database lseek(dbfid, dbH->dataOffset, SEEK_SET); } @@ -113,22 +123,28 @@ /************************ LSH indexing ***********************************/ void audioDB::index_index_db(const char* dbName){ - char* newIndexName; double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; Uns32T dbVectors = 0; + printf("INDEX: initializing header\n"); // Check if audioDB exists, initialize header and open database for read forWrite = false; initDBHeader(dbName); + if(dbH->flags & O2_FLAG_POWER) + usingPower = true; + + if(dbH->flags & O2_FLAG_TIMES) + usingTimes = true; + newIndexName = index_get_name(dbName, radius, sequenceLength); // Set unit norming flag override audioDB::normalizedDistance = !audioDB::no_unit_norming; - printf("INDEX: dim %d\n", dbH->dim); + printf("INDEX: dim %d\n", (int)dbH->dim); printf("INDEX: R %f\n", radius); printf("INDEX: seqlen %d\n", sequenceLength); printf("INDEX: lsh_w %f\n", lsh_param_w); @@ -141,8 +157,6 @@ fflush(stdout); - index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); - if((lshfid = open(newIndexName,O_RDONLY))<0){ printf("INDEX: constructing new LSH index\n"); printf("INDEX: making index file %s\n", newIndexName); @@ -160,7 +174,10 @@ if( endTrack > dbH->numFiles) endTrack = dbH->numFiles; // Insert up to lsh_param_b tracks - index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); + if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ + index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); + } + index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Clean up @@ -177,7 +194,7 @@ // Get the lsh header info and find how many tracks are inserted already lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here assert(lsh); - Uns32T maxs = index_to_trackID(lsh->get_maxp())+1; + Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1; delete lsh; lsh = 0; @@ -211,14 +228,66 @@ exit(1); } + delete[] newIndexName; + delete[] sNorm; + delete[] sPower; +} - delete[] newIndexName; - if(sNorm) - delete[] sNorm; - if(sPower) - delete[] sPower; +// initialize auxillary track data from filesystem +// pre-conditions: +// dbH->flags & O2_FLAG_LARGE_ADB +// feature data allocated and copied (fvp) +// +// post-conditions: +// allocated power data +// allocated l2norm data +// +void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ + if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) + error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); + // Allocate and read the power sequence + if(trackTable[trackID]>=sequenceLength){ + + char* prefixedString = new char[O2_MAXFILESTR]; + char* tmpStr = prefixedString; + // Open and check dimensions of power file + strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); + prefix_name((char ** const)&prefixedString, adb_feature_root); + if(prefixedString!=tmpStr) + delete[] tmpStr; + powerfd = open(prefixedString, O_RDONLY); + if (powerfd < 0) { + error("failed to open power file", prefixedString); + } + if (fstat(powerfd, &statbuf) < 0) { + error("fstat error finding size of power file", prefixedString, "fstat"); + } + + if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) + error("Dimension mismatch: numPowers != numVectors", prefixedString); + + *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values + assert(*sPowerp); + *spPtrp = *sPowerp; + insertPowerData(trackTable[trackID], powerfd, *sPowerp); + if (0 < powerfd) { + close(powerfd); + } + + sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); + sequence_average(*sPowerp, trackTable[trackID], sequenceLength); + powerTable = 0; + + // Allocate and calculate the l2norm sequence + *sNormpp = new double[trackTable[trackID]]; + assert(*sNormpp); + *snPtrp = *sNormpp; + unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp); + sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); + sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); + } } void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, @@ -230,13 +299,35 @@ VERB_LOG(1, "indexing tracks..."); - + int trackfd = dbfid; for(trackID = start_track ; trackID < end_track ; trackID++ ){ - read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + char* prefixedString = new char[O2_MAXFILESTR]; + char* tmpStr = prefixedString; + // Open and check dimensions of feature file + strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); + prefix_name((char ** const) &prefixedString, adb_feature_root); + if(prefixedString!=tmpStr) + delete[] tmpStr; + initInputFile(prefixedString, false); // nommap, file pointer at correct position + trackfd = infid; + } + read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv *fvpp = fvp; // Protect memory allocation and free() for track data + + if( dbH->flags & O2_FLAG_LARGE_ADB ) + // Load power and calculate power and l2norm sequence sums + init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); + if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) break; - } + if ( dbH->flags & O2_FLAG_LARGE_ADB ){ + close(infid); + delete[] *sNormpp; + delete[] *sPowerp; + *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; + } + } // end for(trackID = start_track ; ... ) std::cout << "finished inserting." << endl; } @@ -256,13 +347,17 @@ numVecs = trackTable[trackID] - sequenceLength + 1; } } - vv = index_initialize_shingles(numVecs); - - for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) - index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); - Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); - Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp); + Uns32T numVecsAboveThreshold = 0, collisionCount = 0; + if(numVecs){ + vv = index_initialize_shingles(numVecs); + + for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) + index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); + + numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); + collisionCount = index_insert_shingles(vv, trackID, *sppp); + } float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; /* index_norm_shingles() only goes as far as the end of the @@ -273,9 +368,11 @@ * So let's be certain the pointers are in the correct place */ - *snpp += trackTable[trackID]; - *sppp += trackTable[trackID]; - *fvpp += trackTable[trackID] * dbH->dim; + if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ + *snpp += trackTable[trackID]; + *sppp += trackTable[trackID]; + *fvpp += trackTable[trackID] * dbH->dim; + } std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; std::cout.flush(); @@ -285,10 +382,10 @@ Uns32T audioDB::index_insert_shingles(vector >* vv, Uns32T trackID, double* spp){ Uns32T collisionCount = 0; cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; - for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop) - if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){ - collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID)); - spp+=sequenceHop; + for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ + if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) + collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits)); + spp+=sequenceHop; } return collisionCount; } @@ -386,14 +483,14 @@ if(lsh!=SERVER_LSH_INDEX_SINGLETON){ if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE)) printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius()); - printf("INDEX: dim %d\n", dbH->dim); + printf("INDEX: dim %d\n", (int)dbH->dim); printf("INDEX: R %f\n", lsh->get_radius()); printf("INDEX: seqlen %d\n", sequenceLength); printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth()); printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns()); printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables()); printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows()); - printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp())); + printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)); printf("INDEX: Opened LSH index file %s\n", indexName); fflush(stdout); } @@ -415,8 +512,8 @@ void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ assert(instancePtr); // We need an instance for this callback audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance - Uns32T trackID = index_to_trackID(pointID); - Uns32T spos = index_to_trackPos(pointID); + Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits); + Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits); // Skip identity in query_from_key if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) myself->reporter->add_point(trackID, qpos, spos, dist); @@ -427,8 +524,8 @@ void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ assert(instancePtr); // We need an instance for this callback audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance - Uns32T trackID = index_to_trackID(pointID); - Uns32T spos = index_to_trackPos(pointID); + Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits); + Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits); // Skip identity in query_from_key if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) myself->index_insert_exact_evaluation_queue(trackID, qpos, spos); @@ -449,10 +546,10 @@ // return nqv: if index exists int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) { - unsigned int numVectors; - double *query, *query_data; - double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; - double meanQdur; + unsigned int numVectors = 0; + double *query = 0, *query_data = 0; + double *qNorm = 0, *qnPtr = 0, *qPower = 0, *qpPtr = 0; + double meanQdur = 0; void (*add_point_func)(void*,Uns32T,Uns32T,float); // Set the point-reporter callback based on the value of lsh_exact diff -r d2c56d4f841e -r c93be2f3a674 insert.cpp --- a/insert.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/insert.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -11,7 +11,7 @@ } bool audioDB::enough_data_space_free(off_t size) { - return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); + return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); } void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { @@ -23,6 +23,9 @@ forWrite = true; initTables(dbName, inFile); + if(dbH->flags & O2_FLAG_LARGE_ADB) + error("Single-feature inserts not allowed with LARGE audioDB instances"); + if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) error("Must use timestamps with timestamped database","use --times"); @@ -49,6 +52,7 @@ if(alreadyInserted) { VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); + // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08 return; } @@ -64,7 +68,7 @@ return; } - strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)); + INSERT_FILETABLE_STRING(fileTable, key); off_t insertoffset = dbH->length;// Store current state @@ -153,14 +157,14 @@ } void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { - if (usingPower) { + if(usingPower){ if (!(dbH->flags & O2_FLAG_POWER)) { error("Cannot insert power data on non-power DB", dbName); } - + int one; unsigned int count; - + count = read(powerfd, &one, sizeof(unsigned int)); if (count != sizeof(unsigned int)) { error("powerfd read failed", "int", "read"); @@ -168,7 +172,7 @@ if (one != 1) { error("dimensionality of power file not 1", powerFileName); } - + // FIXME: should check that the powerfile is the right size for // this. -- CSR, 2007-10-30 count = read(powerfd, powerdata, numVectors * sizeof(double)); @@ -183,6 +187,12 @@ forWrite = true; initDBHeader(dbName); + // Treat large ADB instances differently + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + batchinsert_large_adb(dbName, inFile) ; + return; + } + if(!key) key=inFile; std::ifstream *filesIn = 0; @@ -289,8 +299,9 @@ close(thispowerfd); } } - strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)); - + + INSERT_FILETABLE_STRING(fileTable, thisKey); + off_t insertoffset = dbH->length;// Store current state // Increment file count @@ -301,7 +312,7 @@ // Update track to file index map memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); - + insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); // Norm the vectors on input if the database is already L2 normed @@ -334,3 +345,171 @@ // Report status status(dbName); } + + +// BATCHINSERT_LARGE_ADB +// +// This method inserts file pointers into the ADB instance rather than the actual feature data +// +// This method is intended for databases that are large enough to only support indexed query +// So exhaustive searching across all feature vectors will not be performed +// +// We insert featureFileName, [powerFileName], [timesFileName] +// +// l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time +// +// LIMITS: +// +// We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles +// +void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) { + + if(!key) + key=inFile; + std::ifstream *filesIn = 0; + std::ifstream *keysIn = 0; + std::ifstream* thisTimesFile = 0; + int thispowerfd = 0; + + if(!(filesIn = new std::ifstream(inFile))) + error("Could not open batch in file", inFile); + if(key && key!=inFile) + if(!(keysIn = new std::ifstream(key))) + error("Could not open batch key file",key); + + if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) + error("Must use timestamps with timestamped database","use --times"); + + if(!usingPower && (dbH->flags & O2_FLAG_POWER)) + error("Must use power with power-enabled database", dbName); + + unsigned totalVectors=0; + char *thisFile = new char[MAXSTR]; + char *thisKey = 0; + if (key && (key != inFile)) { + thisKey = new char[MAXSTR]; + } + char *thisTimesFileName = new char[MAXSTR]; + char *thisPowerFileName = new char[MAXSTR]; + + std::set s; + + for (unsigned k = 0; k < dbH->numFiles; k++) { + s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); + } + + do { + filesIn->getline(thisFile,MAXSTR); + if(key && key!=inFile) { + keysIn->getline(thisKey,MAXSTR); + } else { + thisKey = thisFile; + } + if(usingTimes) { + timesFile->getline(thisTimesFileName,MAXSTR); + } + if(usingPower) { + powerFile->getline(thisPowerFileName, MAXSTR); + } + + if(filesIn->eof()) { + break; + } + + initInputFile(thisFile, false); + + if(!enough_per_file_space_free()) { + error("batchinsert failed: no more room for metadata", thisFile); + } + + if(s.count(thisKey)) { + VERB_LOG(0, "key already exists in database: %s\n", thisKey); + } else { + s.insert(thisKey); + // Make a track index table of features to file indexes + unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); + if(!numVectors) { + VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); + } + else{ + // Check that time-stamp file exists + if(usingTimes){ + if(timesFile->eof()) { + error("not enough timestamp files in timesList", timesFileName); + } + thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); + if(!thisTimesFile->is_open()) { + error("Cannot open timestamp file", thisTimesFileName); + } + if(thisTimesFile) + delete thisTimesFile; + } + + // Check that power file exists + if (usingPower) { + if(powerFile->eof()) { + error("not enough power files in powerList", powerFileName); + } + thispowerfd = open(thisPowerFileName, O_RDONLY); + if (thispowerfd < 0) { + error("failed to open power file", thisPowerFileName); + } + if (0 < thispowerfd) { + close(thispowerfd); + } + } + + // persist links to the feature files for reading from filesystem later + + // Primary Keys + INSERT_FILETABLE_STRING(fileTable, thisKey); + + // Feature Vector fileNames + INSERT_FILETABLE_STRING(featureFileNameTable, thisFile); + + // Time Stamp fileNames + if(usingTimes) + INSERT_FILETABLE_STRING(timesFileNameTable, thisTimesFileName); + + + // Power fileNames + if(usingPower) + INSERT_FILETABLE_STRING(powerFileNameTable, thisPowerFileName); + + // Increment file count + dbH->numFiles++; + + // Update Header information + dbH->length+=(statbuf.st_size-sizeof(int)); + + // Update track to file index map + memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); + + totalVectors+=numVectors; + + // Copy the header back to the database + memcpy (db, dbH, sizeof(dbTableHeaderT)); + } + } + // CLEAN UP + if(indata) + munmap(indata,statbuf.st_size); + if(infid>0) + close(infid); + } while(!filesIn->eof()); + + VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); + + delete [] thisPowerFileName; + if(key && (key != inFile)) { + delete [] thisKey; + } + delete [] thisFile; + delete [] thisTimesFileName; + + delete filesIn; + delete keysIn; + + // Report status + status(dbName); +} diff -r d2c56d4f841e -r c93be2f3a674 lshlib.cpp --- a/lshlib.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/lshlib.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -771,9 +771,12 @@ // Align each hash table to page boundary char* dbtable = serial_mmap(fid, hashTableSize, 1, align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn())); +#ifdef __CYGWIN__ + // No madvise in CYGWIN +#else if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0) error("could not advise hashtable memory","","madvise"); - +#endif maxColCount=0; minColCount=O2_SERIAL_MAX_COLS; meanColCount=0; @@ -1161,8 +1164,12 @@ // Align each hash table to page boundary char* dbtable = serial_mmap(fid, hashTableSize, 0, align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn())); +#ifdef __CYGWIN__ + // No madvise in CYGWIN +#else if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0) error("could not advise hashtable memory","","madvise"); +#endif pt=(SerialElementT*)dbtable; for( y = 0 ; y < H::N ; y++ ){ // Move disk pointer to beginning of row @@ -1331,8 +1338,12 @@ // memory map a single hash table for random access char* db = serial_mmap(dbfid, hashTableSize, 0, align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn())); +#ifdef __CYGWIN__ + // No madvise in CYGWIN +#else if(madvise(db, hashTableSize, MADV_RANDOM)<0) error("could not advise local hashtable memory","","madvise"); +#endif SerialElementT* pe = (SerialElementT*)db ; for(Uns32T qpos=0; qposnumCols, qpos); // Point to correct row @@ -1384,8 +1399,12 @@ // memory map a single hash table for random access char* db = serial_mmap(dbfid, hashTableSize, 0, align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn())); +#ifdef __CYGWIN__ + // No madvise in CYGWIN +#else if(madvise(db, hashTableSize, MADV_SEQUENTIAL)<0) error("could not advise local hashtable memory","","madvise"); +#endif SerialElementT* pe = (SerialElementT*)db ; printf("*********** TABLE %d ***************\n", j); fflush(stdout); diff -r d2c56d4f841e -r c93be2f3a674 lshlib.h --- a/lshlib.h Tue Aug 12 14:25:51 2008 +0000 +++ b/lshlib.h Thu Aug 21 21:28:33 2008 +0000 @@ -58,23 +58,25 @@ #define O2_SERIAL_HEADER_SIZE sizeof(SerialHeaderT) #define O2_SERIAL_ELEMENT_SIZE sizeof(SerialElementT) #define O2_SERIAL_MAX_TABLES (200) -#define O2_SERIAL_MAX_ROWS (1000000) -#define O2_SERIAL_MAX_COLS (100000) +#define O2_SERIAL_MAX_ROWS (1000000000) +#define O2_SERIAL_MAX_COLS (1000000) #define O2_SERIAL_MAX_DIM (2000) #define O2_SERIAL_MAX_FUNS (100) #define O2_SERIAL_MAX_BINWIDTH (200) #define O2_SERIAL_MAXFILESIZE (4000000000UL) // Flags for Serial Header -#define O2_SERIAL_FILEFORMAT1 (0x1U) // Optimize for on-disk search -#define O2_SERIAL_FILEFORMAT2 (0x2U) // Optimize for in-core search +#define O2_SERIAL_FILEFORMAT1 (0x1U) // Optimize disk format for on-disk search +#define O2_SERIAL_FILEFORMAT2 (0x2U) // Optimize disk format for in-core search +#define O2_SERIAL_COREFORMAT1 (0x4U) +#define O2_SERIAL_COREFORMAT2 (0x8U) // Flags for serialization fileformat2: use high 3 bits of Uns32T -#define O2_SERIAL_TOKEN_T1 (0xFFFFFFFC) +#define O2_SERIAL_TOKEN_T1 (0xFFFFFFFCU) #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU) #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU) -#define O2_INDEX_MAXSTR (512) +#define O2_INDEX_MAXSTR (256) unsigned align_up(unsigned x, unsigned w); @@ -320,7 +322,7 @@ // Callback Function for point reporting void* calling_instance; // store calling object instance for member-function callback - void (*add_point_callback)(void*, Uns32T, Uns32T, float); // The callback + ReporterCallbackPtr add_point_callback; // Pointer to the callback function public: G(char* lshFile, bool lshInCore = false); // unserialize constructor diff -r d2c56d4f841e -r c93be2f3a674 query.cpp --- a/query.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/query.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -46,7 +46,7 @@ if(index_exists(dbName, radius, sequenceLength)){ char* indexName = index_get_name(dbName, radius, sequenceLength); lsh = index_allocate(indexName, false); - reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp())+1); + reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1); delete[] indexName; } else @@ -62,7 +62,7 @@ if(index_exists(dbName, radius, sequenceLength)){ char* indexName = index_get_name(dbName, radius, sequenceLength); lsh = index_allocate(indexName, false); - reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp())+1); + reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1); delete[] indexName; } else @@ -220,7 +220,7 @@ } } -void audioDB::read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p) { +void audioDB::read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p) { if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) { if(*data_buffer_p) { free(*data_buffer_p); @@ -235,7 +235,7 @@ } } - read(dbfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); + read(trkfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); } // These names deserve some unpicking. The names starting with a "q" @@ -341,48 +341,70 @@ VERB_LOG(1, "performing norms... "); - // Read query feature vectors from database - *qp = NULL; - lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); - size_t allocatedSize = 0; - read_data(queryIndex, qp, &allocatedSize); - // Consistency check on allocated memory and query feature size - if(*nvp*sizeof(double)*dbH->dim != allocatedSize) - error("Query memory allocation failed consitency check","set_up_query_from_key"); - - Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors - // Copy L2 norm partial-sum coefficients - assert(*qnp = new double[*nvp]); - memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); - sequence_sum(*qnp, *nvp, sequenceLength); - sequence_sqrt(*qnp, *nvp, sequenceLength); - - if( usingPower ){ - // Copy Power partial-sum coefficients - assert(*qpp = new double[*nvp]); - memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); - sequence_sum(*qpp, *nvp, sequenceLength); - sequence_average(*qpp, *nvp, sequenceLength); + // For LARGE_ADB load query features from file + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + if(infid>0) + close(infid); + char* prefixedString = new char[O2_MAXFILESTR]; + char* tmpStr = prefixedString; + strncpy(prefixedString, featureFileNameTable+queryIndex*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); + prefix_name(&prefixedString, adb_feature_root); + if(tmpStr!=prefixedString) + delete[] tmpStr; + initInputFile(prefixedString, false); // nommap, file pointer at correct position + size_t allocatedSize = 0; + read_data(infid, queryIndex, qp, &allocatedSize); // over-writes qp and allocatedSize + // Consistency check on allocated memory and query feature size + if(*nvp*sizeof(double)*dbH->dim != allocatedSize) + error("Query memory allocation failed consitency check","set_up_query_from_key"); + // Allocated and calculate auxillary sequences: l2norm and power + init_track_aux_data(queryIndex, *qp, qnp, vqnp, qpp, vqpp); + } + else{ // Load from self-contained ADB database + // Read query feature vectors from database + *qp = NULL; + lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); + size_t allocatedSize = 0; + read_data(dbfid, queryIndex, qp, &allocatedSize); + // Consistency check on allocated memory and query feature size + if(*nvp*sizeof(double)*dbH->dim != allocatedSize) + error("Query memory allocation failed consitency check","set_up_query_from_key"); + + Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors + // Copy L2 norm partial-sum coefficients + assert(*qnp = new double[*nvp]); + memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); + sequence_sum(*qnp, *nvp, sequenceLength); + sequence_sqrt(*qnp, *nvp, sequenceLength); + + if( usingPower ){ + // Copy Power partial-sum coefficients + assert(*qpp = new double[*nvp]); + memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); + sequence_sum(*qpp, *nvp, sequenceLength); + sequence_average(*qpp, *nvp, sequenceLength); + } + + if (usingTimes) { + unsigned int k; + *mqdp = 0.0; + double *querydurs = new double[*nvp]; + double *timesdata = new double[*nvp*2]; + assert(querydurs && timesdata); + memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double)); + for(k = 0; k < *nvp; k++) { + querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; + *mqdp += querydurs[k]; + } + *mqdp /= k; + + VERB_LOG(1, "mean query file duration: %f\n", *mqdp); + + delete [] querydurs; + delete [] timesdata; + } } - if (usingTimes) { - unsigned int k; - *mqdp = 0.0; - double *querydurs = new double[*nvp]; - double *timesdata = new double[*nvp*2]; - assert(querydurs && timesdata); - memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double)); - for(k = 0; k < *nvp; k++) { - querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; - *mqdp += querydurs[k]; - } - *mqdp /= k; - - VERB_LOG(1, "mean query file duration: %f\n", *mqdp); - - delete [] querydurs; - delete [] timesdata; - } // Defaults, for exhaustive search (!usingQueryPoint) *vqp = *qp; *vqnp = *qnp; @@ -487,7 +509,8 @@ // Compute database info // FIXME: we more than likely don't need very much of the database // so make a new method to build these values per-track or, even better, per-point - set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); + if( !( dbH->flags & O2_FLAG_LARGE_ADB) ) + set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); VERB_LOG(1, "matching points..."); @@ -495,48 +518,82 @@ assert(trackNN>0 && trackNN<=O2_MAXNN); // We are guaranteed that the order of points is sorted by: - // qpos, trackID, spos + // trackID, spos, qpos // so we can be relatively efficient in initialization of track data. // Here we assume that points don't overlap, so we will use exhaustive dot - // product evaluation over the sequence + // product evaluation instead of memoization of partial sums which is used + // for exhaustive brute-force evaluation from smaller databases: e.g. query_loop() double dist; size_t data_buffer_size = 0; double *data_buffer = 0; - Uns32T trackOffset; - Uns32T trackIndexOffset; + Uns32T trackOffset = 0; + Uns32T trackIndexOffset = 0; Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range Uns32T npairs = exact_evaluation_queue->size(); while(npairs--){ PointPair pp = exact_evaluation_queue->top(); - trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset - trackIndexOffset=trackOffset/dbH->dim; // num vectors offset - if((!(usingPower) || powers_acceptable(qpPtr[usingQueryPoint?0:pp.qpos], sPower[trackIndexOffset+pp.spos])) && - ((usingQueryPoint?0:pp.qpos) < numVectors-sequenceLength+1 && pp.spos < trackTable[pp.trackID]-sequenceLength+1)){ + // Large ADB track data must be loaded here for sPower + if(dbH->flags & O2_FLAG_LARGE_ADB){ + trackOffset=0; + trackIndexOffset=0; if(currentTrack!=pp.trackID){ + char* prefixedString = new char[O2_MAXFILESTR]; + char* tmpStr = prefixedString; + // On currentTrack change, allocate and load track data currentTrack=pp.trackID; - lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); - read_data(currentTrack, &data_buffer, &data_buffer_size); + SAFE_DELETE_ARRAY(sNorm); + SAFE_DELETE_ARRAY(sPower); + if(infid>0) + close(infid); + // Open and check dimensions of feature file + strncpy(prefixedString, featureFileNameTable+pp.trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); + prefix_name((char ** const) &prefixedString, adb_feature_root); + if (prefixedString!=tmpStr) + delete[] tmpStr; + initInputFile(prefixedString, false); // nommap, file pointer at correct position + // Load the feature vector data for current track into data_buffer + read_data(infid, pp.trackID, &data_buffer, &data_buffer_size); + // Load power and calculate power and l2norm sequence sums + init_track_aux_data(pp.trackID, data_buffer, &sNorm, &snPtr, &sPower, &spPtr); } - dist = dot_product_points(query+(usingQueryPoint?0:pp.qpos*dbH->dim), data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); + } + else{ + // These offsets are w.r.t. the entire database of feature vectors and auxillary variables + trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset + trackIndexOffset=trackOffset/dbH->dim; // num vectors offset + } + Uns32T qPos = usingQueryPoint?0:pp.qpos;// index for query point + Uns32T sPos = trackIndexOffset+pp.spos; // index into l2norm table + // Test power thresholds before computing distance + if( ( !usingPower || powers_acceptable(qpPtr[qPos], sPower[sPos])) && + ( qPosflags & O2_FLAG_LARGE_ADB) && (currentTrack!=pp.trackID) ){ + // On currentTrack change, allocate and load track data + currentTrack=pp.trackID; + lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); + read_data(dbfid, currentTrack, &data_buffer, &data_buffer_size); + } + // Compute distance + dist = dot_product_points(query+qPos*dbH->dim, data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); + double qn = qnPtr[qPos]; + double sn = sNorm[sPos]; if(normalizedDistance) - dist = 2-(2/(qnPtr[usingQueryPoint?0:pp.qpos]*sNorm[trackIndexOffset+pp.spos]))*dist; + dist = 2 - (2/(qn*sn))*dist; else if(no_unit_norming) - dist = qnPtr[usingQueryPoint?0:pp.qpos]*qnPtr[usingQueryPoint?0:pp.qpos]+sNorm[trackIndexOffset+pp.spos]*sNorm[trackIndexOffset+pp.spos] - 2*dist; + dist = qn*qn + sn*sn - 2*dist; // else // dist = dist; if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) - reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); + reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); } exact_evaluation_queue->pop(); } // Cleanup - if(sNorm) - delete sNorm; - if(sPower) - delete sPower; - if(meanDBdur) - delete meanDBdur; + SAFE_DELETE_ARRAY(sNorm); + SAFE_DELETE_ARRAY(sPower); + SAFE_DELETE_ARRAY(meanDBdur); } // A completely unprotected dot-product method @@ -555,6 +612,9 @@ double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; double meanQdur; + if( dbH->flags & O2_FLAG_LARGE_ADB ) + error("error: LARGE_ADB requires indexed query"); + if(query_from_key) set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); else @@ -618,7 +678,7 @@ trackIndexOffset=trackOffset/dbH->dim; // numVectors offset - read_data(track, &data_buffer, &data_buffer_size); + read_data(dbfid, track, &data_buffer, &data_buffer_size); if(sequenceLength <= trackTable[track]) { // test for short sequences VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]); diff -r d2c56d4f841e -r c93be2f3a674 reporter.h --- a/reporter.h Tue Aug 12 14:25:51 2008 +0000 +++ b/reporter.h Thu Aug 21 21:28:33 2008 +0000 @@ -292,6 +292,7 @@ } std::vector::reverse_iterator rit; std::priority_queue< NNresult, std::vector< NNresult>, std::greater > point_queue; + NNresult rk; if(adbQueryResponse==0) { for(rit = v.rbegin(); rit < v.rend(); rit++) { @@ -309,31 +310,57 @@ } for(unsigned int k = 0; k < qsize; k++) { - NNresult rk = point_queue.top(); + rk = point_queue.top(); std::cout << rk.dist << " " << rk.qpos << " " << rk.spos << std::endl; point_queue.pop(); } } } else { - ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size; - ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size; - ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size; - ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size; - ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size]; - ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size]; - ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size]; - ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size]; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size*pointNN]; + ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size*pointNN]; + ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size*pointNN]; + ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size*pointNN]; unsigned int k = 0; - for(rit = v.rbegin(); rit < v.rend(); rit++, k++) { + // Loop over returned tracks + for(rit = v.rbegin(); rit < v.rend(); rit++) { r = *rit; - ((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR]; - ((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = r.dist; - ((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = r.qpos; - ((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = r.spos; - if(fileTable) - snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE); - else - snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID); + // Reverse the order of the points stored in point_queues + unsigned int qsize=point_queues[r.trackID].size(); + while(qsize--){ + point_queue.push(point_queues[r.trackID].top()); + point_queues[r.trackID].pop(); + } + qsize=point_queue.size(); + unsigned int numReports = pointNN; + while(numReports--){ // pop the rest of the points + if(qsize) + rk = point_queue.top(); // Take one point from the top of the queue + else{ + rk.dist = 1000000000.0; + rk.qpos = 0xFFFFFFFF; + rk.spos = 0xFFFFFFFF; + } + + ((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR]; + ((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = rk.dist; + ((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = rk.qpos; + ((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = rk.spos; + if(qsize){ + if(fileTable) + snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE); + else + snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID); + point_queue.pop(); + qsize--; + } + else + snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "NULL"); + k++; + } } } // clean up @@ -641,17 +668,17 @@ } } else { - ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size; - ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size; - ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size; - ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size; - ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size]; - ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size]; - ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size]; - ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size]; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeRlist=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeDist=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeQpos=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.__sizeSpos=size*pointNN; + ((adb__queryResponse*)adbQueryResponse)->result.Rlist= new char*[size*pointNN]; + ((adb__queryResponse*)adbQueryResponse)->result.Dist = new double[size*pointNN]; + ((adb__queryResponse*)adbQueryResponse)->result.Qpos = new unsigned int[size*pointNN]; + ((adb__queryResponse*)adbQueryResponse)->result.Spos = new unsigned int[size*pointNN]; unsigned int k = 0; // Loop over returned tracks - for(rit = v.rbegin(); rit < v.rend(); rit++, k++) { + for(rit = v.rbegin(); rit < v.rend(); rit++) { r = *rit; // Reverse the order of the points stored in point_queues unsigned int qsize=point_queues[r.trackID].size(); @@ -660,17 +687,32 @@ point_queues[r.trackID].pop(); } qsize=point_queue.size(); - rk = point_queue.top(); // Take one point from the top of the queue - ((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR]; - ((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = rk.dist; - ((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = rk.qpos; - ((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = rk.spos; - if(fileTable) - snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE); - else - snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID); - while(qsize--) // pop the rest of the points - point_queue.pop(); + unsigned int numReports = pointNN; + while(numReports--){ // pop the rest of the points + if(qsize) + rk = point_queue.top(); // Take one point from the top of the queue + else{ + rk.dist = 1000000000.0; + rk.qpos = 0xFFFFFFFF; + rk.spos = 0xFFFFFFFF; + } + + ((adb__queryResponse*)adbQueryResponse)->result.Rlist[k] = new char[O2_MAXFILESTR]; + ((adb__queryResponse*)adbQueryResponse)->result.Dist[k] = rk.dist; + ((adb__queryResponse*)adbQueryResponse)->result.Qpos[k] = rk.qpos; + ((adb__queryResponse*)adbQueryResponse)->result.Spos[k] = rk.spos; + if(qsize){ + if(fileTable) + snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLE_ENTRY_SIZE); + else + snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "%d", r.trackID); + point_queue.pop(); + qsize--; + } + else + snprintf(((adb__queryResponse*)adbQueryResponse)->result.Rlist[k], O2_MAXFILESTR, "NULL"); + k++; + } } } delete[] point_queues; diff -r d2c56d4f841e -r c93be2f3a674 sample.cpp --- a/sample.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/sample.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -56,7 +56,10 @@ void audioDB::sample(const char *dbName) { initTables(dbName, 0); - + if(dbH->flags & O2_FLAG_LARGE_ADB){ + error("error: sample not yet supported for LARGE_ADB"); + } + // build track offset table (FIXME: cut'n'pasted from query.cpp) off_t *trackOffsetTable = new off_t[dbH->numFiles]; unsigned cumTrack=0; diff -r d2c56d4f841e -r c93be2f3a674 soap.cpp --- a/soap.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/soap.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -18,7 +18,7 @@ std::cout << "length = " << adbStatusResponse.result.length << std::endl; std::cout << "dudCount = " << adbStatusResponse.result.dudCount << std::endl; std::cout << "nullCount = " << adbStatusResponse.result.nullCount << std::endl; - std::cout << "flags = " << adbStatusResponse.result.flags << std::endl; + std::cout << "flags = " << (adbStatusResponse.result.flags & 0x00FFFFFF) << std::endl; } else { soap_print_fault(&soap,stderr); } @@ -126,8 +126,8 @@ strncpy(queryType, "sequence", strlen("sequence")); else if(qType == O2_TRACK_QUERY) strncpy(queryType,"track", strlen("track")); - else - strncpy(queryType, "", strlen("")); + else if(qType == O2_N_SEQUENCE_QUERY) + strncpy(queryType,"nsequence", strlen("nsequence")); if(pointNN==0) pointNN=10; @@ -285,6 +285,12 @@ fflush(stderr); delete[] indexName; } + + // Server-side path prefix to databases and features + if(adb_root) + SERVER_ADB_ROOT = (char*)adb_root; // Server-side database root + if(adb_feature_root) + SERVER_ADB_FEATURE_ROOT = (char*)adb_feature_root; // Server-side features root for (int i = 1; ; i++) {