# HG changeset patch # User mas01cr # Date 1230116234 0 # Node ID 16a903968d181b7fc4923a417bb665c74866f31f # Parent 25ee0b77f8ca202e4f832e5afa8f674b33dc8076 Almost finish with audioDB::query_loop. This patch is a little bit noisy, because we rename adb->keys to adb->keymap, introduce a new vector adb->keys (essentially to replace fileTable), and introduce new functionality (both include and exclude keylists in adb_query_refine_t) as well as modifying the query_loop function itself to take advantage of all of these goodies. Oh, and we also fix an embarrassing state bug in adb->track_offsets for insert -- what was I thinking? (Thank you, regression test suites). Since we are on a private branch at the moment, we can take the luxury of renumbering the ADB_REFINE_ flags to include the exclude list at the logical place; once we have an ABI to support, that won't be possible. Now audioDB::query builds up include and exclude lists as appropriate; query_loop does an [O(NlogN) probably] buildup of the keys to consider, and then iterates over tracks sequentially, seeking only if one or more tracks have been excluded. No more trackFile, yay! The only remaining thing to deal with is the accumulator. It's easy enough to pass it around, but I want to read the indexed version before doing so to see how that all fits together. diff -r 25ee0b77f8ca -r 16a903968d18 audioDB-internals.h --- a/audioDB-internals.h Wed Dec 24 10:57:09 2008 +0000 +++ b/audioDB-internals.h Wed Dec 24 10:57:14 2008 +0000 @@ -12,7 +12,8 @@ int fd; int flags; adb_header_t *header; - std::map *keys; + std::vector *keys; + std::map *keymap; std::vector *track_lengths; std::vector *track_offsets; }; @@ -174,8 +175,8 @@ static inline uint32_t audiodb_key_index(adb_t *adb, const char *key) { std::map::iterator it; - it = adb->keys->find(key); - if(it == adb->keys->end()) { + it = adb->keymap->find(key); + if(it == adb->keymap->end()) { return (uint32_t) -1; } else { return (*it).second; diff -r 25ee0b77f8ca -r 16a903968d18 audioDB.h --- a/audioDB.h Wed Dec 24 10:57:09 2008 +0000 +++ b/audioDB.h Wed Dec 24 10:57:14 2008 +0000 @@ -325,7 +325,7 @@ void error(const char* a, const char* b = "", const char *sysFunc = 0); void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata); - int query_loop(adb_t *adb, adb_query_spec_t *spec, Uns32T queryIndex); + int query_loop(adb_t *adb, adb_query_spec_t *spec); void query_loop_points(adb_query_spec_t *spec, double* query, adb_qpointers_internal_t *qpointers); void initRNG(); void initDBHeader(const char *dbName); diff -r 25ee0b77f8ca -r 16a903968d18 audioDB_API.h --- a/audioDB_API.h Wed Dec 24 10:57:09 2008 +0000 +++ b/audioDB_API.h Wed Dec 24 10:57:14 2008 +0000 @@ -102,17 +102,23 @@ uint32_t ipos; } adb_result_t; -#define ADB_REFINE_KEYLIST 1 -#define ADB_REFINE_RADIUS 2 -#define ADB_REFINE_ABSOLUTE_THRESHOLD 4 -#define ADB_REFINE_RELATIVE_THRESHOLD 8 -#define ADB_REFINE_DURATION_RATIO 16 -#define ADB_REFINE_HOP_SIZE 32 +#define ADB_REFINE_INCLUDE_KEYLIST 1 +#define ADB_REFINE_EXCLUDE_KEYLIST 2 +#define ADB_REFINE_RADIUS 4 +#define ADB_REFINE_ABSOLUTE_THRESHOLD 8 +#define ADB_REFINE_RELATIVE_THRESHOLD 16 +#define ADB_REFINE_DURATION_RATIO 32 +#define ADB_REFINE_HOP_SIZE 64 + +typedef struct adbkeylist { + uint32_t nkeys; + const char **keys; +} adb_keylist_t; typedef struct adbqueryrefine { uint32_t flags; - uint32_t nkeys; - const char **keys; + adb_keylist_t include; + adb_keylist_t exclude; double radius; double absolute_threshold; double relative_threshold; diff -r 25ee0b77f8ca -r 16a903968d18 close.cpp --- a/close.cpp Wed Dec 24 10:57:09 2008 +0000 +++ b/close.cpp Wed Dec 24 10:57:14 2008 +0000 @@ -8,6 +8,7 @@ free(adb->path); free(adb->header); delete adb->keys; + delete adb->keymap; delete adb->track_lengths; delete adb->track_offsets; close(adb->fd); diff -r 25ee0b77f8ca -r 16a903968d18 insert.cpp --- a/insert.cpp Wed Dec 24 10:57:09 2008 +0000 +++ b/insert.cpp Wed Dec 24 10:57:14 2008 +0000 @@ -57,15 +57,15 @@ * 3. check that datum->dim and adb->header->dim agree (or that the * header dimension is zero, in which case write datum->dim to * adb->header->dim). - * 4. check for presence of datum->key in adb->keys; + * 4. check for presence of datum->key in adb->keymap; * 5. check for consistency between power and O2_FLAG_POWER, and * times and O2_FLAG_TIMES; * 6. write in data, power, times as appropriate; add to track * and key tables too; * 7. if O2_FLAG_L2NORM and !O2_FLAG_LARGE_ADB, compute norms and fill * in table; - * 8. update adb->keys, adb->track_lengths, adb->track_offsets and - * adb->header; + * 8. update adb->keys, adb->keymap, adb->track_lengths, + * adb->track_offsets and adb->header; * 9. sync adb->header with disk. * * Step 9 essentially commits the transaction; until we update @@ -105,8 +105,8 @@ } else if (adb->header->dim != datum->dim) { return 1; } - /* 4. check for presence of datum->key in adb->keys; */ - if(adb->keys->count(datum->key)) { + /* 4. check for presence of datum->key in adb->keymap; */ + if(adb->keymap->count(datum->key)) { /* not part of an explicit API/ABI, but we need a distinguished value in this circumstance to preserve somewhat wonky behaviour of audioDB::batchinsert. */ @@ -195,14 +195,13 @@ l2norm_buffer = NULL; } - /* 8. update adb->keys, adb->track_lengths, adb->track_offsets and - * adb->header; + /* 8. update adb->keys, adb->keymap, adb->track_lengths, + * adb->track_offsets and adb->header; */ - (*adb->keys)[datum->key] = adb->header->numFiles; + adb->keys->push_back(datum->key); + (*adb->keymap)[datum->key] = adb->header->numFiles; adb->track_lengths->push_back(datum->nvectors); - if(adb->header->numFiles > 0) { - adb->track_offsets->push_back(offset); - } + adb->track_offsets->push_back(offset); adb->header->numFiles += 1; adb->header->length += sizeof(double) * datum->nvectors * datum->dim; diff -r 25ee0b77f8ca -r 16a903968d18 open.cpp --- a/open.cpp Wed Dec 24 10:57:09 2008 +0000 +++ b/open.cpp Wed Dec 24 10:57:14 2008 +0000 @@ -31,7 +31,8 @@ key_table_length = ALIGN_PAGE_UP(nfiles * O2_FILETABLE_ENTRY_SIZE); mmap_or_goto_error(char *, key_table, adb->header->fileTableOffset, key_table_length); for (unsigned int k = 0; k < nfiles; k++) { - (*adb->keys)[(key_table + k*O2_FILETABLE_ENTRY_SIZE)] = k; + adb->keys->push_back(key_table + k*O2_FILETABLE_ENTRY_SIZE); + (*adb->keymap)[(key_table + k*O2_FILETABLE_ENTRY_SIZE)] = k; } munmap(key_table, key_table_length); } @@ -103,10 +104,14 @@ goto error; } - adb->keys = new std::map; + adb->keys = new std::vector; if(!adb->keys) { goto error; } + adb->keymap = new std::map; + if(!adb->keymap) { + goto error; + } if(audiodb_collect_keys(adb)) { goto error; } @@ -136,6 +141,9 @@ if(adb->keys) { delete adb->keys; } + if(adb->keymap) { + delete adb->keymap; + } if(adb->track_lengths) { delete adb->track_lengths; } diff -r 25ee0b77f8ca -r 16a903968d18 query.cpp --- a/query.cpp Wed Dec 24 10:57:09 2008 +0000 +++ b/query.cpp Wed Dec 24 10:57:14 2008 +0000 @@ -30,7 +30,28 @@ adb_datum_t datum = {0}; qspec.refine.flags = 0; - /* FIXME: trackFile / ADB_REFINE_KEYLIST */ + if(trackFile) { + qspec.refine.flags |= ADB_REFINE_INCLUDE_KEYLIST; + std::vector v; + char *k = new char[MAXSTR]; + trackFile->getline(k, MAXSTR); + while(!trackFile->eof()) { + v.push_back(k); + k = new char[MAXSTR]; + trackFile->getline(k, MAXSTR); + } + delete [] k; + qspec.refine.include.nkeys = v.size(); + qspec.refine.include.keys = new const char *[qspec.refine.include.nkeys]; + for(unsigned int k = 0; k < qspec.refine.include.nkeys; k++) { + qspec.refine.include.keys[k] = v[k]; + } + } + if(query_from_key) { + qspec.refine.flags |= ADB_REFINE_EXCLUDE_KEYLIST; + qspec.refine.exclude.nkeys = 1; + qspec.refine.exclude.keys = &key; + } if(radius) { qspec.refine.flags |= ADB_REFINE_RADIUS; qspec.refine.radius = radius; @@ -205,7 +226,7 @@ } else{ VERB_LOG(1, "Calling brute-force query on database %s\n", dbName); - if(query_loop(adb, &qspec, query_from_key_index)) { + if(query_loop(adb, &qspec)) { error("query_loop failed"); } } @@ -677,7 +698,7 @@ } if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) { adb_result_t r; - r.key = fileTable + pp.trackID * O2_FILETABLE_ENTRY_SIZE; + r.key = (*adb->keys)[pp.trackID].c_str(); r.dist = dist; r.qpos = pp.qpos; r.ipos = pp.spos; @@ -692,13 +713,29 @@ SAFE_DELETE_ARRAY(dbpointers.mean_duration); } -int audioDB::query_loop(adb_t *adb, adb_query_spec_t *spec, Uns32T queryIndex) { +int audioDB::query_loop(adb_t *adb, adb_query_spec_t *spec) { double *query, *query_data; adb_qpointers_internal_t qpointers = {0}, dbpointers = {0}; bool power_refine = spec->refine.flags & (ADB_REFINE_ABSOLUTE_THRESHOLD|ADB_REFINE_RELATIVE_THRESHOLD); + std::set keys; + if(spec->refine.flags & ADB_REFINE_INCLUDE_KEYLIST) { + for(unsigned int k = 0; k < spec->refine.include.nkeys; k++) { + keys.insert(spec->refine.include.keys[k]); + } + } else { + for(unsigned int k = 0; k < adb->header->numFiles; k++) { + keys.insert((*adb->keys)[k]); + } + } + if(spec->refine.flags & ADB_REFINE_EXCLUDE_KEYLIST) { + for(unsigned int k = 0; k < spec->refine.exclude.nkeys; k++) { + keys.erase(spec->refine.exclude.keys[k]); + } + } + if(adb->header->flags & O2_FLAG_LARGE_ADB) { /* FIXME: actually it would be nice to support this mode of * operation, but for now... */ @@ -721,46 +758,26 @@ D = new double*[qpointers.nvectors]; // pre-allocate DD = new double*[qpointers.nvectors]; - unsigned processedTracks = 0; off_t trackIndexOffset; - char nextKey[MAXSTR]; // Track loop size_t data_buffer_size = 0; double *data_buffer = 0; lseek(adb->fd, adb->header->dataOffset, SEEK_SET); - for(processedTracks=0, track=0 ; processedTracks < adb->header->numFiles ; track++, processedTracks++) { + for(track = 0; track < adb->header->numFiles; track++) { + unsigned t = track; - trackOffset = (*adb->track_offsets)[track]; - - // get trackID from file if using a control file - if(trackFile) { - trackFile->getline(nextKey,MAXSTR); - if(!trackFile->eof()) { - track = audiodb_key_index(adb, nextKey); - if(track == (uint32_t) -1) { - return 1; - } - trackOffset = (*adb->track_offsets)[track]; - lseek(adb->fd, adb->header->dataOffset + trackOffset, SEEK_SET); - } else { - break; + while (keys.find((*adb->keys)[track]) == keys.end()) { + track++; + if(track == adb->header->numFiles) { + goto loop_finish; } } - - // skip identity on query_from_key - if( query_from_key && (track == queryIndex) ) { - if(queryIndex!=adb->header->numFiles-1){ - track++; - trackOffset = (*adb->track_offsets)[track]; - lseek(adb->fd, adb->header->dataOffset + trackOffset, SEEK_SET); - } - else{ - break; - } + trackOffset = (*adb->track_offsets)[track]; + if(track != t) { + lseek(adb->fd, adb->header->dataOffset + trackOffset, SEEK_SET); } - trackIndexOffset = trackOffset / (adb->header->dim * sizeof(double)); // dbpointers.nvectors offset if(audiodb_read_data(adb, adb->fd, track, &data_buffer, &data_buffer_size)) { @@ -796,7 +813,7 @@ if((!(spec->refine.flags & ADB_REFINE_RADIUS)) || thisDist <= (spec->refine.radius+O2_DISTANCE_TOLERANCE)) { adb_result_t r; - r.key = fileTable + track * O2_FILETABLE_ENTRY_SIZE; + r.key = (*adb->keys)[track].c_str(); r.dist = thisDist; if(spec->qid.flags & ADB_QUERY_ID_FLAG_EXHAUSTIVE) { r.qpos = j; @@ -814,6 +831,8 @@ } } + loop_finish: + free(data_buffer); // Clean up