# HG changeset patch # User mas01cr # Date 1251479646 0 # Node ID e21a3db643afcea6b7e821b28a58ad88e9c3c2c3 # Parent 368320b31db67a3352ac441eaaa73aff1e5a7b8c MORE MEMORY SANITY Move the logic tracking which points have been visited already (including the std::set datastructure) into the indexed query codepaths, rather than inside accumulators. This has the effect of drastically reducing the memory used in non-indexed queries, such that the working set for a 500-file database with 100000 vectors total goes from 1.2GB to slightly under 3MB. All this and less code, too! diff -r 368320b31db6 -r e21a3db643af audioDB-internals.h --- a/audioDB-internals.h Fri Aug 21 15:23:32 2009 +0000 +++ b/audioDB-internals.h Fri Aug 28 17:14:06 2009 +0000 @@ -67,21 +67,6 @@ double *mean_duration; } adb_qpointers_internal_t; -/* this struct is for maintaining per-query state. We don't want to - * store this stuff in the adb struct itself, because (a) it doesn't - * belong there and (b) in principle people might do two queries in - * parallel using the same adb handle. (b) is in practice a little - * bit academic because at the moment we're seeking all over the disk - * using adb->fd, but changing to use pread() might win us - * threadsafety eventually. - */ -typedef struct adb_qstate_internal { - Accumulator *accumulator; - std::set *allowed_keys; - std::priority_queue *exact_evaluation_queue; - LSH *lsh; -} adb_qstate_internal_t; - /* this struct is the in-memory representation of the binary * information stored at the head of each adb file */ typedef struct adbheader { @@ -168,6 +153,22 @@ } } adb_result_triple_lt; +/* this struct is for maintaining per-query state. We don't want to + * store this stuff in the adb struct itself, because (a) it doesn't + * belong there and (b) in principle people might do two queries in + * parallel using the same adb handle. (b) is in practice a little + * bit academic because at the moment we're seeking all over the disk + * using adb->fd, but changing to use pread() might win us + * threadsafety eventually. + */ +typedef struct adb_qstate_internal { + Accumulator *accumulator; + std::set *allowed_keys; + std::priority_queue *exact_evaluation_queue; + std::set< adb_result_t, adb_result_triple_lt > *set; + LSH *lsh; +} adb_qstate_internal_t; + /* We could go gcc-specific here and use typeof() instead of passing * in an explicit type. Answers on a postcard as to whether that's a * good plan or not. */ diff -r 368320b31db6 -r e21a3db643af dbaccumulator.h --- a/dbaccumulator.h Fri Aug 21 15:23:32 2009 +0000 +++ b/dbaccumulator.h Fri Aug 28 17:14:06 2009 +0000 @@ -7,32 +7,24 @@ private: unsigned int pointNN; std::priority_queue< adb_result_t, std::vector, T > *queue; - std::set< adb_result_t, adb_result_triple_lt > *set; }; template DBAccumulator::DBAccumulator(unsigned int pointNN) - : pointNN(pointNN), queue(0), set(0) { + : pointNN(pointNN), queue(0) { queue = new std::priority_queue< adb_result_t, std::vector, T>; - set = new std::set; } template DBAccumulator::~DBAccumulator() { if(queue) { delete queue; } - if(set) { - delete set; - } } template void DBAccumulator::add_point(adb_result_t *r) { if(!isnan(r->dist)) { - if(set->find(*r) == set->end()) { - set->insert(*r); - queue->push(*r); - if(queue->size() > pointNN) { - queue->pop(); - } + queue->push(*r); + if(queue->size() > pointNN) { + queue->pop(); } } } diff -r 368320b31db6 -r e21a3db643af nearestaccumulator.h --- a/nearestaccumulator.h Fri Aug 21 15:23:32 2009 +0000 +++ b/nearestaccumulator.h Fri Aug 28 17:14:06 2009 +0000 @@ -5,20 +5,15 @@ void add_point(adb_result_t *r); adb_query_results_t *get_points(); private: - std::set< adb_result_t, adb_result_triple_lt > *set; std::set< adb_result_t, adb_result_qpos_lt > *points; }; template NearestAccumulator::NearestAccumulator() - : set(0), points(0) { - set = new std::set< adb_result_t, adb_result_triple_lt >; + : points(0) { points = new std::set< adb_result_t, adb_result_qpos_lt >; } template NearestAccumulator::~NearestAccumulator() { - if(set) { - delete set; - } if(points) { delete points; } @@ -26,17 +21,13 @@ template void NearestAccumulator::add_point(adb_result_t *r) { if(!isnan(r->dist)) { - if(set->find(*r) == set->end()) { - set->insert(*r); - - std::set< adb_result_t, adb_result_qpos_lt >::iterator it; - it = points->find(*r); - if(it == points->end()) { - points->insert(*r); - } else if(T()(*(const adb_result_t *)r,(*it))) { - points->erase(it); - points->insert(*r); - } + std::set< adb_result_t, adb_result_qpos_lt >::iterator it; + it = points->find(*r); + if(it == points->end()) { + points->insert(*r); + } else if(T()(*(const adb_result_t *)r,(*it))) { + points->erase(it); + points->insert(*r); } } } diff -r 368320b31db6 -r e21a3db643af pertrackaccumulator.h --- a/pertrackaccumulator.h Fri Aug 21 15:23:32 2009 +0000 +++ b/pertrackaccumulator.h Fri Aug 28 17:14:06 2009 +0000 @@ -8,13 +8,11 @@ unsigned int pointNN; unsigned int trackNN; std::map, T > *, adb_result_key_lt> *queues; - std::set< adb_result_t, adb_result_triple_lt > *set; }; template PerTrackAccumulator::PerTrackAccumulator(unsigned int pointNN, unsigned int trackNN) - : pointNN(pointNN), trackNN(trackNN), queues(0), set(0) { + : pointNN(pointNN), trackNN(trackNN), queues(0) { queues = new std::map, T > *, adb_result_key_lt>; - set = new std::set< adb_result_t, adb_result_triple_lt >; } template PerTrackAccumulator::~PerTrackAccumulator() { @@ -25,30 +23,23 @@ } delete queues; } - if(set) { - delete set; - } } template void PerTrackAccumulator::add_point(adb_result_t *r) { if(!isnan(r->dist)) { - if(set->find(*r) == set->end()) { - set->insert(*r); - - typename std::map< adb_result_t, std::priority_queue< adb_result_t, std::vector< adb_result_t >, T > *, adb_result_key_lt>::iterator it; - std::priority_queue< adb_result_t, std::vector< adb_result_t >, T > *queue; - it = queues->find(*r); - if(it == queues->end()) { - queue = new std::priority_queue< adb_result_t, std::vector< adb_result_t >, T >; - (*queues)[*r] = queue; - } else { - queue = (*it).second; - } - - queue->push(*r); - if(queue->size() > pointNN) { - queue->pop(); - } + typename std::map< adb_result_t, std::priority_queue< adb_result_t, std::vector< adb_result_t >, T > *, adb_result_key_lt>::iterator it; + std::priority_queue< adb_result_t, std::vector< adb_result_t >, T > *queue; + it = queues->find(*r); + if(it == queues->end()) { + queue = new std::priority_queue< adb_result_t, std::vector< adb_result_t >, T >; + (*queues)[*r] = queue; + } else { + queue = (*it).second; + } + + queue->push(*r); + if(queue->size() > pointNN) { + queue->pop(); } } } diff -r 368320b31db6 -r e21a3db643af query-indexed.cpp --- a/query-indexed.cpp Fri Aug 21 15:23:32 2009 +0000 +++ b/query-indexed.cpp Fri Aug 28 17:14:06 2009 +0000 @@ -54,7 +54,10 @@ r.dist = dist; r.qpos = qpos; r.ipos = spos; - qstate->accumulator->add_point(&r); + if(qstate->set->find(r) == qstate->set->end()) { + qstate->set->insert(r); + qstate->accumulator->add_point(&r); + } } } @@ -95,6 +98,8 @@ bool use_absolute_threshold = spec->refine.flags & ADB_REFINE_ABSOLUTE_THRESHOLD; double absolute_threshold = spec->refine.absolute_threshold; + qstate->set = new std::set< adb_result_t, adb_result_triple_lt >; + if(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES) { add_point_func = &audiodb_index_add_point_approximate; } else { @@ -163,6 +168,9 @@ if(!(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES)) { audiodb_query_queue_loop(adb, spec, qstate, query, &qpointers); } + + delete qstate->set; + // Clean up if(query_data) diff -r 368320b31db6 -r e21a3db643af query.cpp --- a/query.cpp Fri Aug 21 15:23:32 2009 +0000 +++ b/query.cpp Fri Aug 28 17:14:06 2009 +0000 @@ -477,7 +477,10 @@ r.dist = dist; r.qpos = pp.qpos; r.ipos = pp.spos; - qstate->accumulator->add_point(&r); + if(qstate->set->find(r) == qstate->set->end()) { + qstate->set->insert(r); + qstate->accumulator->add_point(&r); + } } } qstate->exact_evaluation_queue->pop();