# HG changeset patch # User mas01mc # Date 1233969605 0 # Node ID 06ed85832c3bb1c424a006e8bb8d1bcebb91a607 # Parent 02e0a9ecfd0ff4188c8dfa1276df93ed28d64689 Optimized the query_loop_points inner loop for memcpy and I/O efficiency. Uses sparse seeks and reads to perform scattered reads across data set. Current version does not cache fid between open calls to the same trackID. diff -r 02e0a9ecfd0f -r 06ed85832c3b audioDB-internals.h --- a/audioDB-internals.h Fri Feb 06 21:08:35 2009 +0000 +++ b/audioDB-internals.h Sat Feb 07 01:20:05 2009 +0000 @@ -290,8 +290,8 @@ } int audiodb_read_data(adb_t *, int, int, double **, size_t *); -int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *); -int audiodb_track_id_datum(adb_t *, uint32_t, adb_datum_t *); +int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size); +int audiodb_track_id_datum(adb_t *, uint32_t , adb_datum_t , off_t vector_offset, size_t vector_size); int audiodb_free_datum(adb_datum_t *); int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *); int audiodb_datum_qpointers_partial(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *, adb_qstate_internal_t *); diff -r 02e0a9ecfd0f -r 06ed85832c3b insert.cpp --- a/insert.cpp Fri Feb 06 21:08:35 2009 +0000 +++ b/insert.cpp Sat Feb 07 01:20:05 2009 +0000 @@ -288,7 +288,7 @@ return 0; } -int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum) { +int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset=0, size_t data_size=0) { int fd = 0; FILE *file = NULL; struct stat st; @@ -304,12 +304,17 @@ goto error; } read_or_goto_error(fd, &(datum->dim), sizeof(uint32_t)); - size = st.st_size - sizeof(uint32_t); + if(data_size) + size = data_size; + else + size = st.st_size - sizeof(uint32_t); datum->nvectors = size / (sizeof(double) * datum->dim); datum->data = (double *) malloc(size); if(!datum->data) { goto error; } + if(data_offset) + lseek(fd, data_offset, SEEK_CUR); read_or_goto_error(fd, datum->data, size); close(fd); fd = 0; @@ -347,7 +352,7 @@ * * I hate C. */ - if(((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) { + if( (!data_size) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) { goto error; } read_or_goto_error(fd, &dim, sizeof(uint32_t)); @@ -358,6 +363,8 @@ if(!datum->power) { goto error; } + if(data_offset) + lseek(fd, data_offset/datum->dim, SEEK_CUR); read_or_goto_error(fd, datum->power, size / datum->dim); close(fd); } @@ -373,6 +380,10 @@ if(fscanf(file, " %lf", &t) != 1) { goto error; } + if(data_offset) + while(data_offset-- > 1) + if(fscanf(file, " %lf", &t) != 1) + goto error; tp = datum->times; *tp++ = t; for(unsigned int n = 0; n < datum->nvectors - 1; n++) { diff -r 02e0a9ecfd0f -r 06ed85832c3b query.cpp --- a/query.cpp Fri Feb 06 21:08:35 2009 +0000 +++ b/query.cpp Sat Feb 07 01:20:05 2009 +0000 @@ -198,7 +198,7 @@ return 1; } -int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d) { +int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0) { off_t track_offset = (*adb->track_offsets)[track_id]; if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) { /* create a reference/insert, then use adb_insert_create_datum() */ @@ -217,24 +217,27 @@ read_or_goto_error(adb->fd, times, ADB_MAXSTR); reference.times = times; } - return audiodb_insert_create_datum(&reference, d); + return audiodb_insert_create_datum(&reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double)); } else { /* initialize from sources of data that we already have */ - d->nvectors = (*adb->track_lengths)[track_id]; + if(num_vectors) + d->nvectors = num_vectors; + else + d->nvectors = (*adb->track_lengths)[track_id]; d->dim = adb->header->dim; d->key = (*adb->keys)[track_id].c_str(); /* read out stuff from the database tables */ d->data = (double *) malloc(d->nvectors * d->dim * sizeof(double)); - lseek(adb->fd, adb->header->dataOffset + track_offset, SEEK_SET); + lseek(adb->fd, adb->header->dataOffset + track_offset + vector_offset*d->dim*sizeof(double), SEEK_SET); read_or_goto_error(adb->fd, d->data, d->nvectors * d->dim * sizeof(double)); if(adb->header->flags & ADB_HEADER_FLAG_POWER) { d->power = (double *) malloc(d->nvectors * sizeof(double)); - lseek(adb->fd, adb->header->powerTableOffset + track_offset / d->dim, SEEK_SET); + lseek(adb->fd, adb->header->powerTableOffset + track_offset / d->dim + vector_offset*sizeof(double), SEEK_SET); read_or_goto_error(adb->fd, d->power, d->nvectors * sizeof(double)); } if(adb->header->flags & ADB_HEADER_FLAG_TIMES) { d->times = (double *) malloc(2 * d->nvectors * sizeof(double)); - lseek(adb->fd, adb->header->timesTableOffset + track_offset / d->dim, SEEK_SET); + lseek(adb->fd, adb->header->timesTableOffset + track_offset / d->dim + 2 * vector_offset*sizeof(double), SEEK_SET); read_or_goto_error(adb->fd, d->times, 2 * d->nvectors * sizeof(double)); } return 0; @@ -285,34 +288,19 @@ adb_qstate_internal_t *qstate){ uint32_t nvectors = d->nvectors; qpointers->nvectors = nvectors; - std::priority_queue, greater > ppairs(*qstate->exact_evaluation_queue); - size_t vector_size = nvectors * sizeof(double) * d->dim; - - if(d->power) - qpointers->power_data = new double[vector_size / d->dim]; - - uint32_t seq_len_dbl = sequence_length*sizeof(double); - PointPair pp = ppairs.top(); - uint32_t tid = pp.trackID; - - while( !ppairs.empty() && pp.trackID==tid){ - uint32_t spos = pp.spos; + PointPair pp = (*qstate->exact_evaluation_queue).top(); #ifdef _LSH_DEBUG_ - cout << "tid=" << pp.trackID << " qpos=" << pp.qpos << " spos=" << pp.spos << endl; - cout.flush(); + cout << "tid=" << pp.trackID << " qpos=" << pp.qpos << " spos=" << pp.spos << endl; + cout.flush(); #endif - - if(d->power) { - memcpy(qpointers->power_data+spos, d->power+spos, seq_len_dbl); - audiodb_sequence_sum(qpointers->power_data+spos, sequence_length, sequence_length); - audiodb_sequence_average(qpointers->power_data+spos, sequence_length, sequence_length); - } - ppairs.pop(); - if(!ppairs.empty()) - pp = ppairs.top(); + + if(d->power) { + //memcpy(qpointers->power_data, d->power, seq_len_dbl); + audiodb_sequence_sum(d->power, sequence_length, sequence_length); + audiodb_sequence_average(d->power, sequence_length, sequence_length); } - + if(d->times) { qpointers->mean_duration = new double[1]; *qpointers->mean_duration = 0; @@ -321,11 +309,11 @@ } *qpointers->mean_duration /= nvectors; } - + *vector = d->data; *vector_data = d->data; qpointers->l2norm = 0 ; - qpointers->power = qpointers->power_data; + qpointers->power = d->power; return 0; } @@ -496,7 +484,6 @@ */ double dist; double *dbdata = 0, *dbdata_pointer; - Uns32T currentTrack = 0x80000000; // KLUDGE: Initialize with a value outside of track index range Uns32T npairs = qstate->exact_evaluation_queue->size(); #ifdef _LSH_DEBUG_ cout << "Num vector pairs to evaluate: " << npairs << "..." << endl; @@ -505,33 +492,28 @@ adb_datum_t d = {0}; while(npairs--) { PointPair pp = qstate->exact_evaluation_queue->top(); - if(currentTrack != pp.trackID) { - maybe_delete_array(dbpointers.power_data); - maybe_delete_array(dbpointers.mean_duration); - currentTrack = pp.trackID; + maybe_delete_array(dbpointers.mean_duration); + if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length)) { + delete qstate->exact_evaluation_queue; + delete qstate->set; + return 1; + } + + if(audiodb_datum_qpointers_partial(&d, sequence_length, &dbdata, &dbdata_pointer, &dbpointers, qstate)) { + delete qstate->exact_evaluation_queue; + delete qstate->set; audiodb_free_datum(&d); - if(audiodb_track_id_datum(adb, pp.trackID, &d)) { - delete qstate->exact_evaluation_queue; - delete qstate->set; - return 1; - } + return 1; + } - if(audiodb_datum_qpointers_partial(&d, sequence_length, &dbdata, &dbdata_pointer, &dbpointers, qstate)) { - delete qstate->exact_evaluation_queue; - delete qstate->set; - audiodb_free_datum(&d); - return 1; - } - } Uns32T qPos = (spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) ? pp.qpos : 0; - Uns32T sPos = pp.spos; // index into l2norm table // Test power thresholds before computing distance - if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[sPos])) && - ( qPosnvectors-sequence_length+1 && sPos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){ + if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[0])) && + ( qPosnvectors-sequence_length+1 && pp.spos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){ // Compute distance - dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length); + dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata, adb->header->dim*sequence_length); double qn = audiodb_dot_product(query + qPos*adb->header->dim, query + qPos*adb->header->dim, adb->header->dim*sequence_length); - double sn = audiodb_dot_product(dbdata + sPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length); + double sn = audiodb_dot_product(dbdata, dbdata, adb->header->dim*sequence_length); qn = sqrt(qn); sn = sqrt(sn); switch(spec->params.distance) { @@ -553,13 +535,13 @@ } } qstate->exact_evaluation_queue->pop(); + audiodb_free_datum(&d); } // Cleanup - audiodb_free_datum(&d); // maybe_delete_array(dbdata); //maybe_delete_array(dbpointers.l2norm_data); - maybe_delete_array(dbpointers.power_data); + //maybe_delete_array(dbpointers.power_data); maybe_delete_array(dbpointers.mean_duration); delete qstate->exact_evaluation_queue; delete qstate->set;