Mercurial > hg > audiodb
changeset 541:52d82badc544 multiprobeLSH
Added file caching for sparse datum reads. This required making a new type called adb_fd_cache_t and modifying the read methods audiodb_track_id_datum() and audiodb_insert_create_datum() to use the cache struct if one is provided by the user.
author | mas01mc |
---|---|
date | Sat, 07 Feb 2009 16:59:31 +0000 |
parents | 1bf090279174 |
children | 79ffab663ace |
files | audioDB-internals.h insert.cpp query.cpp |
diffstat | 3 files changed, 229 insertions(+), 46 deletions(-) [+] |
line wrap: on
line diff
--- a/audioDB-internals.h Sat Feb 07 12:12:46 2009 +0000 +++ b/audioDB-internals.h Sat Feb 07 16:59:31 2009 +0000 @@ -56,6 +56,18 @@ LSH *lsh; } adb_qstate_internal_t; +/* this struct is for caching file descriptors for multiple reads from a data file + */ + +typedef struct adb_fd_cache { + uint32_t track_id; + adb_reference_t* reference; + char* fname; + int data_fd; + int power_fd; + FILE* times_file; +} adb_fd_cache_t; + /* this struct is the in-memory representation of the binary * information stored at the head of each adb file */ typedef struct adbheader { @@ -290,9 +302,11 @@ } int audiodb_read_data(adb_t *, int, int, double **, size_t *); -int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size); -int audiodb_track_id_datum(adb_t *, uint32_t , adb_datum_t , off_t vector_offset, size_t vector_size); +int audiodb_insert_create_datum(adb_insert_t *, adb_datum_t *, off_t data_offset, size_t data_size, adb_fd_cache_t * cache); +int audiodb_track_id_datum(adb_t *, uint32_t , adb_datum_t , off_t vector_offset, size_t vector_size, adb_fd_cache_t * cache); int audiodb_free_datum(adb_datum_t *); +int audiodb_free_datum_cache(adb_fd_cache_t *); +int audiodb_free_datum_reference(adb_reference_t * reference); int audiodb_datum_qpointers(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *); int audiodb_datum_qpointers_partial(adb_datum_t *, uint32_t, double **, double **, adb_qpointers_internal_t *, adb_qstate_internal_t *); int audiodb_query_spec_qpointers(adb_t *, const adb_query_spec_t *, double **, double **, adb_qpointers_internal_t *);
--- a/insert.cpp Sat Feb 07 12:12:46 2009 +0000 +++ b/insert.cpp Sat Feb 07 16:59:31 2009 +0000 @@ -288,44 +288,145 @@ return 0; } -int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset=0, size_t data_size=0) { +int audiodb_free_datum_cache(adb_fd_cache_t *cache){ + if(cache){ + if(cache->fname){ + free(cache->fname); + cache->fname = NULL; + } + if(cache->data_fd){ + close(cache->data_fd); + cache->data_fd = 0; + } + if(cache->power_fd){ + close(cache->power_fd); + cache->power_fd = 0; + } + if(cache->times_file){ + fclose(cache->times_file); + cache->times_file = NULL; + } + if(cache->reference){ + audiodb_free_datum_reference(cache->reference); + cache->reference = NULL; + } + } + return 0; +} + +int audiodb_free_datum_reference(adb_reference_t * reference){ + if(reference){ + if(reference->features){ + free((char *)reference->features); + reference->features = 0; + } + if(reference->power){ + free((char *)reference->power); + reference->power = 0; + } + if(reference->times){ + free((char *)reference->times); + reference->times = 0; + } + } + return 0; +} + +int audiodb_insert_create_datum(adb_insert_t *insert, adb_datum_t *datum, off_t data_offset=0, size_t data_size=0, adb_fd_cache_t *cache=0) { int fd = 0; FILE *file = NULL; struct stat st; off_t size; + bool clear_cache = false; - datum->data = NULL; - datum->power = NULL; - datum->times = NULL; - if((fd = open(insert->features, O_RDONLY)) == -1) { - goto error; + if(!cache){ + datum->data = NULL; + datum->power = NULL; + datum->times = NULL; } + + // STEP 1 check if we need to clear the cache + if(cache && (cache->fname && strncmp(cache->fname, insert->features, strlen(insert->features))!=0)) + clear_cache = true; + + // STEP 2. Clear the cache if necessary + if(cache && clear_cache){ + close(cache->data_fd); + cache->data_fd = 0; + free(cache->fname); + cache->fname = 0; + } + + // STEP 3. Use the cached file descriptor or open a new file descriptor + if (cache && cache->data_fd ){ + fd = cache->data_fd; + } + else{ + if ((fd = open(insert->features, O_RDONLY)) == -1) { + goto error; + } + if(cache){ + cache->fname = (char*) malloc(strlen(insert->features)); + strncpy(cache->fname, insert->features, strlen(insert->features)); + } + } + if(fstat(fd, &st)) { goto error; } - read_or_goto_error(fd, &(datum->dim), sizeof(uint32_t)); + + // STEP 4. If file descriptor is new, read the dimensionality, maybe cache the file descriptor + if( !( cache && cache->data_fd ) ){ + read_or_goto_error(fd, &(datum->dim), sizeof(uint32_t)); + if(cache) + cache->data_fd = fd; + } + + // STEP 5. Allocate data memory if necessary, read the requested amount of data if(data_size) size = data_size; else size = st.st_size - sizeof(uint32_t); + datum->nvectors = size / (sizeof(double) * datum->dim); - datum->data = (double *) malloc(size); + + if(!datum->data){ + datum->data = (double *) malloc(size); + } + if(!datum->data) { goto error; } + if(data_offset) - lseek(fd, data_offset, SEEK_CUR); + lseek(fd, sizeof(uint32_t) + data_offset, SEEK_SET); read_or_goto_error(fd, datum->data, size); - close(fd); - fd = 0; + + // STEP 6. Close the file descriptor, unless we are caching it + if(!cache) + close(fd); + fd = 0; // we're done with the data + if(insert->power) { int dim; - if((fd = open(insert->power, O_RDONLY)) == -1) { + + // Clear the cache if necessary + if(clear_cache){ + close(cache->power_fd); + cache->power_fd = 0; + } + + // Use the cached file descriptor or open a new file descriptor + if (cache && cache->power_fd) + fd = cache->power_fd; + else if((fd = open(insert->power, O_RDONLY)) == -1) { goto error; } + if(fstat(fd, &st)) { goto error; } + /* This cast is so non-trivial that it deserves a comment. * * The data types in this expression, left to right, are: off_t, @@ -352,36 +453,71 @@ * * I hate C. */ + if( (!data_size) && ((off_t) (st.st_size - sizeof(uint32_t))) != (size / datum->dim)) { goto error; } - read_or_goto_error(fd, &dim, sizeof(uint32_t)); - if(dim != 1) { - goto error; + + // If file descriptor is new, read the dimensionality, maybe cache the file descriptor + if( !( cache && cache->power_fd ) ){ + read_or_goto_error(fd, &dim, sizeof(uint32_t)); + if(dim != 1) { + goto error; + } + if(cache) + cache->power_fd = fd; } - datum->power = (double *) malloc(size / datum->dim); + + // Allocate data memory if necessary, read the requested amount of data + if(!datum->power) + datum->power = (double *) malloc(size / datum->dim); if(!datum->power) { goto error; } + if(data_offset) - lseek(fd, data_offset/datum->dim, SEEK_CUR); + lseek(fd, sizeof(uint32_t) + data_offset/datum->dim, SEEK_SET); + read_or_goto_error(fd, datum->power, size / datum->dim); - close(fd); + + if(!cache) + close(fd); + fd = 0; } + if(insert->times) { double t, *tp; - if(!(file = fopen(insert->times, "r"))) { - goto error; + + // Clear the cache if necessary + if(clear_cache){ + fclose(cache->times_file); + cache->times_file = 0; } - datum->times = (double *) malloc(2 * size / datum->dim); + + // Use the cached file descriptor or open a new file descriptor and maybe cache + if (cache && cache->times_file) + file = cache->times_file; + else{ + if(!(file = fopen(insert->times, "r"))) { + goto error; + } + if(cache) + cache->times_file = file; + } + + // Allocate data memory if necessary, read the requested amount of data + if(!datum->times) + datum->times = (double *) malloc(2 * size / datum->dim); if(!datum->times) { goto error; } + + rewind(file); if(fscanf(file, " %lf", &t) != 1) { goto error; } if(data_offset) - while(data_offset--) + while(data_offset-- != 1 ) if(fscanf(file, " %lf", &t) != 1) goto error; tp = datum->times; @@ -397,7 +533,10 @@ goto error; } *tp = t; - fclose(file); + if(!cache){ + fclose(file); + file=0; + } } datum->key = insert->key ? insert->key : insert->features; return 0; @@ -410,8 +549,10 @@ fclose(file); } audiodb_free_datum(datum); + if(cache) + audiodb_free_datum_cache(cache); return 1; -} + } int audiodb_insert(adb_t *adb, adb_insert_t *insert) { if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) {
--- a/query.cpp Sat Feb 07 12:12:46 2009 +0000 +++ b/query.cpp Sat Feb 07 16:59:31 2009 +0000 @@ -198,26 +198,46 @@ return 1; } -int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0) { +int audiodb_track_id_datum(adb_t *adb, uint32_t track_id, adb_datum_t *d, off_t vector_offset=0, size_t num_vectors=0, adb_fd_cache_t* cache=0){ off_t track_offset = (*adb->track_offsets)[track_id]; + if(adb->header->flags & ADB_HEADER_FLAG_REFERENCES) { /* create a reference/insert, then use adb_insert_create_datum() */ - adb_reference_t reference = {0}; - char features[ADB_MAXSTR], power[ADB_MAXSTR], times[ADB_MAXSTR]; - lseek(adb->fd, adb->header->dataOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); - read_or_goto_error(adb->fd, features, ADB_MAXSTR); - reference.features = features; - if(adb->header->flags & ADB_HEADER_FLAG_POWER) { - lseek(adb->fd, adb->header->powerTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); - read_or_goto_error(adb->fd, power, ADB_MAXSTR); - reference.power = power; + adb_reference_t *reference = NULL; + if(! (cache && cache->reference) ){ + reference = (adb_reference_t *) malloc(sizeof(adb_reference_t)); + reference->features = (char*) malloc(ADB_MAXSTR*sizeof(char)); + if(adb->header->flags & ADB_HEADER_FLAG_POWER) + reference->power = (char*) malloc(ADB_MAXSTR*sizeof(char)); + if(adb->header->flags & ADB_HEADER_FLAG_TIMES) + reference->times = (char*)malloc(ADB_MAXSTR*sizeof(char)); + if(cache) + cache->reference = reference; } - if(adb->header->flags & ADB_HEADER_FLAG_TIMES) { - lseek(adb->fd, adb->header->timesTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); - read_or_goto_error(adb->fd, times, ADB_MAXSTR); - reference.times = times; + else + reference = cache->reference; + + if(! (cache && cache->track_id==track_id) ){ + if(cache) + cache->track_id = track_id; + lseek(adb->fd, adb->header->dataOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); + read_or_goto_error(adb->fd, (void *)reference->features, ADB_MAXSTR); + if(adb->header->flags & ADB_HEADER_FLAG_POWER) { + lseek(adb->fd, adb->header->powerTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); + read_or_goto_error(adb->fd, (void *)reference->power, ADB_MAXSTR); + } + if(adb->header->flags & ADB_HEADER_FLAG_TIMES) { + lseek(adb->fd, adb->header->timesTableOffset + track_id * ADB_FILETABLE_ENTRY_SIZE, SEEK_SET); + read_or_goto_error(adb->fd, (void *)reference->times, ADB_MAXSTR); + } } - return audiodb_insert_create_datum(&reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double)); + + int retval = audiodb_insert_create_datum(reference, d, vector_offset*adb->header->dim*sizeof(double), num_vectors*adb->header->dim*sizeof(double), cache); + if(!cache){ + audiodb_free_datum_reference(reference); + free(reference); + } + return retval; } else { /* initialize from sources of data that we already have */ if(num_vectors) @@ -485,17 +505,26 @@ double dist; double *dbdata = 0, *dbdata_pointer; Uns32T npairs = qstate->exact_evaluation_queue->size(); + Uns32T currentTrack = qstate->exact_evaluation_queue->top().trackID+1; // i.e. not first track #ifdef _LSH_DEBUG_ cout << "Num vector pairs to evaluate: " << npairs << "..." << endl; cout.flush(); #endif adb_datum_t d = {0}; + adb_fd_cache_t c = {0}; + c.track_id = currentTrack; while(npairs--) { PointPair pp = qstate->exact_evaluation_queue->top(); + if(pp.trackID != currentTrack){ + audiodb_free_datum(&d); + currentTrack = pp.trackID; + } maybe_delete_array(dbpointers.mean_duration); - if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length)) { + if(audiodb_track_id_datum(adb, pp.trackID, &d, pp.spos, sequence_length, &c)) { delete qstate->exact_evaluation_queue; delete qstate->set; + audiodb_free_datum(&d); + audiodb_free_datum_cache(&c); return 1; } @@ -503,6 +532,7 @@ delete qstate->exact_evaluation_queue; delete qstate->set; audiodb_free_datum(&d); + audiodb_free_datum_cache(&c); return 1; } @@ -535,13 +565,11 @@ } } qstate->exact_evaluation_queue->pop(); - audiodb_free_datum(&d); } // Cleanup - // maybe_delete_array(dbdata); - //maybe_delete_array(dbpointers.l2norm_data); - //maybe_delete_array(dbpointers.power_data); + audiodb_free_datum(&d); + audiodb_free_datum_cache(&c); maybe_delete_array(dbpointers.mean_duration); delete qstate->exact_evaluation_queue; delete qstate->set;