Mercurial > hg > audiodb
changeset 404:1fb8bee777e5 api-inversion
Begin working towards inverting audioDB::insert() / audiodb_insert().
New data type adb_datum_t, roughly corresponding to a "track" in
current audioDB parlance; it contains exactly the feature information
and metadata to record.
New function audiodb_insert_datum() to insert one of these
adb_datum_t objects into the database; the intention is that not
only can insertion of feature files be implemented in terms of this
function, but that it will be a useful function in its own right,
callable perhaps from PD, Max/MSP, and/or a VAMP plugin. This function
is complicated enough that it actually gets a comment.
Implement audioDB::insert() in terms of audiodb_insert_datum(), via a
wrapper which handles the slightly wacky error/non-error case of
attempting to insert features with a key that already exists in the
database.
Delete whole rafts of code. We can't quite delete everything because
there's batchinsert / batchinsert_large_adb to sort out; the good news
is that the batchinsert operation can simply be implemented as a loop
around audiodb_insert_datum() without loss of efficiency.
(There's also a stray extra audiodb_insert() in libtests/0027/, found
through an earlier iteration of this patch.)
author | mas01cr |
---|---|
date | Fri, 05 Dec 2008 22:32:43 +0000 |
parents | 7038f31124d1 |
children | ef4792df8f93 |
files | audioDB.h audioDB_API.h insert.cpp libtests/0027/prog1.c |
diffstat | 4 files changed, 240 insertions(+), 206 deletions(-) [+] |
line wrap: on
line diff
--- a/audioDB.h Wed Dec 03 17:40:17 2008 +0000 +++ b/audioDB.h Fri Dec 05 22:32:43 2008 +0000 @@ -358,9 +358,8 @@ void release_lock(int fd); void create(const char* dbName); bool enough_per_file_space_free(); - bool enough_data_space_free(off_t size); - void insert_data_vectors(off_t offset, void *buffer, size_t size); void insert(const char* dbName, const char* inFile); + void insertDatum(const char *inFile, std::ifstream *timesFile, int powerfd, const char *key); void batchinsert(const char* dbName, const char* inFile); void batchinsert_large_adb(const char* dbName, const char* inFile); void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0);
--- a/audioDB_API.h Wed Dec 03 17:40:17 2008 +0000 +++ b/audioDB_API.h Fri Dec 05 22:32:43 2008 +0000 @@ -21,6 +21,16 @@ that we should prefer "audiodb_" */ typedef struct adb adb_t, *adb_ptr; +struct adb_datum { + uint32_t nvectors; + uint32_t dim; + const char *key; + double *data; + double *power; + double *times; +}; +typedef struct adb_datum adb_datum_t; + //used for both insert and batchinsert struct adbinsert { @@ -102,6 +112,7 @@ int audiodb_power(adb_ptr mydb); /* insert functions */ +int audiodb_insert_datum(adb_t *, adb_datum_t *); int audiodb_insert(adb_ptr mydb, adb_insert_ptr ins); int audiodb_batchinsert(adb_ptr mydb, adb_insert_ptr ins, unsigned int size);
--- a/insert.cpp Wed Dec 03 17:40:17 2008 +0000 +++ b/insert.cpp Fri Dec 05 22:32:43 2008 +0000 @@ -1,4 +1,162 @@ #include "audioDB.h" +extern "C" { +#include "audioDB_API.h" +} +#include "audioDB-internals.h" + +static bool audiodb_enough_data_space_free(adb_t *adb, off_t size) { + adb_header_t *header = adb->header; + /* FIXME: timesTableOffset isn't necessarily the next biggest offset + after dataOffset. Maybe make the offsets into an array that we + can iterate over... */ + return (header->timesTableOffset > + header->dataOffset + header->length + size); +} + +static bool audiodb_enough_per_file_space_free(adb_t *adb) { + /* FIXME: the comment above about the ordering of the tables applies + here too. */ + adb_header_t *header = adb->header; + off_t file_table_length = header->trackTableOffset - header->fileTableOffset; + off_t track_table_length = header->dataOffset - header->trackTableOffset; + int fmaxfiles = file_table_length / O2_FILETABLE_ENTRY_SIZE; + int tmaxfiles = track_table_length / O2_TRACKTABLE_ENTRY_SIZE; + /* maxfiles is the _minimum_ of the two. Do not be confused... */ + unsigned int maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles; + return (header->numFiles < maxfiles); +} + +/* + * Hey, look, a comment. Normally I wouldn't bother, as the code + * should be self-documenting, but a lot of logic is concentrated in + * this one place, so let's give an overview beforehand. To insert a + * datum into the database, we: + * + * 1. check write permission; + * 2. check !O2_FLAG_LARGE_ADB; + * 3. check for enough space; + * 4. check that datum->dim and adb->header->dim agree (or that the + * header dimension is zero, in which case write datum->dim to + * adb->header->dim). + * 5. check for presence of datum->key in adb->keys; + * 6. check for consistency between power and O2_FLAG_POWER, and + * times and O2_FLAG_TIMES; + * 7. write in data, power, times as appropriate; add to track + * and key tables too; + * 8. 
if O2_FLAG_L2NORM, compute norms and fill in table; + * 9. update adb->keys and adb->header; + * 10. sync adb->header with disk. + * + * Step 10 essentially commits the transaction; until we update + * header->length, nothing will recognize the newly-written data. + * In principle, if it fails, we should roll back, which we can in + * fact do on the assumption that nothing in step 9 can ever fail; + * on the other hand, if it's failed, then it's unlikely that + * rolling back by syncing the original header back to disk is going + * to work desperately well. + */ +int audiodb_insert_datum(adb_t *adb, adb_datum_t *datum) { + + off_t size, offset, nfiles; + double *l2norm_buffer, *lp, *dp; + + /* 1. check write permission; */ + if(!(adb->flags & O_RDWR)) { + return 1; + } + /* 2. check !O2_FLAG_LARGE_ADB; */ + if(adb->header->flags & O2_FLAG_LARGE_ADB) { + return 1; + } + /* 3. check for enough space; */ + size = sizeof(double) * datum->nvectors * datum->dim; + if(!audiodb_enough_data_space_free(adb, size)) { + return 1; + } + if(!audiodb_enough_per_file_space_free(adb)) { + return 1; + } + /* 4. check that datum->dim and adb->header->dim agree (or that the + * header dimension is zero, in which case write datum->dim to + * adb->header->dim). + */ + if(adb->header->dim == 0) { + adb->header->dim = datum->dim; + } else if (adb->header->dim != datum->dim) { + return 1; + } + /* 5. check for presence of datum->key in adb->keys; */ + if(adb->keys->count(datum->key)) { + /* not part of an explicit API/ABI, but we need a distinguished + value in this circumstance to preserve somewhat wonky behaviour + of audioDB::batchinsert. */ + return 2; + } + /* 6. 
check for consistency between power and O2_FLAG_POWER, and + * times and O2_FLAG_TIMES; + */ + if((datum->power && !(adb->header->flags & O2_FLAG_POWER)) || + ((adb->header->flags & O2_FLAG_POWER) && !datum->power)) { + return 1; + } + if(datum->times && !(adb->header->flags & O2_FLAG_TIMES)) { + if(adb->header->numFiles == 0) { + adb->header->flags |= O2_FLAG_TIMES; + } else { + return 1; + } + } else if ((adb->header->flags & O2_FLAG_TIMES) && !datum->times) { + return 1; + } + /* 7. write in data, power, times as appropriate; add to track + * and key tables too; + */ + offset = adb->header->length; + nfiles = adb->header->numFiles; + + /* FIXME: checking for all these lseek()s and write()s */ + lseek(adb->fd, adb->header->dataOffset + offset, SEEK_SET); + write(adb->fd, datum->data, sizeof(double) * datum->nvectors * datum->dim); + if(datum->power) { + lseek(adb->fd, adb->header->powerTableOffset + offset / datum->dim, SEEK_SET); + write(adb->fd, datum->power, sizeof(double) * datum->nvectors); + } + if(datum->times) { + lseek(adb->fd, adb->header->timesTableOffset + offset / datum->dim * 2, SEEK_SET); + write(adb->fd, datum->times, sizeof(double) * datum->nvectors * 2); + } + lseek(adb->fd, adb->header->trackTableOffset + nfiles * O2_TRACKTABLE_ENTRY_SIZE, SEEK_SET); + write(adb->fd, &datum->nvectors, O2_TRACKTABLE_ENTRY_SIZE); + lseek(adb->fd, adb->header->fileTableOffset + nfiles * O2_FILETABLE_ENTRY_SIZE, SEEK_SET); + write(adb->fd, datum->key, strlen(datum->key)+1); + + /* 8. 
if O2_FLAG_L2NORM, compute norms and fill in table; */ + l2norm_buffer = (double *) malloc(datum->nvectors * sizeof(double)); + + /* FIXME: shared code with audiodb_norm_existing() */ + dp = datum->data; + lp = l2norm_buffer; + for(size_t i = 0; i < datum->nvectors; i++) { + *lp = 0; + for(unsigned int k = 0; k < datum->dim; k++) { + *lp += (*dp)*(*dp); + dp++; + } + lp++; + } + lseek(adb->fd, adb->header->l2normTableOffset + offset / datum->dim, SEEK_SET); + write(adb->fd, l2norm_buffer, sizeof(double) * datum->nvectors); + free(l2norm_buffer); + + adb->keys->insert(datum->key); + adb->header->numFiles += 1; + adb->header->length += sizeof(double) * datum->nvectors * datum->dim; + + return audiodb_sync_header(adb); + + error: + return 1; +} bool audioDB::enough_per_file_space_free() { unsigned int fmaxfiles, tmaxfiles; @@ -10,110 +168,66 @@ return(dbH->numFiles < maxfiles); } -bool audioDB::enough_data_space_free(off_t size) { - return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); -} - -void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { - if(lseek(dbfid, dbH->dataOffset + offset, SEEK_SET) == (off_t) -1) { - error("error seeking to offset", "", "lseek"); +void audioDB::insertDatum(const char *inFile, std::ifstream *timesFile, int powerfd, const char *key) { + adb_datum_t datum; + int fd; + struct stat statbuf; + off_t size; + int err; + + datum.times = 0; + datum.power = 0; + + if((fd = open(inFile, O_RDONLY)) == -1) { + error("failed to open input file", inFile); } - CHECKED_WRITE(dbfid, buffer, size); + if(fstat(fd, &statbuf)) { + error("failed to stat input file", inFile); + } + read(fd, &(datum.dim), sizeof(uint32_t)); + size = statbuf.st_size - sizeof(uint32_t); + datum.nvectors = size / (sizeof(double) * datum.dim); + datum.data = (double *) malloc(size); + if(!datum.data) { + error("failed to allocate memory"); + } + read(fd, datum.data, size); + close(fd); + if(timesFile) { + datum.times = (double *) 
malloc(sizeof(double) * datum.nvectors * 2); + if(!datum.times) { + error("failed to allocate memory"); + } + insertTimeStamps(datum.nvectors, timesFile, datum.times); + } + if(powerfd) { + datum.power = (double *) malloc(sizeof(double) * datum.nvectors); + if(!datum.power) { + error("failed to allocate memory"); + } + insertPowerData(datum.nvectors, powerfd, datum.power); + } + datum.key = key ? key : inFile; + err = audiodb_insert_datum(adb, &datum); + if(err && (err != 2)) { + error("failed to insert data for file", inFile); + } } void audioDB::insert(const char* dbName, const char* inFile) { - forWrite = true; - initTables(dbName, inFile); - - if(dbH->flags & O2_FLAG_LARGE_ADB) - error("Single-feature inserts not allowed with LARGE audioDB instances"); - - if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - error("Must use timestamps with timestamped database","use --times"); - - if(!usingPower && (dbH->flags & O2_FLAG_POWER)) - error("Must use power with power-enabled database", dbName); - - if(!enough_per_file_space_free()) { - error("Insert failed: no more room for metadata", inFile); + if(!adb) { + if(!(adb = audiodb_open(dbName, O_RDWR))) { + error("failed to open database", dbName); + } } - - if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { - error("Insert failed: no more room in database", inFile); + if(adb->header->flags & O2_FLAG_LARGE_ADB) { + + } else { + /* at this point, we have powerfd (an fd), timesFile (a + * std::ifstream *) and inFile (a char *). Wacky, huh? */ + insertDatum(inFile, timesFile, powerfd, key); } - - if(!key) - key=inFile; - // Linear scan of filenames check for pre-existing feature - unsigned alreadyInserted=0; - for(unsigned k=0; k<dbH->numFiles; k++) - if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)+1)==0){ - alreadyInserted=1; - break; - } - - if(alreadyInserted) { - VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); - // FIXME: Do we need to munmap here (see below) ? 
MKC 18/08/08 - return; - } - - // Make a track index table of features to file indexes - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - if(!numVectors) { - VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key); - - // CLEAN UP - munmap(indata,statbuf.st_size); - munmap(db,dbH->dbSize); - close(infid); - return; - } - - INSERT_FILETABLE_STRING(fileTable, key); - - off_t insertoffset = dbH->length;// Store current state - - // Check times status and insert times from file - unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); - double *timesdata = timesTable + 2*indexoffset; - - if(2*(indexoffset + numVectors) > timesTableLength) { - error("out of space for times", key); - } - - if (usingTimes) { - insertTimeStamps(numVectors, timesFile, timesdata); - } - - double *powerdata = powerTable + indexoffset; - insertPowerData(numVectors, powerfd, powerdata); - - // Increment file count - dbH->numFiles++; - - // Update Header information - dbH->length+=(statbuf.st_size-sizeof(int)); - - // Update track to file index map - memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned)); - - insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); - - // Norm the vectors on input if the database is already L2 normed - if(dbH->flags & O2_FLAG_L2NORM) - unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors); - - // Report status status(dbName); - VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int))); - - // Copy the header back to the database - memcpy (db, dbH, sizeof(dbTableHeaderT)); - - // CLEAN UP - munmap(indata,statbuf.st_size); - close(infid); } void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) { @@ -121,12 +235,6 @@ unsigned numtimes = 0; - if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) { - dbH->flags=dbH->flags|O2_FLAG_TIMES; - } else 
if(!(dbH->flags & O2_FLAG_TIMES)) { - error("Timestamp file used with non-timestamped database", timesFileName); - } - if(!timesFile->is_open()) { error("problem opening times file on timestamped database", timesFileName); } @@ -160,10 +268,6 @@ void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { if(usingPower){ - if (!(dbH->flags & O2_FLAG_POWER)) { - error("Cannot insert power data on non-power DB", dbName); - } - int one; unsigned int count; @@ -185,7 +289,6 @@ } void audioDB::batchinsert(const char* dbName, const char* inFile) { - forWrite = true; initDBHeader(dbName); @@ -207,12 +310,6 @@ if(key && key!=inFile) if(!(keysIn = new std::ifstream(key))) error("Could not open batch key file",key); - - if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - error("Must use timestamps with timestamped database","use --times"); - - if(!usingPower && (dbH->flags & O2_FLAG_POWER)) - error("Must use power with power-enabled database", dbName); unsigned totalVectors=0; char *thisFile = new char[MAXSTR]; @@ -223,12 +320,6 @@ char *thisTimesFileName = new char[MAXSTR]; char *thisPowerFileName = new char[MAXSTR]; - std::set<std::string> s; - - for (unsigned k = 0; k < dbH->numFiles; k++) { - s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); - } - do { filesIn->getline(thisFile,MAXSTR); if(key && key!=inFile) { @@ -246,93 +337,26 @@ if(filesIn->eof()) { break; } - initInputFile(thisFile); - - if(!enough_per_file_space_free()) { - error("batchinsert failed: no more room for metadata", thisFile); + if(usingTimes){ + if(timesFile->eof()) { + error("not enough timestamp files in timesList", timesFileName); + } + thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); } - - if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { - error("batchinsert failed: no more room in database", thisFile); - } - - if(s.count(thisKey)) { - VERB_LOG(0, "key already exists in database: %s\n", thisKey); - } else { - s.insert(thisKey); - // Make a track 
index table of features to file indexes - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - if(!numVectors) { - VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); + if (usingPower) { + if(powerFile->eof()) { + error("not enough power files in powerList", powerFileName); } - else{ - if(usingTimes){ - if(timesFile->eof()) { - error("not enough timestamp files in timesList", timesFileName); - } - thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); - if(!thisTimesFile->is_open()) { - error("Cannot open timestamp file", thisTimesFileName); - } - off_t insertoffset = dbH->length; - unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double)); - double *timesdata = timesTable + 2*indexoffset; - if(2*(indexoffset + numVectors) > timesTableLength) { - error("out of space for times", key); - } - insertTimeStamps(numVectors, thisTimesFile, timesdata); - if(thisTimesFile) - delete thisTimesFile; - } - - if (usingPower) { - if(powerFile->eof()) { - error("not enough power files in powerList", powerFileName); - } - thispowerfd = open(thisPowerFileName, O_RDONLY); - if (thispowerfd < 0) { - error("failed to open power file", thisPowerFileName); - } - off_t insertoffset = dbH->length; - unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double)); - double *powerdata = powerTable + poweroffset; - insertPowerData(numVectors, thispowerfd, powerdata); - if (0 < thispowerfd) { - close(thispowerfd); - } - } - - INSERT_FILETABLE_STRING(fileTable, thisKey); - - off_t insertoffset = dbH->length;// Store current state - - // Increment file count - dbH->numFiles++; - - // Update Header information - dbH->length+=(statbuf.st_size-sizeof(int)); - - // Update track to file index map - memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); - - insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); - - // Norm the vectors on input if the database is already L2 normed - 
if(dbH->flags & O2_FLAG_L2NORM) - unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors); - - totalVectors+=numVectors; - - // Copy the header back to the database - memcpy (db, dbH, sizeof(dbTableHeaderT)); + thispowerfd = open(thisPowerFileName, O_RDONLY); + if (thispowerfd < 0) { + error("failed to open power file", thisPowerFileName); } } - - // CLEAN UP - munmap(indata,statbuf.st_size); - indata = NULL; - close(infid); - infid = 0; + insertDatum(thisFile, thisTimesFile, thispowerfd, thisKey); + if(thisTimesFile) + delete thisTimesFile; + if(thispowerfd) + close(thispowerfd); } while(!filesIn->eof()); VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
--- a/libtests/0027/prog1.c Wed Dec 03 17:40:17 2008 +0000 +++ b/libtests/0027/prog1.c Fri Dec 05 22:32:43 2008 +0000 @@ -76,7 +76,7 @@ //${AUDIODB} -d testdb -I -f testfeature -w testpower myinsert.features="testfeature"; myinsert.power="testpower"; - myerr=audiodb_insert(mydbp,&myinsert); + //myerr=audiodb_insert(mydbp,&myinsert); if (audiodb_insert(mydbp,&myinsert)){ returnval=-1; }