Mercurial > hg > audiodb
view insert.cpp @ 302:74824093c1c4
Implement O((M+N) log(M+N)) duplicate key detection in batchinsert,
rather than naive O(N^2).
Note that I haven't measured the constants in those complexity
expressions; I am anticipating that 40000 is a sufficiently large N for
this to matter.
Write a test case for duplicate keys, too. Use 0037, since no-one else
seems to be writing tests, and everything is merged onto the trunk these
days anyway.
author | mas01cr |
---|---|
date | Mon, 04 Aug 2008 10:00:34 +0000 |
parents | 34ce7f7a177d |
children | 25572f1bd37f c93be2f3a674 |
line wrap: on
line source
#include "audioDB.h" bool audioDB::enough_per_file_space_free() { unsigned int fmaxfiles, tmaxfiles; unsigned int maxfiles; fmaxfiles = fileTableLength / O2_FILETABLE_ENTRY_SIZE; tmaxfiles = trackTableLength / O2_TRACKTABLE_ENTRY_SIZE; maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles; return(dbH->numFiles < maxfiles); } bool audioDB::enough_data_space_free(off_t size) { return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); } void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); write(dbfid, buffer, size); } void audioDB::insert(const char* dbName, const char* inFile) { forWrite = true; initTables(dbName, inFile); if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) error("Must use timestamps with timestamped database","use --times"); if(!usingPower && (dbH->flags & O2_FLAG_POWER)) error("Must use power with power-enabled database", dbName); if(!enough_per_file_space_free()) { error("Insert failed: no more room for metadata", inFile); } if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { error("Insert failed: no more room in database", inFile); } if(!key) key=inFile; // Linear scan of filenames check for pre-existing feature unsigned alreadyInserted=0; for(unsigned k=0; k<dbH->numFiles; k++) if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)+1)==0){ alreadyInserted=1; break; } if(alreadyInserted) { VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); return; } // Make a track index table of features to file indexes unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); if(!numVectors) { VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key); // CLEAN UP munmap(indata,statbuf.st_size); munmap(db,dbH->dbSize); close(infid); return; } strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)); off_t insertoffset = dbH->length;// Store current state // Check times status and insert times from file unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); double *timesdata = timesTable + 2*indexoffset; if(2*(indexoffset + numVectors) > timesTableLength) { error("out of space for times", key); } if (usingTimes) { insertTimeStamps(numVectors, timesFile, timesdata); } double *powerdata = powerTable + indexoffset; insertPowerData(numVectors, powerfd, powerdata); // Increment file count dbH->numFiles++; // Update Header information dbH->length+=(statbuf.st_size-sizeof(int)); // Update track to file index map memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned)); insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); // Norm the vectors on input if the database is already L2 normed if(dbH->flags & O2_FLAG_L2NORM) unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append // Report status status(dbName); VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int))); // Copy the header back to the database memcpy (db, dbH, sizeof(dbTableHeaderT)); // CLEAN UP munmap(indata,statbuf.st_size); close(infid); } void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) { assert(usingTimes); unsigned numtimes = 0; if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) { dbH->flags=dbH->flags|O2_FLAG_TIMES; } else if(!(dbH->flags & O2_FLAG_TIMES)) { error("Timestamp file used with non-timestamped database", timesFileName); } if(!timesFile->is_open()) { error("problem opening times file on timestamped database", timesFileName); } double timepoint, next; *timesFile >> timepoint; if (timesFile->eof()) { error("no entries in times file", timesFileName); } numtimes++; do { *timesFile >> next; if (timesFile->eof()) { break; } numtimes++; timesdata[0] = timepoint; timepoint = (timesdata[1] = next); timesdata += 2; } while (numtimes < numVectors + 1); if (numtimes < numVectors + 1) { error("too few timepoints in times file", timesFileName); } *timesFile >> next; if (!timesFile->eof()) { error("too many timepoints in times file", timesFileName); } } void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { if (usingPower) { if (!(dbH->flags & O2_FLAG_POWER)) { error("Cannot insert power data on non-power DB", dbName); } int one; unsigned int count; count = read(powerfd, &one, sizeof(unsigned int)); if (count != sizeof(unsigned int)) { error("powerfd read failed", "int", "read"); } if (one != 1) { error("dimensionality of power file not 1", powerFileName); } // FIXME: should check that the powerfile is the right size for // this. -- CSR, 2007-10-30 count = read(powerfd, powerdata, numVectors * sizeof(double)); if (count != numVectors * sizeof(double)) { error("powerfd read failed", "double", "read"); } } } void audioDB::batchinsert(const char* dbName, const char* inFile) { forWrite = true; initDBHeader(dbName); if(!key) key=inFile; std::ifstream *filesIn = 0; std::ifstream *keysIn = 0; std::ifstream* thisTimesFile = 0; int thispowerfd = 0; if(!(filesIn = new std::ifstream(inFile))) error("Could not open batch in file", inFile); if(key && key!=inFile) if(!(keysIn = new std::ifstream(key))) error("Could not open batch key file",key); if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) error("Must use timestamps with timestamped database","use --times"); if(!usingPower && (dbH->flags & O2_FLAG_POWER)) error("Must use power with power-enabled database", dbName); unsigned totalVectors=0; char *thisFile = new char[MAXSTR]; char *thisKey = 0; if (key && (key != inFile)) { thisKey = new char[MAXSTR]; } char *thisTimesFileName = new char[MAXSTR]; char *thisPowerFileName = new char[MAXSTR]; std::set<std::string> s; for (unsigned k = 0; k < dbH->numFiles; k++) { s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); } do { filesIn->getline(thisFile,MAXSTR); if(key && key!=inFile) { keysIn->getline(thisKey,MAXSTR); } else { thisKey = thisFile; } if(usingTimes) { timesFile->getline(thisTimesFileName,MAXSTR); } if(usingPower) { powerFile->getline(thisPowerFileName, MAXSTR); } if(filesIn->eof()) { break; } initInputFile(thisFile); if(!enough_per_file_space_free()) { error("batchinsert failed: no more room for metadata", thisFile); } if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { error("batchinsert failed: no more room in database", thisFile); } if(s.count(thisKey)) { VERB_LOG(0, "key already exists in database: %s\n", thisKey); } else { s.insert(thisKey); // Make a track index table of features to file indexes unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); if(!numVectors) { VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); } else{ if(usingTimes){ if(timesFile->eof()) { error("not enough timestamp files in timesList", timesFileName); } thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); if(!thisTimesFile->is_open()) { error("Cannot open timestamp file", thisTimesFileName); } off_t insertoffset = dbH->length; unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double)); double *timesdata = timesTable + 2*indexoffset; if(2*(indexoffset + numVectors) > timesTableLength) { error("out of space for times", key); } insertTimeStamps(numVectors, thisTimesFile, timesdata); if(thisTimesFile) delete thisTimesFile; } if (usingPower) { if(powerFile->eof()) { error("not enough power files in powerList", powerFileName); } thispowerfd = open(thisPowerFileName, O_RDONLY); if (thispowerfd < 0) { error("failed to open power file", thisPowerFileName); } off_t insertoffset = dbH->length; unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double)); double *powerdata = powerTable + poweroffset; insertPowerData(numVectors, thispowerfd, powerdata); if (0 < thispowerfd) { close(thispowerfd); } } strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)); off_t insertoffset = dbH->length;// Store current state // Increment file count dbH->numFiles++; // Update Header information dbH->length+=(statbuf.st_size-sizeof(int)); // Update track to file index map memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); // Norm the vectors on input if the database is already L2 normed if(dbH->flags & O2_FLAG_L2NORM) unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append totalVectors+=numVectors; // Copy the header back to the database memcpy (db, dbH, sizeof(dbTableHeaderT)); } } // CLEAN UP munmap(indata,statbuf.st_size); close(infid); } while(!filesIn->eof()); VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); delete [] thisPowerFileName; if(key && (key != inFile)) { delete [] thisKey; } delete [] thisFile; delete [] thisTimesFileName; delete filesIn; delete keysIn; // Report status status(dbName); }