Mercurial > hg > audiodb
diff insert.cpp @ 316:25572f1bd37f large_adb
Adding large_adb support (up to 1M tracks)
author | mas01mc |
---|---|
date | Tue, 19 Aug 2008 14:27:21 +0000 |
parents | 74824093c1c4 |
children | c270d9e4659a |
line wrap: on
line diff
--- a/insert.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/insert.cpp Tue Aug 19 14:27:21 2008 +0000 @@ -11,7 +11,7 @@ } bool audioDB::enough_data_space_free(off_t size) { - return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); + return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); } void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { @@ -23,6 +23,9 @@ forWrite = true; initTables(dbName, inFile); + if(dbH->flags & O2_FLAG_LARGE_ADB) + error("Single-feature inserts not allowed with LARGE audioDB instances"); + if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) error("Must use timestamps with timestamped database","use --times"); @@ -49,6 +52,7 @@ if(alreadyInserted) { VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); + // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08 return; } @@ -64,7 +68,7 @@ return; } - strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)); + INSERT_FILETABLE_STRING(fileTable, key); off_t insertoffset = dbH->length;// Store current state @@ -183,6 +187,12 @@ forWrite = true; initDBHeader(dbName); + // Treat large ADB instances differently + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + batchinsert_large_adb(dbName, inFile) ; + return; + } + if(!key) key=inFile; std::ifstream *filesIn = 0; @@ -289,8 +299,9 @@ close(thispowerfd); } } - strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)); - + + INSERT_FILETABLE_STRING(fileTable, thisKey); + off_t insertoffset = dbH->length;// Store current state // Increment file count @@ -301,7 +312,7 @@ // Update track to file index map memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); - + insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); // Norm the vectors on input if the database is already L2 normed @@ -334,3 +345,169 @@ // Report status status(dbName); } + + +// BATCHINSERT_LARGE_ADB +// +// This method inserts file pointers into the ADB instance rather than the actual feature data +// +// This method is intended for databases that are large enough to only support indexed query +// So exhaustive searching across all feature vectors will not be performed +// +// We insert featureFileName, [powerFileName], [timesFileName] +// +// l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time +// +// LIMITS: +// +// We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles +// +void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) { + + if(!key) + key=inFile; + std::ifstream *filesIn = 0; + std::ifstream *keysIn = 0; + std::ifstream* thisTimesFile = 0; + int thispowerfd = 0; + + if(!(filesIn = new std::ifstream(inFile))) + error("Could not open batch in file", inFile); + if(key && key!=inFile) + if(!(keysIn = new std::ifstream(key))) + error("Could not open batch key file",key); + + if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) + error("Must use timestamps with timestamped database","use --times"); + + if(!usingPower && (dbH->flags & O2_FLAG_POWER)) + error("Must use power with power-enabled database", dbName); + + unsigned totalVectors=0; + char *thisFile = new char[MAXSTR]; + char *thisKey = 0; + if (key && (key != inFile)) { + thisKey = new char[MAXSTR]; + } + char *thisTimesFileName = new char[MAXSTR]; + char *thisPowerFileName = new char[MAXSTR]; + + std::set<std::string> s; + + for (unsigned k = 0; k < dbH->numFiles; k++) { + s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); + } + + do { + filesIn->getline(thisFile,MAXSTR); + if(key && key!=inFile) { + keysIn->getline(thisKey,MAXSTR); + } else { + thisKey = thisFile; + } + if(usingTimes) { + timesFile->getline(thisTimesFileName,MAXSTR); + } + if(usingPower) { + powerFile->getline(thisPowerFileName, MAXSTR); + } + + if(filesIn->eof()) { + break; + } + + initInputFile(thisFile, false); + + if(!enough_per_file_space_free()) { + error("batchinsert failed: no more room for metadata", thisFile); + } + + if(s.count(thisKey)) { + VERB_LOG(0, "key already exists in database: %s\n", thisKey); + } else { + s.insert(thisKey); + // Make a track index table of features to file indexes + unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); + if(!numVectors) { + VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); + } + else{ + // Check that time-stamp file exists + if(usingTimes){ + if(timesFile->eof()) { + error("not enough timestamp files in timesList", timesFileName); + } + thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); + if(!thisTimesFile->is_open()) { + error("Cannot open timestamp file", thisTimesFileName); + } + if(thisTimesFile) + delete thisTimesFile; + } + + // Check that power file exists + if (usingPower) { + if(powerFile->eof()) { + error("not enough power files in powerList", powerFileName); + } + thispowerfd = open(thisPowerFileName, O_RDONLY); + if (thispowerfd < 0) { + error("failed to open power file", thisPowerFileName); + } + if (0 < thispowerfd) { + close(thispowerfd); + } + } + + // persist links to the feature files for reading from filesystem later + + // Primary Keys + INSERT_FILETABLE_STRING(fileTable, thisKey); + + // Feature Vector fileNames + INSERT_FILETABLE_STRING(dbH->dataOffset, thisFile); + + // Time Stamp fileNames + if(usingTimes) + INSERT_FILETABLE_STRING(dbH->timesTableOffset, thisTimesFileName); + + + // Power fileNames + if(usingPower) + INSERT_FILETABLE_STRING(dbH->powerTableOffset, thisPowerFileName); + + // Increment file count + dbH->numFiles++; + + // Update Header information + dbH->length+=(statbuf.st_size-sizeof(int)); + + // Update track to file index map + memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); + + totalVectors+=numVectors; + + // Copy the header back to the database + memcpy (db, dbH, sizeof(dbTableHeaderT)); + } + } + // CLEAN UP + munmap(indata,statbuf.st_size); + close(infid); + } while(!filesIn->eof()); + + VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); + + delete [] thisPowerFileName; + if(key && (key != inFile)) { + delete [] thisKey; + } + delete [] thisFile; + delete [] thisTimesFileName; + + delete filesIn; + delete keysIn; + + // Report status + status(dbName); +}