mas01cr@239: #include "audioDB.h" mas01cr@239: mas01cr@251: bool audioDB::enough_per_file_space_free() { mas01cr@251: unsigned int fmaxfiles, tmaxfiles; mas01cr@251: unsigned int maxfiles; mas01cr@251: mas01cr@256: fmaxfiles = fileTableLength / O2_FILETABLE_ENTRY_SIZE; mas01cr@256: tmaxfiles = trackTableLength / O2_TRACKTABLE_ENTRY_SIZE; mas01cr@251: maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles; mas01cr@251: return(dbH->numFiles < maxfiles); mas01cr@251: } mas01cr@251: mas01cr@239: bool audioDB::enough_data_space_free(off_t size) { mas01mc@316: return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); mas01cr@239: } mas01cr@239: mas01cr@239: void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { mas01cr@239: lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); mas01cr@239: write(dbfid, buffer, size); mas01cr@239: } mas01cr@239: mas01cr@239: void audioDB::insert(const char* dbName, const char* inFile) { mas01cr@239: forWrite = true; mas01cr@239: initTables(dbName, inFile); mas01cr@239: mas01mc@316: if(dbH->flags & O2_FLAG_LARGE_ADB) mas01mc@316: error("Single-feature inserts not allowed with LARGE audioDB instances"); mas01mc@316: mas01cr@239: if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) mas01cr@239: error("Must use timestamps with timestamped database","use --times"); mas01cr@239: mas01cr@239: if(!usingPower && (dbH->flags & O2_FLAG_POWER)) mas01cr@239: error("Must use power with power-enabled database", dbName); mas01cr@239: mas01cr@251: if(!enough_per_file_space_free()) { mas01cr@251: error("Insert failed: no more room for metadata", inFile); mas01cr@251: } mas01cr@251: mas01cr@239: if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { mas01cr@239: error("Insert failed: no more room in database", inFile); mas01cr@239: } mas01cr@239: mas01cr@239: if(!key) mas01cr@239: key=inFile; mas01cr@239: // Linear scan of filenames check for pre-existing feature mas01cr@239: unsigned alreadyInserted=0; mas01cr@239: for(unsigned k=0; knumFiles; k++) mas01cr@256: if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)+1)==0){ mas01cr@239: alreadyInserted=1; mas01cr@239: break; mas01cr@239: } mas01cr@239: mas01cr@239: if(alreadyInserted) { mas01cr@239: VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); mas01mc@316: // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08 mas01cr@239: return; mas01cr@239: } mas01cr@239: mas01cr@239: // Make a track index table of features to file indexes mas01cr@239: unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); mas01cr@239: if(!numVectors) { mas01cr@239: VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key); mas01cr@239: mas01cr@239: // CLEAN UP mas01cr@239: munmap(indata,statbuf.st_size); mas01cr@239: munmap(db,dbH->dbSize); mas01cr@239: close(infid); mas01cr@239: return; mas01cr@239: } mas01cr@239: mas01mc@316: INSERT_FILETABLE_STRING(fileTable, key); mas01cr@239: mas01cr@239: off_t insertoffset = dbH->length;// Store current state mas01cr@239: mas01cr@239: // Check times status and insert times from file mas01cr@239: unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); mas01cr@239: double *timesdata = timesTable + 2*indexoffset; mas01cr@239: mas01cr@239: if(2*(indexoffset + numVectors) > timesTableLength) { mas01cr@239: error("out of space for times", key); mas01cr@239: } mas01cr@239: mas01cr@239: if (usingTimes) { mas01cr@239: insertTimeStamps(numVectors, timesFile, timesdata); mas01cr@239: } mas01cr@239: mas01cr@239: double *powerdata = powerTable + indexoffset; mas01cr@239: insertPowerData(numVectors, powerfd, powerdata); mas01cr@239: mas01cr@239: // Increment file count mas01cr@239: dbH->numFiles++; mas01cr@239: mas01cr@239: // Update Header information mas01cr@239: dbH->length+=(statbuf.st_size-sizeof(int)); mas01cr@239: mas01cr@239: // Update track to file index map mas01cr@239: memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned)); mas01cr@239: mas01cr@239: insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); mas01cr@239: mas01cr@239: // Norm the vectors on input if the database is already L2 normed mas01cr@239: if(dbH->flags & O2_FLAG_L2NORM) mas01cr@239: unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append mas01cr@239: mas01cr@239: // Report status mas01cr@239: status(dbName); mas01cr@239: VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int))); mas01cr@239: mas01cr@239: // Copy the header back to the database mas01cr@239: memcpy (db, dbH, sizeof(dbTableHeaderT)); mas01cr@239: mas01cr@239: // CLEAN UP mas01cr@239: munmap(indata,statbuf.st_size); mas01cr@239: close(infid); mas01cr@239: } mas01cr@239: mas01cr@239: void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) { mas01cr@239: assert(usingTimes); mas01cr@239: mas01cr@239: unsigned numtimes = 0; mas01cr@239: mas01cr@239: if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) { mas01cr@239: dbH->flags=dbH->flags|O2_FLAG_TIMES; mas01cr@239: } else if(!(dbH->flags & O2_FLAG_TIMES)) { mas01cr@239: error("Timestamp file used with non-timestamped database", timesFileName); mas01cr@239: } mas01cr@239: mas01cr@239: if(!timesFile->is_open()) { mas01cr@239: error("problem opening times file on timestamped database", timesFileName); mas01cr@239: } mas01cr@239: mas01cr@239: double timepoint, next; mas01cr@239: *timesFile >> timepoint; mas01cr@239: if (timesFile->eof()) { mas01cr@239: error("no entries in times file", timesFileName); mas01cr@239: } mas01cr@239: numtimes++; mas01cr@239: do { mas01cr@239: *timesFile >> next; mas01cr@239: if (timesFile->eof()) { mas01cr@239: break; mas01cr@239: } mas01cr@239: numtimes++; mas01cr@239: timesdata[0] = timepoint; mas01cr@239: timepoint = (timesdata[1] = next); mas01cr@239: timesdata += 2; mas01cr@239: } while (numtimes < numVectors + 1); mas01cr@239: mas01cr@239: if (numtimes < numVectors + 1) { mas01cr@239: error("too few timepoints in times file", timesFileName); mas01cr@239: } mas01cr@239: mas01cr@239: *timesFile >> next; mas01cr@239: if (!timesFile->eof()) { mas01cr@239: error("too many timepoints in times file", timesFileName); mas01cr@239: } mas01cr@239: } mas01cr@239: mas01cr@239: void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { mas01mc@320: if(usingPower){ mas01cr@239: if (!(dbH->flags & O2_FLAG_POWER)) { mas01cr@239: error("Cannot insert power data on non-power DB", dbName); mas01cr@239: } mas01mc@320: mas01cr@239: int one; mas01cr@239: unsigned int count; mas01mc@320: mas01cr@239: count = read(powerfd, &one, sizeof(unsigned int)); mas01cr@239: if (count != sizeof(unsigned int)) { mas01cr@239: error("powerfd read failed", "int", "read"); mas01cr@239: } mas01cr@239: if (one != 1) { mas01cr@239: error("dimensionality of power file not 1", powerFileName); mas01cr@239: } mas01mc@320: mas01cr@239: // FIXME: should check that the powerfile is the right size for mas01cr@239: // this. -- CSR, 2007-10-30 mas01cr@239: count = read(powerfd, powerdata, numVectors * sizeof(double)); mas01cr@239: if (count != numVectors * sizeof(double)) { mas01cr@239: error("powerfd read failed", "double", "read"); mas01cr@239: } mas01cr@239: } mas01cr@239: } mas01cr@239: mas01cr@239: void audioDB::batchinsert(const char* dbName, const char* inFile) { mas01cr@239: mas01cr@239: forWrite = true; mas01cr@239: initDBHeader(dbName); mas01cr@239: mas01mc@316: // Treat large ADB instances differently mas01mc@316: if( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@316: batchinsert_large_adb(dbName, inFile) ; mas01mc@316: return; mas01mc@316: } mas01mc@316: mas01cr@239: if(!key) mas01cr@239: key=inFile; mas01cr@239: std::ifstream *filesIn = 0; mas01cr@239: std::ifstream *keysIn = 0; mas01cr@239: std::ifstream* thisTimesFile = 0; mas01cr@239: int thispowerfd = 0; mas01cr@239: mas01cr@239: if(!(filesIn = new std::ifstream(inFile))) mas01cr@239: error("Could not open batch in file", inFile); mas01cr@239: if(key && key!=inFile) mas01cr@239: if(!(keysIn = new std::ifstream(key))) mas01cr@239: error("Could not open batch key file",key); mas01cr@239: mas01cr@239: if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) mas01cr@239: error("Must use timestamps with timestamped database","use --times"); mas01cr@239: mas01cr@239: if(!usingPower && (dbH->flags & O2_FLAG_POWER)) mas01cr@239: error("Must use power with power-enabled database", dbName); mas01cr@239: mas01cr@239: unsigned totalVectors=0; mas01cr@239: char *thisFile = new char[MAXSTR]; mas01cr@262: char *thisKey = 0; mas01cr@262: if (key && (key != inFile)) { mas01cr@262: thisKey = new char[MAXSTR]; mas01cr@262: } mas01cr@239: char *thisTimesFileName = new char[MAXSTR]; mas01cr@239: char *thisPowerFileName = new char[MAXSTR]; mas01cr@302: mas01cr@302: std::set s; mas01cr@302: mas01cr@302: for (unsigned k = 0; k < dbH->numFiles; k++) { mas01cr@302: s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); mas01cr@302: } mas01cr@302: mas01cr@302: do { mas01cr@239: filesIn->getline(thisFile,MAXSTR); mas01cr@262: if(key && key!=inFile) { mas01cr@239: keysIn->getline(thisKey,MAXSTR); mas01cr@262: } else { mas01cr@239: thisKey = thisFile; mas01cr@262: } mas01cr@262: if(usingTimes) { mas01cr@262: timesFile->getline(thisTimesFileName,MAXSTR); mas01cr@262: } mas01cr@262: if(usingPower) { mas01cr@239: powerFile->getline(thisPowerFileName, MAXSTR); mas01cr@262: } mas01cr@239: mas01cr@262: if(filesIn->eof()) { mas01cr@239: break; mas01cr@262: } mas01cr@239: initInputFile(thisFile); mas01cr@239: mas01cr@251: if(!enough_per_file_space_free()) { mas01cr@251: error("batchinsert failed: no more room for metadata", thisFile); mas01cr@251: } mas01cr@251: mas01cr@239: if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { mas01cr@239: error("batchinsert failed: no more room in database", thisFile); mas01cr@239: } mas01cr@239: mas01cr@302: if(s.count(thisKey)) { mas01cr@239: VERB_LOG(0, "key already exists in database: %s\n", thisKey); mas01cr@239: } else { mas01cr@302: s.insert(thisKey); mas01cr@239: // Make a track index table of features to file indexes mas01cr@239: unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); mas01cr@239: if(!numVectors) { mas01cr@239: VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); mas01cr@239: } mas01cr@239: else{ mas01cr@239: if(usingTimes){ mas01cr@239: if(timesFile->eof()) { mas01cr@239: error("not enough timestamp files in timesList", timesFileName); mas01cr@239: } mas01cr@239: thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); mas01cr@239: if(!thisTimesFile->is_open()) { mas01cr@239: error("Cannot open timestamp file", thisTimesFileName); mas01cr@239: } mas01cr@239: off_t insertoffset = dbH->length; mas01cr@239: unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double)); mas01cr@239: double *timesdata = timesTable + 2*indexoffset; mas01cr@239: if(2*(indexoffset + numVectors) > timesTableLength) { mas01cr@239: error("out of space for times", key); mas01cr@239: } mas01cr@239: insertTimeStamps(numVectors, thisTimesFile, timesdata); mas01cr@239: if(thisTimesFile) mas01cr@239: delete thisTimesFile; mas01cr@239: } mas01cr@239: mas01cr@239: if (usingPower) { mas01cr@239: if(powerFile->eof()) { mas01cr@239: error("not enough power files in powerList", powerFileName); mas01cr@239: } mas01cr@239: thispowerfd = open(thisPowerFileName, O_RDONLY); mas01cr@239: if (thispowerfd < 0) { mas01cr@239: error("failed to open power file", thisPowerFileName); mas01cr@239: } mas01cr@239: off_t insertoffset = dbH->length; mas01cr@239: unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double)); mas01cr@239: double *powerdata = powerTable + poweroffset; mas01cr@239: insertPowerData(numVectors, thispowerfd, powerdata); mas01cr@239: if (0 < thispowerfd) { mas01cr@239: close(thispowerfd); mas01cr@239: } mas01cr@239: } mas01mc@316: mas01mc@316: INSERT_FILETABLE_STRING(fileTable, thisKey); mas01mc@316: mas01cr@239: off_t insertoffset = dbH->length;// Store current state mas01cr@239: mas01cr@239: // Increment file count mas01cr@239: dbH->numFiles++; mas01cr@239: mas01cr@239: // Update Header information mas01cr@239: dbH->length+=(statbuf.st_size-sizeof(int)); mas01cr@239: mas01cr@239: // Update track to file index map mas01cr@239: memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); mas01mc@316: mas01cr@239: insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); mas01cr@239: mas01cr@239: // Norm the vectors on input if the database is already L2 normed mas01cr@239: if(dbH->flags & O2_FLAG_L2NORM) mas01cr@239: unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append mas01cr@239: mas01cr@239: totalVectors+=numVectors; mas01cr@239: mas01cr@239: // Copy the header back to the database mas01cr@239: memcpy (db, dbH, sizeof(dbTableHeaderT)); mas01cr@239: } mas01cr@239: } mas01cr@239: // CLEAN UP mas01cr@239: munmap(indata,statbuf.st_size); mas01cr@239: close(infid); mas01cr@239: } while(!filesIn->eof()); mas01cr@239: mas01cr@239: VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); mas01cr@262: mas01cr@262: delete [] thisPowerFileName; mas01cr@262: if(key && (key != inFile)) { mas01cr@262: delete [] thisKey; mas01cr@262: } mas01cr@262: delete [] thisFile; mas01cr@262: delete [] thisTimesFileName; mas01cr@239: mas01cr@262: delete filesIn; mas01cr@262: delete keysIn; mas01cr@262: mas01cr@239: // Report status mas01cr@239: status(dbName); mas01cr@239: } mas01mc@316: mas01mc@316: mas01mc@316: // BATCHINSERT_LARGE_ADB mas01mc@316: // mas01mc@316: // This method inserts file pointers into the ADB instance rather than the actual feature data mas01mc@316: // mas01mc@316: // This method is intended for databases that are large enough to only support indexed query mas01mc@316: // So exhaustive searching across all feature vectors will not be performed mas01mc@316: // mas01mc@316: // We insert featureFileName, [powerFileName], [timesFileName] mas01mc@316: // mas01mc@316: // l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time mas01mc@316: // mas01mc@316: // LIMITS: mas01mc@316: // mas01mc@316: // We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles mas01mc@316: // mas01mc@316: void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) { mas01mc@316: mas01mc@316: if(!key) mas01mc@316: key=inFile; mas01mc@316: std::ifstream *filesIn = 0; mas01mc@316: std::ifstream *keysIn = 0; mas01mc@316: std::ifstream* thisTimesFile = 0; mas01mc@316: int thispowerfd = 0; mas01mc@316: mas01mc@316: if(!(filesIn = new std::ifstream(inFile))) mas01mc@316: error("Could not open batch in file", inFile); mas01mc@316: if(key && key!=inFile) mas01mc@316: if(!(keysIn = new std::ifstream(key))) mas01mc@316: error("Could not open batch key file",key); mas01mc@316: mas01mc@316: if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) mas01mc@316: error("Must use timestamps with timestamped database","use --times"); mas01mc@316: mas01mc@316: if(!usingPower && (dbH->flags & O2_FLAG_POWER)) mas01mc@316: error("Must use power with power-enabled database", dbName); mas01mc@316: mas01mc@316: unsigned totalVectors=0; mas01mc@316: char *thisFile = new char[MAXSTR]; mas01mc@316: char *thisKey = 0; mas01mc@316: if (key && (key != inFile)) { mas01mc@316: thisKey = new char[MAXSTR]; mas01mc@316: } mas01mc@316: char *thisTimesFileName = new char[MAXSTR]; mas01mc@316: char *thisPowerFileName = new char[MAXSTR]; mas01mc@316: mas01mc@316: std::set s; mas01mc@316: mas01mc@316: for (unsigned k = 0; k < dbH->numFiles; k++) { mas01mc@316: s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); mas01mc@316: } mas01mc@316: mas01mc@316: do { mas01mc@316: filesIn->getline(thisFile,MAXSTR); mas01mc@316: if(key && key!=inFile) { mas01mc@316: keysIn->getline(thisKey,MAXSTR); mas01mc@316: } else { mas01mc@316: thisKey = thisFile; mas01mc@316: } mas01mc@316: if(usingTimes) { mas01mc@316: timesFile->getline(thisTimesFileName,MAXSTR); mas01mc@316: } mas01mc@316: if(usingPower) { mas01mc@316: powerFile->getline(thisPowerFileName, MAXSTR); mas01mc@316: } mas01mc@316: mas01mc@316: if(filesIn->eof()) { mas01mc@316: break; mas01mc@316: } mas01mc@316: mas01mc@316: initInputFile(thisFile, false); mas01mc@316: mas01mc@316: if(!enough_per_file_space_free()) { mas01mc@316: error("batchinsert failed: no more room for metadata", thisFile); mas01mc@316: } mas01mc@316: mas01mc@316: if(s.count(thisKey)) { mas01mc@316: VERB_LOG(0, "key already exists in database: %s\n", thisKey); mas01mc@316: } else { mas01mc@316: s.insert(thisKey); mas01mc@316: // Make a track index table of features to file indexes mas01mc@316: unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); mas01mc@316: if(!numVectors) { mas01mc@316: VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); mas01mc@316: } mas01mc@316: else{ mas01mc@316: // Check that time-stamp file exists mas01mc@316: if(usingTimes){ mas01mc@316: if(timesFile->eof()) { mas01mc@316: error("not enough timestamp files in timesList", timesFileName); mas01mc@316: } mas01mc@316: thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); mas01mc@316: if(!thisTimesFile->is_open()) { mas01mc@316: error("Cannot open timestamp file", thisTimesFileName); mas01mc@316: } mas01mc@316: if(thisTimesFile) mas01mc@316: delete thisTimesFile; mas01mc@316: } mas01mc@316: mas01mc@316: // Check that power file exists mas01mc@316: if (usingPower) { mas01mc@316: if(powerFile->eof()) { mas01mc@316: error("not enough power files in powerList", powerFileName); mas01mc@316: } mas01mc@316: thispowerfd = open(thisPowerFileName, O_RDONLY); mas01mc@316: if (thispowerfd < 0) { mas01mc@316: error("failed to open power file", thisPowerFileName); mas01mc@316: } mas01mc@316: if (0 < thispowerfd) { mas01mc@316: close(thispowerfd); mas01mc@316: } mas01mc@316: } mas01mc@316: mas01mc@316: // persist links to the feature files for reading from filesystem later mas01mc@316: mas01mc@316: // Primary Keys mas01mc@316: INSERT_FILETABLE_STRING(fileTable, thisKey); mas01mc@316: mas01mc@316: // Feature Vector fileNames mas01mc@318: INSERT_FILETABLE_STRING(featureFileNameTable, thisFile); mas01mc@316: mas01mc@316: // Time Stamp fileNames mas01mc@316: if(usingTimes) mas01mc@318: INSERT_FILETABLE_STRING(timesFileNameTable, thisTimesFileName); mas01mc@316: mas01mc@316: mas01mc@316: // Power fileNames mas01mc@316: if(usingPower) mas01mc@318: INSERT_FILETABLE_STRING(powerFileNameTable, thisPowerFileName); mas01mc@316: mas01mc@316: // Increment file count mas01mc@316: dbH->numFiles++; mas01mc@316: mas01mc@316: // Update Header information mas01mc@316: dbH->length+=(statbuf.st_size-sizeof(int)); mas01mc@316: mas01mc@316: // Update track to file index map mas01mc@316: memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); mas01mc@316: mas01mc@316: totalVectors+=numVectors; mas01mc@316: mas01mc@316: // Copy the header back to the database mas01mc@316: memcpy (db, dbH, sizeof(dbTableHeaderT)); mas01mc@316: } mas01mc@316: } mas01mc@316: // CLEAN UP mas01mc@321: if(indata) mas01mc@321: munmap(indata,statbuf.st_size); mas01mc@321: if(infid>0) mas01mc@321: close(infid); mas01mc@316: } while(!filesIn->eof()); mas01mc@316: mas01mc@316: VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); mas01mc@316: mas01mc@316: delete [] thisPowerFileName; mas01mc@316: if(key && (key != inFile)) { mas01mc@316: delete [] thisKey; mas01mc@316: } mas01mc@316: delete [] thisFile; mas01mc@316: delete [] thisTimesFileName; mas01mc@316: mas01mc@316: delete filesIn; mas01mc@316: delete keysIn; mas01mc@316: mas01mc@316: // Report status mas01mc@316: status(dbName); mas01mc@316: }