# HG changeset patch # User mas01cr # Date 1195816095 0 # Node ID 8c81cacf5aabe7341b4b0e9e4b6467a6ef9e0493 # Parent 0e75deb7d4d1e8262fa58d318c0fb8993f8d977a Merge -r228:254 from no-big-mmap branch. Although the last log message from that branch only mentioned working create and status (-N and -S), it turned out that I seemed to have done everything right for dump and search on huge DBs to work too. Additionally: * bump the DB format version; * CHECKED_MMAP() for the powerTable; * move the powerTable above the timesTable, so that all the code everywhere which computes the length of the data buffer assuming that the timesTable is the next thing on the disk still works. diff -r 0e75deb7d4d1 -r 8c81cacf5aab audioDB.cpp --- a/audioDB.cpp Wed Nov 21 11:38:19 2007 +0000 +++ b/audioDB.cpp Fri Nov 23 11:08:15 2007 +0000 @@ -113,7 +113,18 @@ if(indata) munmap(indata,statbuf.st_size); if(db) - munmap(db,dbH->dbSize); + munmap(db,getpagesize()); + if(fileTable) + munmap(fileTable, fileTableLength); + if(trackTable) + munmap(trackTable, trackTableLength); + if(dataBuf) + munmap(dataBuf, dataBufLength); + if(timesTable) + munmap(timesTable, timesTableLength); + if(l2normTable) + munmap(l2normTable, l2normTableLength); + if(dbfid>0) close(dbfid); if(infid>0) @@ -156,10 +167,10 @@ } if(args_info.size_given) { - if (args_info.size_arg < 50 || args_info.size_arg > 4000) { + if (args_info.size_arg < 50 || args_info.size_arg > 32000) { error("Size out of range", ""); } - size = args_info.size_arg * 1000000; + size = (off_t) args_info.size_arg * 1000000; } if(args_info.radius_given){ @@ -427,21 +438,9 @@ error("Can't create database file", dbName, "open"); get_lock(dbfid, 1); - // go to the location corresponding to the last byte - if (lseek (dbfid, size - 1, SEEK_SET) == -1) - error("lseek error in db file", "", "lseek"); - - // write a dummy byte at the last location - if (write (dbfid, "", 1) != 1) - error("write error", "", "write"); - - // mmap the output file if(verbosity) { cerr << "header size:" << O2_HEADERSIZE << endl; } - if ((db = (char*) mmap(0, size, PROT_READ | PROT_WRITE, - MAP_SHARED, dbfid, 0)) == (caddr_t) -1) - error("mmap error for creating database", "", "mmap"); dbH = new dbTableHeaderT(); assert(dbH); @@ -455,26 +454,34 @@ dbH->dim = 0; dbH->flags = 0; dbH->length = 0; - dbH->fileTableOffset = ALIGN_UP(O2_HEADERSIZE, 8); - dbH->trackTableOffset = ALIGN_UP(dbH->fileTableOffset + O2_FILETABLESIZE*maxfiles, 8); - dbH->dataOffset = ALIGN_UP(dbH->trackTableOffset + O2_TRACKTABLESIZE*maxfiles, 8); - dbH->l2normTableOffset = ALIGN_DOWN(size - maxfiles*O2_MEANNUMVECTORS*sizeof(double), 8); - dbH->timesTableOffset = ALIGN_DOWN(dbH->l2normTableOffset - maxfiles*O2_MEANNUMVECTORS*sizeof(double), 8); - dbH->powerTableOffset = ALIGN_DOWN(dbH->timesTableOffset - maxfiles*O2_MEANNUMVECTORS*sizeof(double), 8); + dbH->fileTableOffset = ALIGN_PAGE_UP(O2_HEADERSIZE); + dbH->trackTableOffset = ALIGN_PAGE_UP(dbH->fileTableOffset + O2_FILETABLESIZE*maxfiles); + dbH->dataOffset = ALIGN_PAGE_UP(dbH->trackTableOffset + O2_TRACKTABLESIZE*maxfiles); + dbH->l2normTableOffset = ALIGN_PAGE_DOWN(size - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); + dbH->powerTableOffset = ALIGN_PAGE_DOWN(dbH->l2normTableOffset - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); + dbH->timesTableOffset = ALIGN_PAGE_DOWN(dbH->powerTableOffset - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); dbH->dbSize = size; - memcpy (db, dbH, O2_HEADERSIZE); + write(dbfid, dbH, O2_HEADERSIZE); + + // go to the location corresponding to the last byte + if (lseek (dbfid, size - 1, SEEK_SET) == -1) + error("lseek error in db file", "", "lseek"); + + // write a dummy byte at the last location + if (write (dbfid, "", 1) != 1) + error("write error", "", "write"); + if(verbosity) { cerr << COM_CREATE << " " << dbName << endl; } } - void audioDB::drop(){ // FIXME: drop something? Should we even allow this? } -void audioDB::initDBHeader(const char* dbName, bool forWrite) { +void audioDB::initDBHeader(const char* dbName) { if ((dbfid = open(dbName, forWrite ? O_RDWR : O_RDONLY)) < 0) { error("Can't open database file", dbName, "open"); } @@ -503,18 +510,49 @@ error("database file has incorect version", dbName); } - // mmap the database file - if ((db = (char*) mmap(0, dbH->dbSize, PROT_READ | (forWrite ? PROT_WRITE : 0), - MAP_SHARED, dbfid, 0)) == (caddr_t) -1) - error("mmap error for initting tables of database", "", "mmap"); +#define CHECKED_MMAP(type, var, start, length) \ + { void *tmp = mmap(0, length, (PROT_READ | (forWrite ? PROT_WRITE : 0)), MAP_SHARED, dbfid, (start)); \ + if(tmp == (void *) -1) { \ + error("mmap error for db table", #var, "mmap"); \ + } \ + var = (type) tmp; \ + } + + CHECKED_MMAP(char *, db, 0, getpagesize()); // Make some handy tables with correct types - fileTable = (char *) (db + dbH->fileTableOffset); - trackTable = (unsigned *) (db + dbH->trackTableOffset); - dataBuf = (double *) (db + dbH->dataOffset); - l2normTable = (double *) (db + dbH->l2normTableOffset); - timesTable = (double *) (db + dbH->timesTableOffset); - powerTable = (double *) (db + dbH->powerTableOffset); + if(forWrite || (dbH->length > 0)) { + if(forWrite) { + fileTableLength = dbH->trackTableOffset - dbH->fileTableOffset; + trackTableLength = dbH->dataOffset - dbH->trackTableOffset; + dataBufLength = dbH->timesTableOffset - dbH->dataOffset; + timesTableLength = dbH->powerTableOffset - dbH->timesTableOffset; + powerTableLength = dbH->l2normTableOffset - dbH->powerTableOffset; + l2normTableLength = dbH->dbSize - dbH->l2normTableOffset; + } else { + fileTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLESIZE); + trackTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_TRACKTABLESIZE); + dataBufLength = ALIGN_PAGE_UP(dbH->length); + timesTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + } + CHECKED_MMAP(char *, fileTable, dbH->fileTableOffset, fileTableLength); + CHECKED_MMAP(unsigned *, trackTable, dbH->trackTableOffset, trackTableLength); + /* + * No more mmap() for dataBuf + * + * FIXME: Actually we do do the mmap() in the two cases where it's + * still "needed": in pointQuery and in l2norm if dbH->length is + * non-zero. Removing those cases too (and deleting the dataBuf + * variable completely) would be cool. -- CSR, 2007-11-19 + * + * CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); + */ + CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength); + CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength); + CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength); + } } void audioDB::initInputFile (const char *inFile) { @@ -555,15 +593,23 @@ } } -void audioDB::initTables(const char* dbName, bool forWrite, const char* inFile = 0) { - - initDBHeader(dbName, forWrite); +void audioDB::initTables(const char* dbName, const char* inFile = 0) { + initDBHeader(dbName); initInputFile(inFile); } -void audioDB::insert(const char* dbName, const char* inFile){ +bool audioDB::enough_data_space_free(off_t size) { + return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); +} - initTables(dbName, 1, inFile); +void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { + lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); + write(dbfid, buffer, size); +} + +void audioDB::insert(const char* dbName, const char* inFile) { + forWrite = true; + initTables(dbName, inFile); if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) error("Must use timestamps with timestamped database","use --times"); @@ -571,10 +617,10 @@ if(!usingPower && (dbH->flags & O2_FLAG_POWER)) error("Must use power with power-enabled database", dbName); - // Check that there is room for at least 1 more file - if((char*)timesTable<((char*)dataBuf+dbH->length+statbuf.st_size-sizeof(int))) + if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { error("Insert failed: no more room in database", inFile); - + } + if(!key) key=inFile; // Linear scan of filenames check for pre-existing feature @@ -607,12 +653,16 @@ strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key)); - unsigned insertoffset = dbH->length;// Store current state + off_t insertoffset = dbH->length;// Store current state // Check times status and insert times from file unsigned timesoffset=insertoffset/(dbH->dim*sizeof(double)); double* timesdata=timesTable+timesoffset; - assert(timesdata+numVectors timesTableLength) { + error("out of space for times", key); + } + insertTimeStamps(numVectors, timesFile, timesdata); double *powerdata = powerTable + timesoffset; @@ -624,19 +674,14 @@ // Update Header information dbH->length+=(statbuf.st_size-sizeof(int)); - // Copy the header back to the database - memcpy (db, dbH, sizeof(dbTableHeaderT)); - // Update track to file index map - //memcpy (db+trackTableOffset+(dbH->numFiles-1)*sizeof(unsigned), &numVectors, sizeof(unsigned)); memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); - // Update the feature database - memcpy (db+dbH->dataOffset+insertoffset, indata+sizeof(int), statbuf.st_size-sizeof(int)); + insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); // Norm the vectors on input if the database is already L2 normed if(dbH->flags & O2_FLAG_L2NORM) - unitNormAndInsertL2((double*)(db+dbH->dataOffset+insertoffset), dbH->dim, numVectors, 1); // append + unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append // Report status status(dbName); @@ -645,6 +690,9 @@ << (statbuf.st_size-sizeof(int)) << " bytes." << endl; } + // Copy the header back to the database + memcpy (db, dbH, sizeof(dbTableHeaderT)); + // CLEAN UP munmap(indata,statbuf.st_size); close(infid); @@ -731,7 +779,8 @@ void audioDB::batchinsert(const char* dbName, const char* inFile) { - initDBHeader(dbName, true); + forWrite = true; + initDBHeader(dbName); if(!key) key=inFile; @@ -774,9 +823,9 @@ initInputFile(thisFile); - // Check that there is room for at least 1 more file - if((char*)timesTable<((char*)dataBuf+(dbH->length+statbuf.st_size-sizeof(int)))) + if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { error("batchinsert failed: no more room in database", thisFile); + } // Linear scan of filenames check for pre-existing feature unsigned alreadyInserted=0; @@ -808,10 +857,12 @@ thisTimesFile=new ifstream(thisTimesFileName,ios::in); if(!thisTimesFile->is_open()) error("Cannot open timestamp file",thisTimesFileName); - unsigned insertoffset=dbH->length; + off_t insertoffset=dbH->length; unsigned timesoffset=insertoffset/(dbH->dim*sizeof(double)); double* timesdata=timesTable+timesoffset; - assert(timesdata+numVectors timesTableLength) { + error("out of space for times", key); + } insertTimeStamps(numVectors,thisTimesFile,timesdata); if(thisTimesFile) delete thisTimesFile; @@ -835,28 +886,27 @@ } strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey)); - unsigned insertoffset = dbH->length;// Store current state + off_t insertoffset = dbH->length;// Store current state // Increment file count dbH->numFiles++; // Update Header information dbH->length+=(statbuf.st_size-sizeof(int)); - // Copy the header back to the database - memcpy (db, dbH, sizeof(dbTableHeaderT)); // Update track to file index map - //memcpy (db+trackTableOffset+(dbH->numFiles-1)*sizeof(unsigned), &numVectors, sizeof(unsigned)); memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); - // Update the feature database - memcpy (db+dbH->dataOffset+insertoffset, indata+sizeof(int), statbuf.st_size-sizeof(int)); + insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); // Norm the vectors on input if the database is already L2 normed if(dbH->flags & O2_FLAG_L2NORM) - unitNormAndInsertL2((double*)(db+dbH->dataOffset+insertoffset), dbH->dim, numVectors, 1); // append + unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append totalVectors+=numVectors; + + // Copy the header back to the database + memcpy (db, dbH, sizeof(dbTableHeaderT)); } } // CLEAN UP @@ -923,7 +973,7 @@ void audioDB::status(const char* dbName, adb__statusResponse *adbStatusResponse){ if(!dbH) - initTables(dbName, 0, 0); + initTables(dbName, 0); unsigned dudCount=0; unsigned nullCount=0; @@ -962,7 +1012,7 @@ void audioDB::dump(const char* dbName){ if(!dbH) { - initTables(dbName, 0, 0); + initTables(dbName, 0); } if((mkdir(output, S_IRWXU|S_IRWXG|S_IRWXO)) < 0) { @@ -1018,6 +1068,9 @@ int ffd, pfd; FILE *tFile; unsigned pos = 0; + lseek(dbfid, dbH->dataOffset, SEEK_SET); + double *data_buffer; + size_t data_buffer_size; for(unsigned k = 0; k < dbH->numFiles; k++) { fprintf(kLFile, "%s\n", fileTable + k*O2_FILETABLESIZE); snprintf(fName, 256, "%05d.features", k); @@ -1027,10 +1080,29 @@ if ((write(ffd, &dbH->dim, sizeof(uint32_t))) < 0) { error("error writing dimensions", fName, "write"); } - - if ((write(ffd, dataBuf + pos * dbH->dim, trackTable[k] * dbH->dim * sizeof(double))) < 0) { + + /* FIXME: this repeated malloc()/free() of data buffers is + inefficient. */ + data_buffer_size = trackTable[k] * dbH->dim * sizeof(double); + + { + void *tmp = malloc(data_buffer_size); + if (tmp == NULL) { + error("error allocating data buffer"); + } + data_buffer = (double *) tmp; + } + + if ((read(dbfid, data_buffer, data_buffer_size)) != (ssize_t) data_buffer_size) { + error("error reading data", fName, "read"); + } + + if ((write(ffd, data_buffer, data_buffer_size)) < 0) { error("error writing data", fName, "write"); } + + free(data_buffer); + fprintf(fLFile, "%s\n", fName); close(ffd); @@ -1078,7 +1150,7 @@ \n\ if [ -z \"${AUDIODB}\" ]; then echo set AUDIODB variable; exit 1; fi\n\ if [ -z \"$1\" ]; then echo usage: $0 newdb; exit 1; fi\n\n\ -\"${AUDIODB}\" -d \"$1\" -N --size=%d\n", dbH->dbSize / 1000000); +\"${AUDIODB}\" -d \"$1\" -N --size=%d\n", (int) (dbH->dbSize / 1000000)); if(dbH->flags & O2_FLAG_L2NORM) { fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -L\n"); } @@ -1112,10 +1184,13 @@ status(dbName); } -void audioDB::l2norm(const char* dbName){ - initTables(dbName, true, 0); +void audioDB::l2norm(const char* dbName) { + forWrite = true; + initTables(dbName, 0); if(dbH->length>0){ + /* FIXME: should probably be uint64_t */ unsigned numVectors = dbH->length/(sizeof(double)*dbH->dim); + CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); unitNormAndInsertL2(dataBuf, dbH->dim, numVectors, 0); // No append } // Update database flags @@ -1124,7 +1199,8 @@ } void audioDB::power_flag(const char *dbName) { - initTables(dbName, true, 0); + forWrite = true; + initTables(dbName, 0); if (dbH->length > 0) { error("cannot turn on power storage for non-empty database", dbName); } @@ -1146,7 +1222,6 @@ return true; } - void audioDB::query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ switch(queryType){ case O2_POINT_QUERY: @@ -1177,15 +1252,16 @@ } // Basic point query engine -void audioDB::pointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ +void audioDB::pointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { - initTables(dbName, 0, inFile); + initTables(dbName, inFile); // For each input vector, find the closest pointNN matching output vectors and report // we use stdout in this stub version unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - + double* query = (double*)(indata+sizeof(int)); + CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); double* data = dataBuf; double* queryCopy = 0; @@ -1360,13 +1436,13 @@ // trackPointQuery // return the trackNN closest tracks to the query track // uses average of pointNN points per track -void audioDB::trackPointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ - initTables(dbName, 0, inFile); +void audioDB::trackPointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { + initTables(dbName, inFile); // For each input vector, find the closest pointNN matching output vectors and report unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); double* query = (double*)(indata+sizeof(int)); - double* data = dataBuf; + double* data; double* queryCopy = 0; if( dbH->flags & O2_FLAG_L2NORM ){ @@ -1451,9 +1527,9 @@ } // build track offset table - unsigned *trackOffsetTable = new unsigned[dbH->numFiles]; + off_t *trackOffsetTable = new off_t[dbH->numFiles]; unsigned cumTrack=0; - unsigned trackIndexOffset; + off_t trackIndexOffset; for(k=0; knumFiles;k++){ trackOffsetTable[k]=cumTrack; cumTrack+=trackTable[k]*dbH->dim; @@ -1462,18 +1538,29 @@ char nextKey[MAXSTR]; gettimeofday(&tv1, NULL); + + size_t data_buffer_size = 0; + double *data_buffer = 0; + lseek(dbfid, dbH->dataOffset, SEEK_SET); for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ - if(trackFile){ - if(!trackFile->eof()){ - trackFile->getline(nextKey,MAXSTR); - track=getKeyPos(nextKey); + + trackOffset = trackOffsetTable[track]; // numDoubles offset + + // get trackID from file if using a control file + if(trackFile) { + trackFile->getline(nextKey,MAXSTR); + if(!trackFile->eof()) { + track = getKeyPos(nextKey); + trackOffset = trackOffsetTable[track]; + lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); + } else { + break; } - else - break; } - trackOffset=trackOffsetTable[track]; // numDoubles offset + trackIndexOffset=trackOffset/dbH->dim; // numVectors offset + if(verbosity>7) { cerr << track << "." << trackOffset/(dbH->dim) << "." << trackTable[track] << " | ";cerr.flush(); } @@ -1486,9 +1573,26 @@ j=1; else j=numVectors; + + if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { + if(data_buffer) { + free(data_buffer); + } + { + data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; + void *tmp = malloc(data_buffer_size); + if (tmp == NULL) { + error("error allocating data buffer"); + } + data_buffer = (double *) tmp; + } + } + + read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); + while(j--){ k=trackTable[track]; // number of vectors in track - data=dataBuf+trackOffset; // data for track + data=data_buffer; // data for track while(k--){ thisDist=0; l=dbH->dim; @@ -1558,6 +1662,9 @@ sIndexes[k]=~0; } } // tracks + + free(data_buffer); + gettimeofday(&tv2, NULL); if(verbosity>1) { @@ -1656,7 +1763,7 @@ // outputs distances of retrieved shingles, max retreived = pointNN shingles per per track void audioDB::trackSequenceQueryNN(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ - initTables(dbName, 0, inFile); + initTables(dbName, inFile); // For each input vector, find the closest pointNN matching output vectors and report // we use stdout in this stub version @@ -1848,9 +1955,9 @@ double* dp; // build track offset table - unsigned *trackOffsetTable = new unsigned[dbH->numFiles]; + off_t *trackOffsetTable = new off_t[dbH->numFiles]; unsigned cumTrack=0; - unsigned trackIndexOffset; + off_t trackIndexOffset; for(k=0; knumFiles;k++){ trackOffsetTable[k]=cumTrack; cumTrack+=trackTable[k]*dbH->dim; @@ -1866,19 +1973,26 @@ double maxSample = 0; // Track loop - for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ + size_t data_buffer_size = 0; + double *data_buffer = 0; + lseek(dbfid, dbH->dataOffset, SEEK_SET); + + for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++) { + + trackOffset = trackOffsetTable[track]; // numDoubles offset // get trackID from file if using a control file - if(trackFile){ - if(!trackFile->eof()){ - trackFile->getline(nextKey,MAXSTR); - track=getKeyPos(nextKey); + if(trackFile) { + trackFile->getline(nextKey,MAXSTR); + if(!trackFile->eof()) { + track = getKeyPos(nextKey); + trackOffset = trackOffsetTable[track]; + lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); + } else { + break; } - else - break; } - trackOffset=trackOffsetTable[track]; // numDoubles offset trackIndexOffset=trackOffset/dbH->dim; // numVectors offset if(sequenceLength<=trackTable[track]){ // test for short sequences @@ -1900,11 +2014,27 @@ assert(DD[j]); } + if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { + if(data_buffer) { + free(data_buffer); + } + { + data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; + void *tmp = malloc(data_buffer_size); + if (tmp == NULL) { + error("error allocating data buffer"); + } + data_buffer = (double *) tmp; + } + } + + read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); + // Dot product for(j=0; jdim; - sp=dataBuf+trackOffset+k*dbH->dim; + sp=data_buffer+k*dbH->dim; DD[j][k]=0.0; // Initialize matched filter array dp=&D[j][k]; // point to correlation cell j,k *dp=0.0; // initialize correlation cell @@ -2059,6 +2189,8 @@ } } + free(data_buffer); + gettimeofday(&tv2,NULL); if(verbosity>1) { cerr << endl << "processed tracks :" << processedTracks << " matched tracks: " << successfulTracks << " elapsed time:" @@ -2124,7 +2256,7 @@ // outputs count of retrieved shingles, max retreived = one shingle per query shingle per track void audioDB::trackSequenceQueryRad(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ - initTables(dbName, 0, inFile); + initTables(dbName, inFile); // For each input vector, find the closest pointNN matching output vectors and report // we use stdout in this stub version @@ -2312,9 +2444,9 @@ double* dp; // build track offset table - unsigned *trackOffsetTable = new unsigned[dbH->numFiles]; + off_t *trackOffsetTable = new off_t[dbH->numFiles]; unsigned cumTrack=0; - unsigned trackIndexOffset; + off_t trackIndexOffset; for(k=0; knumFiles;k++){ trackOffsetTable[k]=cumTrack; cumTrack+=trackTable[k]*dbH->dim; @@ -2330,19 +2462,26 @@ double maxSample = 0; // Track loop + size_t data_buffer_size = 0; + double *data_buffer = 0; + lseek(dbfid, dbH->dataOffset, SEEK_SET); + for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ + trackOffset = trackOffsetTable[track]; // numDoubles offset + // get trackID from file if using a control file - if(trackFile){ - if(!trackFile->eof()){ - trackFile->getline(nextKey,MAXSTR); - track=getKeyPos(nextKey); + if(trackFile) { + trackFile->getline(nextKey,MAXSTR); + if(!trackFile->eof()) { + track = getKeyPos(nextKey); + trackOffset = trackOffsetTable[track]; + lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); + } else { + break; } - else - break; } - trackOffset=trackOffsetTable[track]; // numDoubles offset trackIndexOffset=trackOffset/dbH->dim; // numVectors offset if(sequenceLength<=trackTable[track]){ // test for short sequences @@ -2364,11 +2503,27 @@ assert(DD[j]); } + if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { + if(data_buffer) { + free(data_buffer); + } + { + data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; + void *tmp = malloc(data_buffer_size); + if (tmp == NULL) { + error("error allocating data buffer"); + } + data_buffer = (double *) tmp; + } + } + + read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); + // Dot product for(j=0; jdim; - sp=dataBuf+trackOffset+k*dbH->dim; + sp=data_buffer+k*dbH->dim; DD[j][k]=0.0; // Initialize matched filter array dp=&D[j][k]; // point to correlation cell j,k *dp=0.0; // initialize correlation cell @@ -2376,7 +2531,7 @@ while(l--) *dp+=*qp++**sp++; } - + // Matched Filter // HOP SIZE == 1 double* spd; @@ -2501,6 +2656,8 @@ } } + free(data_buffer); + gettimeofday(&tv2,NULL); if(verbosity>1) { cerr << endl << "processed tracks :" << processedTracks << " matched tracks: " << successfulTracks << " elapsed time:" @@ -2621,14 +2778,6 @@ p++; } l2ptr++; - /* - oneOverL2 = 1.0/(*l2ptr++); - d=dim; - while(d--){ - *X*=oneOverL2; - X++; - } - */ X+=dim; } unsigned offset; diff -r 0e75deb7d4d1 -r 8c81cacf5aab audioDB.h --- a/audioDB.h Wed Nov 21 11:38:19 2007 +0000 +++ b/audioDB.h Fri Nov 23 11:08:15 2007 +0000 @@ -50,7 +50,7 @@ #define O2_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24) #define O2_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24) -#define O2_FORMAT_VERSION (1U) +#define O2_FORMAT_VERSION (2U) #define O2_DEFAULT_POINTNN (10U) #define O2_DEFAULT_TRACKNN (10U) @@ -88,6 +88,9 @@ #define ALIGN_UP(x,w) ((x) + ((1< testfeature01 +floatstring 0 1 >> testfeature01 +intstring 2 > testfeature10 +floatstring 1 0 >> testfeature10 + +${AUDIODB} -d testdb -I -f testfeature01 +${AUDIODB} -d testdb -I -f testfeature10 + +# sequence queries require L2NORM +${AUDIODB} -d testdb -L + +echo "query point (0.0,0.5)" +intstring 2 > testquery +floatstring 0 0.5 >> testquery + +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery > testoutput +echo testfeature01 0 0 0 > test-expected-output +echo testfeature10 2 0 0 >> test-expected-output +cmp testoutput test-expected-output +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K /dev/null > testoutput +cat /dev/null > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature01 0 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature10 2 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt -r 1 > testoutput +echo testfeature10 2 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo "query point (0.5,0.0)" +intstring 2 > testquery +floatstring 0.5 0 >> testquery + +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery > testoutput +echo testfeature10 0 0 0 > test-expected-output +echo testfeature01 2 0 0 >> test-expected-output +cmp testoutput test-expected-output +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K /dev/null > testoutput +cat /dev/null > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature10 0 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature01 2 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt -r 1 > testoutput +echo testfeature01 2 0 0 > test-expected-output +cmp testoutput test-expected-output + +exit 104 diff -r 0e75deb7d4d1 -r 8c81cacf5aab tests/0031/short-description --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0031/short-description Fri Nov 23 11:08:15 2007 +0000 @@ -0,0 +1,1 @@ +0008 with -K restriction \ No newline at end of file diff -r 0e75deb7d4d1 -r 8c81cacf5aab tests/0032/run-test.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0032/run-test.sh Fri Nov 23 11:08:15 2007 +0000 @@ -0,0 +1,71 @@ +#! /bin/sh + +. ../test-utils.sh + +if [ -f testdb ]; then rm -f testdb; fi + +${AUDIODB} -d testdb -N + +intstring 2 > testfeature01 +floatstring 0 1 >> testfeature01 +intstring 2 > testfeature10 +floatstring 1 0 >> testfeature10 + +${AUDIODB} -d testdb -I -f testfeature01 +${AUDIODB} -d testdb -I -f testfeature10 + +echo "query point (0.0,0.5)" +intstring 2 > testquery +floatstring 0 0.5 >> testquery + +${AUDIODB} -d testdb -Q track -l 1 -f testquery > testoutput +echo testfeature01 0.5 0 0 > test-expected-output +echo testfeature10 0 0 0 >> test-expected-output +cmp testoutput test-expected-output +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K /dev/null > testoutput +cat /dev/null > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature01 0.5 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature10 0 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K testkl.txt -r 1 > testoutput +echo testfeature10 0 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo "query point (0.5,0.0)" +intstring 2 > testquery +floatstring 0.5 0 >> testquery + +${AUDIODB} -d testdb -Q track -l 1 -f testquery > testoutput +echo testfeature10 0.5 0 0 > test-expected-output +echo testfeature01 0 0 0 >> test-expected-output +cmp testoutput test-expected-output +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K /dev/null > testoutput +cat /dev/null > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature10 0.5 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K testkl.txt > testoutput +echo testfeature01 0 0 0 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q track -l 1 -f testquery -K testkl.txt -r 1 > testoutput +echo testfeature01 0 0 0 > test-expected-output +cmp testoutput test-expected-output + +exit 104 diff -r 0e75deb7d4d1 -r 8c81cacf5aab tests/0032/short-description --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0032/short-description Fri Nov 23 11:08:15 2007 +0000 @@ -0,0 +1,1 @@ +0009 with -K restriction \ No newline at end of file diff -r 0e75deb7d4d1 -r 8c81cacf5aab tests/0033/run-test.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0033/run-test.sh Fri Nov 23 11:08:15 2007 +0000 @@ -0,0 +1,68 @@ +#! /bin/sh + +. ../test-utils.sh + +if [ -f testdb ]; then rm -f testdb; fi + +${AUDIODB} -d testdb -N + +intstring 2 > testfeature01 +floatstring 0 1 >> testfeature01 +intstring 2 > testfeature10 +floatstring 1 0 >> testfeature10 + +${AUDIODB} -d testdb -I -f testfeature01 +${AUDIODB} -d testdb -I -f testfeature10 + +# sequence queries require L2NORM +${AUDIODB} -d testdb -L + +echo "query point (0.0,0.5)" +intstring 2 > testquery +floatstring 0 0.5 >> testquery + +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -R 5 > testoutput +echo testfeature01 1 > test-expected-output +echo testfeature10 1 >> test-expected-output +cmp testoutput test-expected-output +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K /dev/null -R 5 > testoutput +cat /dev/null > test-expected-output +cmp testoutput test-expected-output + +echo testfeature01 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt -R 5 > testoutput +echo testfeature01 1 > test-expected-output +cmp testoutput test-expected-output +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt -R 5 > testoutput +echo testfeature10 1 > test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt -r 1 -R 5 > testoutput +echo testfeature10 1 > test-expected-output +cmp testoutput test-expected-output + +# NB: one might be tempted to insert a test here for having both keys +# in the keylist, but in non-database order, and then checking that +# the result list is also in that non-database order. I think that +# would be misguided, as the efficient way of dealing with such a +# keylist is to advance as-sequentially-as-possible through the +# database; it just so happens that our current implementation is not +# so smart. + +echo "query point (0.5,0.0)" +intstring 2 > testquery +floatstring 0.5 0 >> testquery + +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -R 5 > testoutput +echo testfeature01 1 > test-expected-output +echo testfeature10 1 >> test-expected-output +cmp testoutput test-expected-output + +echo testfeature10 > testkl.txt +${AUDIODB} -d testdb -Q sequence -l 1 -f testquery -K testkl.txt -r 1 -R 5 > testoutput +echo testfeature10 1 > test-expected-output +cmp testoutput test-expected-output + +exit 104 diff -r 0e75deb7d4d1 -r 8c81cacf5aab tests/0033/short-description --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0033/short-description Fri Nov 23 11:08:15 2007 +0000 @@ -0,0 +1,1 @@ +0010 with -K restriction \ No newline at end of file