Mercurial > hg > audiodb
changeset 243:15b8ff55ea5b audiodb-debian
Merge trunk changes -r290:313 into audiodb-debian branch.
(+ new debian/changelog)
author | mas01cr |
---|---|
date | Fri, 14 Dec 2007 14:41:37 +0000 |
parents | b83f0fd53a2c |
children | cbf51690c78c |
files | INSTALL.txt Makefile audioDB.cpp audioDB.h common.cpp create.cpp debian/changelog dump.cpp insert.cpp query.cpp reporter.h soap.cpp tests/0003/run-test.sh tests/0004/run-test.sh tests/0009/run-test.sh tests/0014/run-test.sh tests/0019/run-test.sh tests/0032/run-test.sh tests/0035/run-test.sh tests/0035/short-description tests/run-tests.sh |
diffstat | 21 files changed, 1882 insertions(+), 2596 deletions(-) [+] |
line wrap: on
line diff
--- a/INSTALL.txt Wed Dec 05 14:11:04 2007 +0000 +++ b/INSTALL.txt Fri Dec 14 14:41:37 2007 +0000 @@ -12,7 +12,13 @@ help2man: a spartan manual page is automatically generated from the usage text using help2man. -Use the following Makefiles to compile audioDB on your system: +bash: audioDB's test suite depends on /bin/sh being bash, version 3.0 + or greater. + +Compiling +--------- + +Use the following make commands to compile audioDB on your system: Linux: make (Debian-derived systems, with the right versions of gengetopt, gsoap @@ -25,3 +31,28 @@ GSOAP_INCLUDE="-I /path/to/gsoap" (where the path in GSOAP_INCLUDE should name the directory in which stdsoap2.h resides) + +Testing +------- + +audioDB comes with a suite of test scripts that verify that particular +pieces of functionality are behaving acceptably. They depend on +/bin/sh being GNU bash, version 3 or greater. The test suite can be +run with the command "make test". + +When running on systems without sparse file support (notably those +using the default filesystem in Mac OS X), running the test suite +requires approximately 70GB of free hard disk space. On other +systems, disk usage from running the test suite is negligible. + +Installation +------------ + +The audioDB executable is command-line database client, SOAP server +and SOAP client all in one: to install for system-wide use, simply +copy it to a place in the system-wide path. + +The build process also generates a Web Service Description file, +adb.wsdl; to publish a description of an audioDB web service, you will +need to edit that file to point to the network address and port of +your audioDB server before publishing it on an ordinary HTTP server.
--- a/Makefile Wed Dec 05 14:11:04 2007 +0000 +++ b/Makefile Fri Dec 14 14:41:37 2007 +0000 @@ -4,7 +4,10 @@ GSOAP_CPP=-lgsoap++ GSOAP_INCLUDE= -CFLAGS=-O3 -g +override CFLAGS+=-O3 -g +ifeq ($(shell uname),Linux) +override CFLAGS+=-D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 +endif EXECUTABLE=audioDB @@ -21,19 +24,23 @@ cmdline.c cmdline.h: gengetopt.in ${GENGETOPT} -e <gengetopt.in -soapServer.cpp soapClient.cpp soapC.cpp: audioDBws.h +soapServer.cpp soapClient.cpp soapC.cpp adb.nsmap: audioDBws.h ${SOAPCPP2} audioDBws.h -${EXECUTABLE}: audioDB.cpp audioDB.h soapServer.cpp soapClient.cpp soapC.cpp cmdline.c cmdline.h - g++ -c ${CFLAGS} ${GSOAP_INCLUDE} -Wall -Werror audioDB.cpp - g++ -o ${EXECUTABLE} ${CFLAGS} ${GSOAP_INCLUDE} audioDB.o soapServer.cpp soapClient.cpp soapC.cpp cmdline.c ${GSOAP_CPP} +%.o: %.cpp audioDB.h adb.nsmap cmdline.h reporter.h + g++ -c ${CFLAGS} ${GSOAP_INCLUDE} -Wall -Werror $< + +OBJS=insert.o create.o common.o dump.o query.o soap.o audioDB.o + +${EXECUTABLE}: ${OBJS} soapServer.cpp soapClient.cpp soapC.cpp cmdline.c + g++ -o ${EXECUTABLE} ${CFLAGS} ${GSOAP_INCLUDE} $^ ${GSOAP_CPP} clean: -rm cmdline.c cmdline.h -rm soapServer.cpp soapClient.cpp soapC.cpp soapObject.h soapStub.h soapProxy.h soapH.h soapServerLib.cpp soapClientLib.cpp - -rm adb.nsmap adb.xsd adb.wsdl adb.query.req.xml adb.query.res.xml adb.status.req.xml adb.status.res.xml + -rm adb.nsmap adb.xsd adb.wsdl adb.*.req.xml adb.*.res.xml -rm HELP.txt - -rm ${EXECUTABLE} ${EXECUTABLE}.1 audioDB.o + -rm ${EXECUTABLE} ${EXECUTABLE}.1 ${OBJS} -sh -c "cd tests && sh ./clean.sh" test: ${EXECUTABLE}
--- a/audioDB.cpp Wed Dec 05 14:11:04 2007 +0000 +++ b/audioDB.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -1,36 +1,5 @@ #include "audioDB.h" -#if defined(O2_DEBUG) -void sigterm_action(int signal, siginfo_t *info, void *context) { - exit(128+signal); -} - -void sighup_action(int signal, siginfo_t *info, void *context) { - // FIXME: reread any configuration files -} -#endif - -void audioDB::error(const char* a, const char* b, const char *sysFunc) { - if(isServer) { - /* FIXME: I think this is leaky -- we never delete err. actually - deleting it is tricky, though; it gets placed into some - soap-internal struct with uncertain extent... -- CSR, - 2007-10-01 */ - char *err = new char[256]; /* FIXME: overflows */ - snprintf(err, 255, "%s: %s\n%s", a, b, sysFunc ? strerror(errno) : ""); - /* FIXME: actually we could usefully do with a properly structured - type, so that we can throw separate faultstring and details. - -- CSR, 2007-10-01 */ - throw(err); - } else { - cerr << a << ": " << b << endl; - if (sysFunc) { - perror(sysFunc); - } - exit(1); - } -} - audioDB::audioDB(const unsigned argc, char* const argv[]): O2_AUDIODB_INITIALIZERS { if(processArgs(argc, argv)<0){ @@ -159,10 +128,10 @@ } if(args_info.verbosity_given){ - verbosity=args_info.verbosity_arg; - if(verbosity<0 || verbosity>10){ - cerr << "Warning: verbosity out of range, setting to 1" << endl; - verbosity=1; + verbosity = args_info.verbosity_arg; + if(verbosity < 0 || verbosity > 10){ + std::cerr << "Warning: verbosity out of range, setting to 1" << std::endl; + verbosity = 1; } } @@ -173,15 +142,13 @@ size = (off_t) args_info.size_arg * 1000000; } - if(args_info.radius_given){ - radius=args_info.radius_arg; - if(radius<=0 || radius>1000000000){ + if(args_info.radius_given) { + radius = args_info.radius_arg; + if(radius <= 0 || radius > 1000000000) { error("radius out of range"); + } else { + VERB_LOG(3, "Setting radius to %f\n", radius); } - else - if(verbosity>3) { - cerr << "Setting radius to " << radius << endl; - } } if(args_info.SERVER_given){ @@ -249,7 +216,7 @@ if(args_info.times_given){ timesFileName=args_info.times_arg; if(strlen(timesFileName)>0){ - if(!(timesFile = new ifstream(timesFileName,ios::in))) + if(!(timesFile = new std::ifstream(timesFileName,std::ios::in))) error("Could not open times file for reading", timesFileName); usingTimes=1; } @@ -276,7 +243,7 @@ /* TO DO: REPLACE WITH if(args_info.keyList_given){ trackFileName=args_info.keyList_arg; - if(strlen(trackFileName)>0 && !(trackFile = new ifstream(trackFileName,ios::in))) + if(strlen(trackFileName)>0 && !(trackFile = new std::ifstream(trackFileName,std::ios::in))) error("Could not open keyList file for reading",trackFileName); } AND UPDATE BATCHINSERT() @@ -285,7 +252,7 @@ if(args_info.timesList_given){ timesFileName=args_info.timesList_arg; if(strlen(timesFileName)>0){ - if(!(timesFile = new ifstream(timesFileName,ios::in))) + if(!(timesFile = new std::ifstream(timesFileName,std::ios::in))) error("Could not open timesList file for reading", timesFileName); usingTimes=1; } @@ -293,7 +260,7 @@ if(args_info.powerList_given){ powerFileName=args_info.powerList_arg; if(strlen(powerFileName)>0){ - if(!(powerFile = new ifstream(powerFileName,ios::in))) + if(!(powerFile = new std::ifstream(powerFileName,std::ios::in))) error("Could not open powerList file for reading", powerFileName); usingPower=1; } @@ -309,14 +276,14 @@ if(args_info.keyList_given){ trackFileName=args_info.keyList_arg; - if(strlen(trackFileName)>0 && !(trackFile = new ifstream(trackFileName,ios::in))) + if(strlen(trackFileName)>0 && !(trackFile = new std::ifstream(trackFileName,std::ios::in))) error("Could not open keyList file for reading",trackFileName); } if(args_info.times_given){ timesFileName=args_info.times_arg; if(strlen(timesFileName)>0){ - if(!(timesFile = new ifstream(timesFileName,ios::in))) + if(!(timesFile = new std::ifstream(timesFileName,std::ios::in))) error("Could not open times file for reading", timesFileName); usingTimes=1; } @@ -382,594 +349,6 @@ return -1; // no command found } -void audioDB::get_lock(int fd, bool exclusive) { - struct flock lock; - int status; - - lock.l_type = exclusive ? F_WRLCK : F_RDLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; /* "the whole file" */ - - retry: - do { - status = fcntl(fd, F_SETLKW, &lock); - } while (status != 0 && errno == EINTR); - - if (status) { - if (errno == EAGAIN) { - sleep(1); - goto retry; - } else { - error("fcntl lock error", "", "fcntl"); - } - } -} - -void audioDB::release_lock(int fd) { - struct flock lock; - int status; - - lock.l_type = F_UNLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 0; - - status = fcntl(fd, F_SETLKW, &lock); - - if (status) - error("fcntl unlock error", "", "fcntl"); -} - -/* Make a new database. - - The database consists of: - - * a header (see dbTableHeader struct definition); - * keyTable: list of keys of tracks; - * trackTable: Maps implicit feature index to a feature vector - matrix (sizes of tracks) - * featureTable: Lots of doubles; - * timesTable: (start,end) time points for each feature vector; - * powerTable: associated power for each feature vector; - * l2normTable: squared l2norms for each feature vector. -*/ -void audioDB::create(const char* dbName){ - if ((dbfid = open (dbName, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) - error("Can't create database file", dbName, "open"); - get_lock(dbfid, 1); - - if(verbosity) { - cerr << "header size:" << O2_HEADERSIZE << endl; - } - - dbH = new dbTableHeaderT(); - assert(dbH); - - unsigned int maxfiles = (unsigned int) rint((double) O2_MAXFILES * (double) size / (double) O2_DEFAULTDBSIZE); - - // Initialize header - dbH->magic = O2_MAGIC; - dbH->version = O2_FORMAT_VERSION; - dbH->numFiles = 0; - dbH->dim = 0; - dbH->flags = 0; - dbH->headerSize = O2_HEADERSIZE; - dbH->length = 0; - dbH->fileTableOffset = ALIGN_PAGE_UP(O2_HEADERSIZE); - dbH->trackTableOffset = ALIGN_PAGE_UP(dbH->fileTableOffset + O2_FILETABLESIZE*maxfiles); - dbH->dataOffset = ALIGN_PAGE_UP(dbH->trackTableOffset + O2_TRACKTABLESIZE*maxfiles); - dbH->l2normTableOffset = ALIGN_PAGE_DOWN(size - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); - dbH->powerTableOffset = ALIGN_PAGE_DOWN(dbH->l2normTableOffset - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); - dbH->timesTableOffset = ALIGN_PAGE_DOWN(dbH->powerTableOffset - 2*maxfiles*O2_MEANNUMVECTORS*sizeof(double)); - dbH->dbSize = size; - - write(dbfid, dbH, O2_HEADERSIZE); - - // go to the location corresponding to the last byte - if (lseek (dbfid, size - 1, SEEK_SET) == -1) - error("lseek error in db file", "", "lseek"); - - // write a dummy byte at the last location - if (write (dbfid, "", 1) != 1) - error("write error", "", "write"); - - if(verbosity) { - cerr << COM_CREATE << " " << dbName << endl; - } -} - -void audioDB::drop(){ - // FIXME: drop something? Should we even allow this? -} - -void audioDB::initDBHeader(const char* dbName) { - if ((dbfid = open(dbName, forWrite ? O_RDWR : O_RDONLY)) < 0) { - error("Can't open database file", dbName, "open"); - } - - get_lock(dbfid, forWrite); - // Get the database header info - dbH = new dbTableHeaderT(); - assert(dbH); - - if(read(dbfid, (char *) dbH, O2_HEADERSIZE) != O2_HEADERSIZE) { - error("error reading db header", dbName, "read"); - } - - if(dbH->magic == O2_OLD_MAGIC) { - // FIXME: if anyone ever complains, write the program to convert - // from the old audioDB format to the new... - error("database file has old O2 header", dbName); - } - - if(dbH->magic != O2_MAGIC) { - cerr << "expected: " << O2_MAGIC << ", got: " << dbH->magic << endl; - error("database file has incorrect header", dbName); - } - - if(dbH->version != O2_FORMAT_VERSION) { - error("database file has incorrect version", dbName); - } - - if(dbH->headerSize != O2_HEADERSIZE) { - error("sizeof(dbTableHeader) unexpected: platform ABI mismatch?", dbName); - } - -#define CHECKED_MMAP(type, var, start, length) \ - { void *tmp = mmap(0, length, (PROT_READ | (forWrite ? PROT_WRITE : 0)), MAP_SHARED, dbfid, (start)); \ - if(tmp == (void *) -1) { \ - error("mmap error for db table", #var, "mmap"); \ - } \ - var = (type) tmp; \ - } - - CHECKED_MMAP(char *, db, 0, getpagesize()); - - // Make some handy tables with correct types - if(forWrite || (dbH->length > 0)) { - if(forWrite) { - fileTableLength = dbH->trackTableOffset - dbH->fileTableOffset; - trackTableLength = dbH->dataOffset - dbH->trackTableOffset; - dataBufLength = dbH->timesTableOffset - dbH->dataOffset; - timesTableLength = dbH->powerTableOffset - dbH->timesTableOffset; - powerTableLength = dbH->l2normTableOffset - dbH->powerTableOffset; - l2normTableLength = dbH->dbSize - dbH->l2normTableOffset; - } else { - fileTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLESIZE); - trackTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_TRACKTABLESIZE); - dataBufLength = ALIGN_PAGE_UP(dbH->length); - timesTableLength = ALIGN_PAGE_UP(2*(dbH->length / dbH->dim)); - powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); - l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); - } - CHECKED_MMAP(char *, fileTable, dbH->fileTableOffset, fileTableLength); - CHECKED_MMAP(unsigned *, trackTable, dbH->trackTableOffset, trackTableLength); - /* - * No more mmap() for dataBuf - * - * FIXME: Actually we do do the mmap() in the two cases where it's - * still "needed": in pointQuery and in l2norm if dbH->length is - * non-zero. Removing those cases too (and deleting the dataBuf - * variable completely) would be cool. -- CSR, 2007-11-19 - * - * CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); - */ - CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength); - CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength); - CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength); - } -} - -void audioDB::initInputFile (const char *inFile) { - if (inFile) { - if ((infid = open(inFile, O_RDONLY)) < 0) { - error("can't open input file for reading", inFile, "open"); - } - - if (fstat(infid, &statbuf) < 0) { - error("fstat error finding size of input", inFile, "fstat"); - } - - if(dbH->dim == 0 && dbH->length == 0) { // empty database - // initialize with input dimensionality - if(read(infid, &dbH->dim, sizeof(unsigned)) != sizeof(unsigned)) { - error("short read of input file", inFile); - } - if(dbH->dim == 0) { - error("dimensionality of zero in input file", inFile); - } - } else { - unsigned test; - if(read(infid, &test, sizeof(unsigned)) != sizeof(unsigned)) { - error("short read of input file", inFile); - } - if(dbH->dim == 0) { - error("dimensionality of zero in input file", inFile); - } - if(dbH->dim != test) { - cerr << "error: expected dimension: " << dbH->dim << ", got : " << test <<endl; - error("feature dimensions do not match database table dimensions", inFile); - } - } - - if ((indata = (char *) mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, infid, 0)) == (caddr_t) -1) { - error("mmap error for input", inFile, "mmap"); - } - } -} - -void audioDB::initTables(const char* dbName, const char* inFile = 0) { - initDBHeader(dbName); - initInputFile(inFile); -} - -bool audioDB::enough_data_space_free(off_t size) { - return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); -} - -void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { - lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); - write(dbfid, buffer, size); -} - -void audioDB::insert(const char* dbName, const char* inFile) { - forWrite = true; - initTables(dbName, inFile); - - if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - error("Must use timestamps with timestamped database","use --times"); - - if(!usingPower && (dbH->flags & O2_FLAG_POWER)) - error("Must use power with power-enabled database", dbName); - - if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { - error("Insert failed: no more room in database", inFile); - } - - if(!key) - key=inFile; - // Linear scan of filenames check for pre-existing feature - unsigned alreadyInserted=0; - for(unsigned k=0; k<dbH->numFiles; k++) - if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key)+1)==0){ - alreadyInserted=1; - break; - } - - if(alreadyInserted){ - if(verbosity) { - cerr << "Warning: key already exists in database, ignoring: " <<inFile << endl; - } - return; - } - - // Make a track index table of features to file indexes - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - if(!numVectors){ - if(verbosity) { - cerr << "Warning: ignoring zero-length feature vector file:" << key << endl; - } - // CLEAN UP - munmap(indata,statbuf.st_size); - munmap(db,dbH->dbSize); - close(infid); - return; - } - - strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key)); - - off_t insertoffset = dbH->length;// Store current state - - // Check times status and insert times from file - unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); - double *timesdata = timesTable + 2*indexoffset; - - if(2*(indexoffset + numVectors) > timesTableLength) { - error("out of space for times", key); - } - - if (usingTimes) { - insertTimeStamps(numVectors, timesFile, timesdata); - } - - double *powerdata = powerTable + indexoffset; - insertPowerData(numVectors, powerfd, powerdata); - - // Increment file count - dbH->numFiles++; - - // Update Header information - dbH->length+=(statbuf.st_size-sizeof(int)); - - // Update track to file index map - memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned)); - - insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); - - // Norm the vectors on input if the database is already L2 normed - if(dbH->flags & O2_FLAG_L2NORM) - unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append - - // Report status - status(dbName); - if(verbosity) { - cerr << COM_INSERT << " " << dbName << " " << numVectors << " vectors " - << (statbuf.st_size-sizeof(int)) << " bytes." << endl; - } - - // Copy the header back to the database - memcpy (db, dbH, sizeof(dbTableHeaderT)); - - // CLEAN UP - munmap(indata,statbuf.st_size); - close(infid); -} - -void audioDB::insertTimeStamps(unsigned numVectors, ifstream *timesFile, double *timesdata) { - assert(usingTimes); - - unsigned numtimes = 0; - - if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) { - dbH->flags=dbH->flags|O2_FLAG_TIMES; - } else if(!(dbH->flags & O2_FLAG_TIMES)) { - error("Timestamp file used with non-timestamped database", timesFileName); - } - - if(!timesFile->is_open()) { - error("problem opening times file on timestamped database", timesFileName); - } - - double timepoint, next; - *timesFile >> timepoint; - if (timesFile->eof()) { - error("no entries in times file", timesFileName); - } - numtimes++; - do { - *timesFile >> next; - if (timesFile->eof()) { - break; - } - numtimes++; - timesdata[0] = timepoint; - timepoint = (timesdata[1] = next); - timesdata += 2; - } while (numtimes < numVectors + 1); - - if (numtimes < numVectors + 1) { - error("too few timepoints in times file", timesFileName); - } - - *timesFile >> next; - if (!timesFile->eof()) { - error("too many timepoints in times file", timesFileName); - } -} - -void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { - if (usingPower) { - if (!(dbH->flags & O2_FLAG_POWER)) { - error("Cannot insert power data on non-power DB", dbName); - } - - int one; - unsigned int count; - - count = read(powerfd, &one, sizeof(unsigned int)); - if (count != sizeof(unsigned int)) { - error("powerfd read failed", "int", "read"); - } - if (one != 1) { - error("dimensionality of power file not 1", powerFileName); - } - - // FIXME: should check that the powerfile is the right size for - // this. -- CSR, 2007-10-30 - count = read(powerfd, powerdata, numVectors * sizeof(double)); - if (count != numVectors * sizeof(double)) { - error("powerfd read failed", "double", "read"); - } - } -} - -void audioDB::batchinsert(const char* dbName, const char* inFile) { - - forWrite = true; - initDBHeader(dbName); - - if(!key) - key=inFile; - ifstream *filesIn = 0; - ifstream *keysIn = 0; - ifstream* thisTimesFile = 0; - int thispowerfd = 0; - - if(!(filesIn = new ifstream(inFile))) - error("Could not open batch in file", inFile); - if(key && key!=inFile) - if(!(keysIn = new ifstream(key))) - error("Could not open batch key file",key); - - if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - error("Must use timestamps with timestamped database","use --times"); - - if(!usingPower && (dbH->flags & O2_FLAG_POWER)) - error("Must use power with power-enabled database", dbName); - - unsigned totalVectors=0; - char *thisKey = new char[MAXSTR]; - char *thisFile = new char[MAXSTR]; - char *thisTimesFileName = new char[MAXSTR]; - char *thisPowerFileName = new char[MAXSTR]; - - do{ - filesIn->getline(thisFile,MAXSTR); - if(key && key!=inFile) - keysIn->getline(thisKey,MAXSTR); - else - thisKey = thisFile; - if(usingTimes) - timesFile->getline(thisTimesFileName,MAXSTR); - if(usingPower) - powerFile->getline(thisPowerFileName, MAXSTR); - - if(filesIn->eof()) - break; - - initInputFile(thisFile); - - if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { - error("batchinsert failed: no more room in database", thisFile); - } - - // Linear scan of filenames check for pre-existing feature - unsigned alreadyInserted=0; - - for(unsigned k=0; k<dbH->numFiles; k++) - if(strncmp(fileTable + k*O2_FILETABLESIZE, thisKey, strlen(thisKey)+1)==0){ - alreadyInserted=1; - break; - } - - if(alreadyInserted){ - if(verbosity) { - cerr << "Warning: key already exists in database:" << thisKey << endl; - } - } - else{ - - // Make a track index table of features to file indexes - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - if(!numVectors){ - if(verbosity) { - cerr << "Warning: ignoring zero-length feature vector file:" << thisKey << endl; - } - } - else{ - if(usingTimes){ - if(timesFile->eof()) { - error("not enough timestamp files in timesList", timesFileName); - } - thisTimesFile = new ifstream(thisTimesFileName,ios::in); - if(!thisTimesFile->is_open()) { - error("Cannot open timestamp file", thisTimesFileName); - } - off_t insertoffset = dbH->length; - unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double)); - double *timesdata = timesTable + 2*indexoffset; - if(2*(indexoffset + numVectors) > timesTableLength) { - error("out of space for times", key); - } - insertTimeStamps(numVectors, thisTimesFile, timesdata); - if(thisTimesFile) - delete thisTimesFile; - } - - if (usingPower) { - if(powerFile->eof()) { - error("not enough power files in powerList", powerFileName); - } - thispowerfd = open(thisPowerFileName, O_RDONLY); - if (thispowerfd < 0) { - error("failed to open power file", thisPowerFileName); - } - unsigned insertoffset = dbH->length; - unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double)); - double *powerdata = powerTable + poweroffset; - insertPowerData(numVectors, thispowerfd, powerdata); - if (0 < thispowerfd) { - close(thispowerfd); - } - } - strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey)); - - off_t insertoffset = dbH->length;// Store current state - - // Increment file count - dbH->numFiles++; - - // Update Header information - dbH->length+=(statbuf.st_size-sizeof(int)); - - // Update track to file index map - memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); - - insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); - - // Norm the vectors on input if the database is already L2 normed - if(dbH->flags & O2_FLAG_L2NORM) - unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append - - totalVectors+=numVectors; - - // Copy the header back to the database - memcpy (db, dbH, sizeof(dbTableHeaderT)); - } - } - // CLEAN UP - munmap(indata,statbuf.st_size); - close(infid); - }while(!filesIn->eof()); - - if(verbosity) { - cerr << COM_BATCHINSERT << " " << dbName << " " << totalVectors << " vectors " - << totalVectors*dbH->dim*sizeof(double) << " bytes." << endl; - } - - // Report status - status(dbName); -} - -// FIXME: this can't propagate the sequence length argument (used for -// dudCount). See adb__status() definition for the other half of -// this. -- CSR, 2007-10-01 -void audioDB::ws_status(const char*dbName, char* hostport){ - struct soap soap; - adb__statusResponse adbStatusResponse; - - // Query an existing adb database - soap_init(&soap); - if(soap_call_adb__status(&soap,hostport,NULL,(char*)dbName,adbStatusResponse)==SOAP_OK) { - cout << "numFiles = " << adbStatusResponse.result.numFiles << endl; - cout << "dim = " << adbStatusResponse.result.dim << endl; - cout << "length = " << adbStatusResponse.result.length << endl; - cout << "dudCount = " << adbStatusResponse.result.dudCount << endl; - cout << "nullCount = " << adbStatusResponse.result.nullCount << endl; - cout << "flags = " << adbStatusResponse.result.flags << endl; - } else { - soap_print_fault(&soap,stderr); - } - - soap_destroy(&soap); - soap_end(&soap); - soap_done(&soap); -} - -void audioDB::ws_query(const char*dbName, const char *trackKey, const char* hostport){ - struct soap soap; - adb__queryResponse adbQueryResponse; - - soap_init(&soap); - if(soap_call_adb__query(&soap,hostport,NULL, - (char*)dbName,(char*)trackKey,(char*)trackFileName,(char*)timesFileName, - queryType, queryPoint, pointNN, trackNN, sequenceLength, adbQueryResponse)==SOAP_OK){ - //std::cerr << "result list length:" << adbQueryResponse.result.__sizeRlist << std::endl; - for(int i=0; i<adbQueryResponse.result.__sizeRlist; i++) - std::cout << adbQueryResponse.result.Rlist[i] << " " << adbQueryResponse.result.Dist[i] - << " " << adbQueryResponse.result.Qpos[i] << " " << adbQueryResponse.result.Spos[i] << std::endl; - } - else - soap_print_fault(&soap,stderr); - - soap_destroy(&soap); - soap_end(&soap); - soap_done(&soap); - -} - - void audioDB::status(const char* dbName, adb__statusResponse *adbStatusResponse){ if(!dbH) initTables(dbName, 0); @@ -987,18 +366,18 @@ if(adbStatusResponse == 0) { // Update Header information - cout << "num files:" << dbH->numFiles << endl; - cout << "data dim:" << dbH->dim <<endl; + std::cout << "num files:" << dbH->numFiles << std::endl; + std::cout << "data dim:" << dbH->dim <<std::endl; if(dbH->dim>0){ - cout << "total vectors:" << dbH->length/(sizeof(double)*dbH->dim)<<endl; - cout << "vectors available:" << (dbH->timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << endl; + std::cout << "total vectors:" << dbH->length/(sizeof(double)*dbH->dim)<<std::endl; + std::cout << "vectors available:" << (dbH->timesTableOffset-(dbH->dataOffset+dbH->length))/(sizeof(double)*dbH->dim) << std::endl; } - cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << endl; - cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" << - (100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << endl; - cout << "flags:" << dbH->flags << endl; + std::cout << "total bytes:" << dbH->length << " (" << (100.0*dbH->length)/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl; + std::cout << "bytes available:" << dbH->timesTableOffset-(dbH->dataOffset+dbH->length) << " (" << + (100.0*(dbH->timesTableOffset-(dbH->dataOffset+dbH->length)))/(dbH->timesTableOffset-dbH->dataOffset) << "%)" << std::endl; + std::cout << "flags:" << dbH->flags << std::endl; - cout << "null count: " << nullCount << " small sequence count " << dudCount-nullCount << endl; + std::cout << "null count: " << nullCount << " small sequence count " << dudCount-nullCount << std::endl; } else { adbStatusResponse->result.numFiles = dbH->numFiles; adbStatusResponse->result.dim = dbH->dim; @@ -1009,182 +388,6 @@ } } -void audioDB::dump(const char* dbName){ - if(!dbH) { - initTables(dbName, 0); - } - - if((mkdir(output, S_IRWXU|S_IRWXG|S_IRWXO)) < 0) { - error("error making output directory", output, "mkdir"); - } - - char *cwd = new char[PATH_MAX]; - - if ((getcwd(cwd, PATH_MAX)) == 0) { - error("error getting working directory", "", "getcwd"); - } - - if((chdir(output)) < 0) { - error("error changing working directory", output, "chdir"); - } - - int fLfd, tLfd = 0, pLfd = 0, kLfd; - FILE *fLFile, *tLFile = 0, *pLFile = 0, *kLFile; - - if ((fLfd = open("featureList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { - error("error creating featureList file", "featureList.txt", "open"); - } - - int times = dbH->flags & O2_FLAG_TIMES; - if (times) { - if ((tLfd = open("timesList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { - error("error creating timesList file", "timesList.txt", "open"); - } - } - - int power = dbH->flags & O2_FLAG_POWER; - if (power) { - if ((pLfd = open("powerList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { - error("error creating powerList file", "powerList.txt", "open"); - } - } - - if ((kLfd = open("keyList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { - error("error creating keyList file", "keyList.txt", "open"); - } - - /* can these fail? I sincerely hope not. */ - fLFile = fdopen(fLfd, "w"); - if (times) { - tLFile = fdopen(tLfd, "w"); - } - if (power) { - pLFile = fdopen(pLfd, "w"); - } - kLFile = fdopen(kLfd, "w"); - - char *fName = new char[256]; - int ffd, pfd; - FILE *tFile; - unsigned pos = 0; - lseek(dbfid, dbH->dataOffset, SEEK_SET); - double *data_buffer; - size_t data_buffer_size; - for(unsigned k = 0; k < dbH->numFiles; k++) { - fprintf(kLFile, "%s\n", fileTable + k*O2_FILETABLESIZE); - snprintf(fName, 256, "%05d.features", k); - if ((ffd = open(fName, O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { - error("error creating feature file", fName, "open"); - } - if ((write(ffd, &dbH->dim, sizeof(uint32_t))) < 0) { - error("error writing dimensions", fName, "write"); - } - - /* FIXME: this repeated malloc()/free() of data buffers is - inefficient. */ - data_buffer_size = trackTable[k] * dbH->dim * sizeof(double); - - { - void *tmp = malloc(data_buffer_size); - if (tmp == NULL) { - error("error allocating data buffer"); - } - data_buffer = (double *) tmp; - } - - if ((read(dbfid, data_buffer, data_buffer_size)) != (ssize_t) data_buffer_size) { - error("error reading data", fName, "read"); - } - - if ((write(ffd, data_buffer, data_buffer_size)) < 0) { - error("error writing data", fName, "write"); - } - - free(data_buffer); - - fprintf(fLFile, "%s\n", fName); - close(ffd); - - if (times) { - snprintf(fName, 256, "%05d.times", k); - tFile = fopen(fName, "w"); - for(unsigned i = 0; i < trackTable[k]; i++) { - // KLUDGE: specifying 16 digits of precision after the decimal - // point is (but check this!) sufficient to uniquely identify - // doubles; however, that will cause ugliness, as that's - // vastly too many for most values of interest. Moving to %a - // here and scanf() in the timesFile reading might fix this. - // -- CSR, 2007-10-19 - fprintf(tFile, "%.16e\n", *(timesTable + 2*pos + 2*i)); - } - fprintf(tFile, "%.16e\n", *(timesTable + 2*pos + 2*trackTable[k]-1)); - - fprintf(tLFile, "%s\n", fName); - } - - if (power) { - uint32_t one = 1; - snprintf(fName, 256, "%05d.power", k); - if ((pfd = open(fName, O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { - error("error creating power file", fName, "open"); - } - if ((write(pfd, &one, sizeof(uint32_t))) < 0) { - error("error writing one", fName, "write"); - } - if ((write(pfd, powerTable + pos, trackTable[k] * sizeof(double))) < 0) { - error("error writing data", fName, "write"); - } - fprintf(pLFile, "%s\n", fName); - close(pfd); - } - - pos += trackTable[k]; - cout << fileTable+k*O2_FILETABLESIZE << " " << trackTable[k] << endl; - } - - FILE *scriptFile; - scriptFile = fopen("restore.sh", "w"); - fprintf(scriptFile, "\ -#! /bin/sh\n\ -#\n\ -# usage: AUDIODB=/path/to/audioDB sh ./restore.sh <newdb>\n\ -\n\ -if [ -z \"${AUDIODB}\" ]; then echo set AUDIODB variable; exit 1; fi\n\ -if [ -z \"$1\" ]; then echo usage: $0 newdb; exit 1; fi\n\n\ -\"${AUDIODB}\" -d \"$1\" -N --size=%d\n", (int) (dbH->dbSize / 1000000)); - if(dbH->flags & O2_FLAG_L2NORM) { - fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -L\n"); - } - if(power) { - fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -P\n"); - } - fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -B -F featureList.txt -K keyList.txt"); - if(times) { - fprintf(scriptFile, " -T timesList.txt"); - } - if(power) { - fprintf(scriptFile, " -W powerList.txt"); - } - fprintf(scriptFile, "\n"); - fclose(scriptFile); - - if((chdir(cwd)) < 0) { - error("error changing working directory", cwd, "chdir"); - } - - fclose(fLFile); - if(times) { - fclose(tLFile); - } - if(power) { - fclose(pLFile); - } - fclose(kLFile); - delete[] fName; - - status(dbName); -} - void audioDB::l2norm(const char* dbName) { forWrite = true; initTables(dbName, 0); @@ -1209,1565 +412,11 @@ memcpy(db, dbH, O2_HEADERSIZE); } -bool audioDB::powers_acceptable(double p1, double p2) { - if (use_absolute_threshold) { - if ((p1 < absolute_threshold) || (p2 < absolute_threshold)) { - return false; - } - } - if (use_relative_threshold) { - if (fabs(p1-p2) > fabs(relative_threshold)) { - return false; - } - } - return true; -} +// Unit norm block of features -void audioDB::query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ - switch(queryType){ - case O2_POINT_QUERY: - pointQuery(dbName, inFile, adbQueryResponse); - break; - case O2_SEQUENCE_QUERY: - if(radius==0) - trackSequenceQueryNN(dbName, inFile, adbQueryResponse); - else - trackSequenceQueryRad(dbName, inFile, adbQueryResponse); - break; - case O2_TRACK_QUERY: - trackPointQuery(dbName, inFile, adbQueryResponse); - break; - default: - error("unrecognized queryType in query()"); - - } -} - -//return ordinal position of key in keyTable -unsigned audioDB::getKeyPos(char* key){ - for(unsigned k=0; k<dbH->numFiles; k++) - if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key))==0) - return k; - error("Key not found",key); - return O2_ERR_KEYNOTFOUND; -} - -// Basic point query engine -void audioDB::pointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { - - initTables(dbName, inFile); - - // For each input vector, find the closest pointNN matching output vectors and report - // we use stdout in this stub version - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - - double* query = (double*)(indata+sizeof(int)); - CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); - double* data = dataBuf; - double* queryCopy = 0; - - if( dbH->flags & O2_FLAG_L2NORM ){ - // Make a copy of the query - queryCopy = new double[numVectors*dbH->dim]; - qNorm = new double[numVectors]; - assert(queryCopy&&qNorm); - memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); - unitNorm(queryCopy, dbH->dim, numVectors, qNorm); - query = queryCopy; - } - - // Make temporary dynamic memory for results - assert(pointNN>0 && pointNN<=O2_MAXNN); - double distances[pointNN]; - unsigned qIndexes[pointNN]; - unsigned sIndexes[pointNN]; - for(unsigned k=0; k<pointNN; k++){ - distances[k]=-DBL_MAX; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - - unsigned j=numVectors; - unsigned k,l,n; - double thisDist; - - unsigned totalVecs=dbH->length/(dbH->dim*sizeof(double)); - double meanQdur = 0; - double *timesdata = 0; - double *querydurs = 0; - double *dbdurs = 0; - - if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ - cerr << "warning: ignoring query timestamps for non-timestamped database" << endl; - usingTimes=0; - } - - else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - cerr << "warning: no timestamps given for query. Ignoring database timestamps." << endl; - - else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ - timesdata = new double[2*numVectors]; - querydurs = new double[numVectors]; - insertTimeStamps(numVectors, timesFile, timesdata); - // Calculate durations of points - for(k=0; k<numVectors-1; k++){ - querydurs[k]=timesdata[2*k+1]-timesdata[2*k]; - meanQdur+=querydurs[k]; - } - meanQdur/=k; - // Individual exhaustive timepoint durations - dbdurs = new double[totalVecs]; - for(k=0; k<totalVecs-1; k++) { - dbdurs[k]=timesTable[2*k+1]-timesTable[2*k]; - } - } - - if(usingQueryPoint) - if(queryPoint>numVectors-1) - error("queryPoint > numVectors in query"); - else{ - if(verbosity>1) { - cerr << "query point: " << queryPoint << endl; cerr.flush(); - } - query=query+queryPoint*dbH->dim; - numVectors=queryPoint+1; - j=1; - } - - gettimeofday(&tv1, NULL); - while(j--){ // query - data=dataBuf; - k=totalVecs; // number of database vectors - while(k--){ // database - thisDist=0; - l=dbH->dim; - double* q=query; - while(l--) - thisDist+=*q++**data++; - if(!usingTimes || - (usingTimes - && fabs(dbdurs[totalVecs-k-1]-querydurs[numVectors-j-1])<querydurs[numVectors-j-1]*timesTol)){ - n=pointNN; - while(n--){ - if(thisDist>=distances[n]){ - if((n==0 || thisDist<=distances[n-1])){ - // Copy all values above up the queue - for( l=pointNN-1 ; l >= n+1 ; l--){ - distances[l]=distances[l-1]; - qIndexes[l]=qIndexes[l-1]; - sIndexes[l]=sIndexes[l-1]; - } - distances[n]=thisDist; - qIndexes[n]=numVectors-j-1; - sIndexes[n]=dbH->length/(sizeof(double)*dbH->dim)-k-1; - break; - } - } - else - break; - } - } - } - // Move query pointer to next query point - query+=dbH->dim; - } - - gettimeofday(&tv2, NULL); - if(verbosity>1) { - cerr << endl << " elapsed time:" << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << endl; - } - - if(adbQueryResponse==0){ - // Output answer - // Loop over nearest neighbours - for(k=0; k < pointNN; k++){ - // Scan for key - unsigned cumTrack=0; - for(l=0 ; l<dbH->numFiles; l++){ - cumTrack+=trackTable[l]; - if(sIndexes[k]<cumTrack){ - cout << fileTable+l*O2_FILETABLESIZE << " " << distances[k] << " " << qIndexes[k] << " " - << sIndexes[k]+trackTable[l]-cumTrack << endl; - break; - } - } - } - } - else{ // Process Web Services Query - int listLen; - for(k = 0; k < pointNN; k++) { - if(distances[k] == -DBL_MAX) - break; - } - listLen = k; - - adbQueryResponse->result.__sizeRlist=listLen; - adbQueryResponse->result.__sizeDist=listLen; - adbQueryResponse->result.__sizeQpos=listLen; - adbQueryResponse->result.__sizeSpos=listLen; - adbQueryResponse->result.Rlist= new char*[listLen]; - adbQueryResponse->result.Dist = new double[listLen]; - adbQueryResponse->result.Qpos = new unsigned int[listLen]; - adbQueryResponse->result.Spos = new unsigned int[listLen]; - for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ - adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; - adbQueryResponse->result.Dist[k]=distances[k]; - adbQueryResponse->result.Qpos[k]=qIndexes[k]; - unsigned cumTrack=0; - for(l=0 ; l<dbH->numFiles; l++){ - cumTrack+=trackTable[l]; - if(sIndexes[k]<cumTrack){ - sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+l*O2_FILETABLESIZE); - break; - } - } - adbQueryResponse->result.Spos[k]=sIndexes[k]+trackTable[l]-cumTrack; - } - } - - // Clean up - if(queryCopy) - delete queryCopy; - if(qNorm) - delete qNorm; - if(timesdata) - delete[] timesdata; - if(querydurs) - delete[] querydurs; - if(dbdurs) - delete dbdurs; -} - -// trackPointQuery -// return the trackNN closest tracks to the query track -// uses average of pointNN points per track -void audioDB::trackPointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { - initTables(dbName, inFile); - - // For each input vector, find the closest pointNN matching output vectors and report - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - double* query = (double*)(indata+sizeof(int)); - double* data; - double* queryCopy = 0; - - if( dbH->flags & O2_FLAG_L2NORM ){ - // Make a copy of the query - queryCopy = new double[numVectors*dbH->dim]; - qNorm = new double[numVectors]; - assert(queryCopy&&qNorm); - memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); - unitNorm(queryCopy, dbH->dim, numVectors, qNorm); - query = queryCopy; - } - - assert(pointNN>0 && pointNN<=O2_MAXNN); - assert(trackNN>0 && trackNN<=O2_MAXNN); - - // Make temporary dynamic memory for results - double trackDistances[trackNN]; - unsigned trackIDs[trackNN]; - unsigned trackQIndexes[trackNN]; - unsigned trackSIndexes[trackNN]; - - double distances[pointNN]; - unsigned qIndexes[pointNN]; - unsigned sIndexes[pointNN]; - - unsigned j=numVectors; // number of query points - unsigned k,l,n, track, trackOffset=0, processedTracks=0; - double thisDist; - - for(k=0; k<pointNN; k++){ - distances[k]=-DBL_MAX; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - - for(k=0; k<trackNN; k++){ - trackDistances[k]=-DBL_MAX; - trackQIndexes[k]=~0; - trackSIndexes[k]=~0; - trackIDs[k]=~0; - } - - double meanQdur = 0; - double *timesdata = 0; - double *querydurs = 0; - double *meanDBdur = 0; - - if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ - cerr << "warning: ignoring query timestamps for non-timestamped database" << endl; - usingTimes=0; - } - - else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - cerr << "warning: no timestamps given for query. Ignoring database timestamps." << endl; - - else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ - timesdata = new double[2*numVectors]; - querydurs = new double[numVectors]; - insertTimeStamps(numVectors, timesFile, timesdata); - // Calculate durations of points - for(k=0; k<numVectors-1; k++) { - querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; - meanQdur += querydurs[k]; - } - meanQdur/=k; - meanDBdur = new double[dbH->numFiles]; - for(k=0; k<dbH->numFiles; k++){ - meanDBdur[k]=0.0; - for(j=0; j<trackTable[k]-1 ; j++) { - meanDBdur[k]+=timesTable[2*j+1]-timesTable[2*j]; - } - meanDBdur[k]/=j; - } - } - - if(usingQueryPoint) - if(queryPoint>numVectors-1) - error("queryPoint > numVectors in query"); - else{ - if(verbosity>1) { - cerr << "query point: " << queryPoint << endl; cerr.flush(); - } - query=query+queryPoint*dbH->dim; - numVectors=queryPoint+1; - } - - // build track offset table - off_t *trackOffsetTable = new off_t[dbH->numFiles]; - unsigned cumTrack=0; - off_t trackIndexOffset; - for(k=0; k<dbH->numFiles;k++){ - trackOffsetTable[k]=cumTrack; - cumTrack+=trackTable[k]*dbH->dim; - } - - char nextKey[MAXSTR]; - - gettimeofday(&tv1, NULL); - - size_t data_buffer_size = 0; - double *data_buffer = 0; - lseek(dbfid, dbH->dataOffset, SEEK_SET); - - for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ - - trackOffset = trackOffsetTable[track]; // numDoubles offset - - // get trackID from file if using a control file - if(trackFile) { - trackFile->getline(nextKey,MAXSTR); - if(!trackFile->eof()) { - track = getKeyPos(nextKey); - trackOffset = trackOffsetTable[track]; - lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); - } else { - break; - } - } - - trackIndexOffset=trackOffset/dbH->dim; // numVectors offset - - if(verbosity>7) { - cerr << track << "." << trackOffset/(dbH->dim) << "." << trackTable[track] << " | ";cerr.flush(); - } - - if(dbH->flags & O2_FLAG_L2NORM) - usingQueryPoint?query=queryCopy+queryPoint*dbH->dim:query=queryCopy; - else - usingQueryPoint?query=(double*)(indata+sizeof(int))+queryPoint*dbH->dim:query=(double*)(indata+sizeof(int)); - if(usingQueryPoint) - j=1; - else - j=numVectors; - - if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { - if(data_buffer) { - free(data_buffer); - } - { - data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; - void *tmp = malloc(data_buffer_size); - if (tmp == NULL) { - error("error allocating data buffer"); - } - data_buffer = (double *) tmp; - } - } - - read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); - - while(j--){ - k=trackTable[track]; // number of vectors in track - data=data_buffer; // data for track - while(k--){ - thisDist=0; - l=dbH->dim; - double* q=query; - while(l--) - thisDist+=*q++**data++; - if(!usingTimes || - (usingTimes - && fabs(meanDBdur[track]-meanQdur)<meanQdur*timesTol)){ - n=pointNN; - while(n--){ - if(thisDist>=distances[n]){ - if((n==0 || thisDist<=distances[n-1])){ - // Copy all values above up the queue - for( l=pointNN-1 ; l > n ; l--){ - distances[l]=distances[l-1]; - qIndexes[l]=qIndexes[l-1]; - sIndexes[l]=sIndexes[l-1]; - } - distances[n]=thisDist; - qIndexes[n]=numVectors-j-1; - sIndexes[n]=trackTable[track]-k-1; - break; - } - } - else - break; - } - } - } // track - // Move query pointer to next query point - query+=dbH->dim; - } // query - // Take the average of this track's distance - // Test the track distances - thisDist=0; - for (n = 0; n < pointNN; n++) { - if (distances[n] == -DBL_MAX) break; - thisDist += distances[n]; - } - thisDist /= n; - - n=trackNN; - while(n--){ - if(thisDist>=trackDistances[n]){ - if((n==0 || thisDist<=trackDistances[n-1])){ - // Copy all values above up the queue - for( l=trackNN-1 ; l > n ; l--){ - trackDistances[l]=trackDistances[l-1]; - trackQIndexes[l]=trackQIndexes[l-1]; - trackSIndexes[l]=trackSIndexes[l-1]; - trackIDs[l]=trackIDs[l-1]; - } - trackDistances[n]=thisDist; - trackQIndexes[n]=qIndexes[0]; - trackSIndexes[n]=sIndexes[0]; - trackIDs[n]=track; - break; - } - } - else - break; - } - for(unsigned k=0; k<pointNN; k++){ - distances[k]=-DBL_MAX; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - } // tracks - - free(data_buffer); - - gettimeofday(&tv2, NULL); - - if(verbosity>1) { - cerr << endl << "processed tracks :" << processedTracks - << " elapsed time:" << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << endl; - } - - if(adbQueryResponse==0){ - if(verbosity>1) { - cerr<<endl; - } - // Output answer - // Loop over nearest neighbours - for(k=0; k < min(trackNN,processedTracks); k++) - cout << fileTable+trackIDs[k]*O2_FILETABLESIZE - << " " << trackDistances[k] << " " << trackQIndexes[k] << " " << trackSIndexes[k] << endl; - } - else{ // Process Web Services Query - int listLen = min(trackNN, processedTracks); - adbQueryResponse->result.__sizeRlist=listLen; - adbQueryResponse->result.__sizeDist=listLen; - adbQueryResponse->result.__sizeQpos=listLen; - adbQueryResponse->result.__sizeSpos=listLen; - adbQueryResponse->result.Rlist= new char*[listLen]; - adbQueryResponse->result.Dist = new double[listLen]; - adbQueryResponse->result.Qpos = new unsigned int[listLen]; - adbQueryResponse->result.Spos = new unsigned int[listLen]; - for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ - adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; - adbQueryResponse->result.Dist[k]=trackDistances[k]; - adbQueryResponse->result.Qpos[k]=trackQIndexes[k]; - adbQueryResponse->result.Spos[k]=trackSIndexes[k]; - sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+trackIDs[k]*O2_FILETABLESIZE); - } - } - - // Clean up - if(trackOffsetTable) - delete trackOffsetTable; - if(queryCopy) - delete queryCopy; - if(qNorm) - delete qNorm; - if(timesdata) - delete[] timesdata; - if(querydurs) - delete[] querydurs; - if(meanDBdur) - delete meanDBdur; -} - -// This is a common pattern in sequence queries: what we are doing is -// taking a window of length seqlen over a buffer of length length, -// and placing the sum of the elements in that window in the first -// element of the window: thus replacing all but the last seqlen -// elements in the buffer the corresponding windowed sum. -void audioDB::sequence_sum(double *buffer, int length, int seqlen) { - double tmp1, tmp2, *ps; - int j, w; - - tmp1 = *buffer; - j = 1; - w = seqlen - 1; - while(w--) { - *buffer += buffer[j++]; - } - ps = buffer + 1; - w = length - seqlen; // +1 - 1 - while(w--) { - tmp2 = *ps; - *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1); - tmp1 = tmp2; - ps++; - } -} - -void audioDB::sequence_sqrt(double *buffer, int length, int seqlen) { - int w = length - seqlen + 1; - while(w--) { - *buffer = sqrt(*buffer); - buffer++; - } -} - -void audioDB::sequence_average(double *buffer, int length, int seqlen) { - int w = length - seqlen + 1; - while(w--) { - *buffer /= seqlen; - buffer++; - } -} - -// k nearest-neighbor (k-NN) search between query and target tracks -// efficient implementation based on matched filter -// assumes normed shingles -// outputs distances of retrieved shingles, max retreived = pointNN shingles per per track -void audioDB::trackSequenceQueryNN(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ - - initTables(dbName, inFile); - - // For each input vector, find the closest pointNN matching output vectors and report - // we use stdout in this stub version - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - double* query = (double*)(indata+sizeof(int)); - double* queryCopy = 0; - - if(!(dbH->flags & O2_FLAG_L2NORM) ) - error("Database must be L2 normed for sequence query","use -L2NORM"); - - if(numVectors<sequenceLength) - error("Query shorter than requested sequence length", "maybe use -l"); - - if(verbosity>1) { - cerr << "performing norms ... "; cerr.flush(); - } - unsigned dbVectors = dbH->length/(sizeof(double)*dbH->dim); - - // Make a copy of the query - queryCopy = new double[numVectors*dbH->dim]; - memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); - qNorm = new double[numVectors]; - sNorm = new double[dbVectors]; - assert(qNorm&&sNorm&&queryCopy&&sequenceLength); - unitNorm(queryCopy, dbH->dim, numVectors, qNorm); - query = queryCopy; - - // Make norm measurements relative to sequenceLength - unsigned w = sequenceLength-1; - unsigned i,j; - - // Copy the L2 norm values to core to avoid disk random access later on - memcpy(sNorm, l2normTable, dbVectors*sizeof(double)); - double* qnPtr = qNorm; - double* snPtr = sNorm; - - double *sPower = 0, *qPower = 0; - double *spPtr = 0, *qpPtr = 0; - - if (usingPower) { - if (!(dbH->flags & O2_FLAG_POWER)) { - error("database not power-enabled", dbName); - } - sPower = new double[dbVectors]; - spPtr = sPower; - memcpy(sPower, powerTable, dbVectors * sizeof(double)); - } - - for(i=0; i<dbH->numFiles; i++){ - if(trackTable[i]>=sequenceLength) { - sequence_sum(snPtr, trackTable[i], sequenceLength); - sequence_sqrt(snPtr, trackTable[i], sequenceLength); - - if (usingPower) { - sequence_sum(spPtr, trackTable[i], sequenceLength); - sequence_average(spPtr, trackTable[i], sequenceLength); - } - } - snPtr += trackTable[i]; - if (usingPower) { - spPtr += trackTable[i]; - } - } - - sequence_sum(qnPtr, numVectors, sequenceLength); - sequence_sqrt(qnPtr, numVectors, sequenceLength); - - if (usingPower) { - qPower = new double[numVectors]; - qpPtr = qPower; - if (lseek(powerfd, sizeof(int), SEEK_SET) == (off_t) -1) { - error("error seeking to data", powerFileName, "lseek"); - } - int count = read(powerfd, qPower, numVectors * sizeof(double)); - if (count == -1) { - error("error reading data", powerFileName, "read"); - } - if ((unsigned) count != numVectors * sizeof(double)) { - error("short read", powerFileName); - } - - sequence_sum(qpPtr, numVectors, sequenceLength); - sequence_average(qpPtr, numVectors, sequenceLength); - } - - if(verbosity>1) { - cerr << "done." << endl; - } - - if(verbosity>1) { - cerr << "matching tracks..." << endl; - } - - assert(pointNN>0 && pointNN<=O2_MAXNN); - assert(trackNN>0 && trackNN<=O2_MAXNN); - - // Make temporary dynamic memory for results - double trackDistances[trackNN]; - unsigned trackIDs[trackNN]; - unsigned trackQIndexes[trackNN]; - unsigned trackSIndexes[trackNN]; - - double distances[pointNN]; - unsigned qIndexes[pointNN]; - unsigned sIndexes[pointNN]; - - - unsigned k,l,m,n,track,trackOffset=0, HOP_SIZE=sequenceHop, wL=sequenceLength; - double thisDist; - - for(k=0; k<pointNN; k++){ - distances[k]=1.0e6; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - - for(k=0; k<trackNN; k++){ - trackDistances[k]=1.0e6; - trackQIndexes[k]=~0; - trackSIndexes[k]=~0; - trackIDs[k]=~0; - } - - // Timestamp and durations processing - double meanQdur = 0; - double *timesdata = 0; - double *querydurs = 0; - double *meanDBdur = 0; - - if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ - cerr << "warning: ignoring query timestamps for non-timestamped database" << endl; - usingTimes=0; - } - - else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - cerr << "warning: no timestamps given for query. Ignoring database timestamps." << endl; - - else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ - timesdata = new double[2*numVectors]; - querydurs = new double[numVectors]; - - insertTimeStamps(numVectors, timesFile, timesdata); - // Calculate durations of points - for(k=0; k<numVectors-1; k++) { - querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; - meanQdur += querydurs[k]; - } - meanQdur/=k; - if(verbosity>1) { - cerr << "mean query file duration: " << meanQdur << endl; - } - meanDBdur = new double[dbH->numFiles]; - assert(meanDBdur); - for(k=0; k<dbH->numFiles; k++){ - meanDBdur[k]=0.0; - for(j=0; j<trackTable[k]-1 ; j++) { - meanDBdur[k]+=timesTable[2*j+1]-timesTable[2*j]; - } - meanDBdur[k]/=j; - } - } - - if(usingQueryPoint) - if(queryPoint>numVectors || queryPoint>numVectors-wL+1) - error("queryPoint > numVectors-wL+1 in query"); - else{ - if(verbosity>1) { - cerr << "query point: " << queryPoint << endl; cerr.flush(); - } - query = query + queryPoint * dbH->dim; - qnPtr = qnPtr + queryPoint; - if (usingPower) { - qpPtr = qpPtr + queryPoint; - } - numVectors=wL; - } - - double ** D = 0; // Differences query and target - double ** DD = 0; // Matched filter distance - - D = new double*[numVectors]; - assert(D); - DD = new double*[numVectors]; - assert(DD); - - gettimeofday(&tv1, NULL); - unsigned processedTracks = 0; - unsigned successfulTracks=0; - - double* qp; - double* sp; - double* dp; - - // build track offset table - off_t *trackOffsetTable = new off_t[dbH->numFiles]; - unsigned cumTrack=0; - off_t trackIndexOffset; - for(k=0; k<dbH->numFiles;k++){ - trackOffsetTable[k]=cumTrack; - cumTrack+=trackTable[k]*dbH->dim; - } - - char nextKey [MAXSTR]; - - // chi^2 statistics - double sampleCount = 0; - double sampleSum = 0; - double logSampleSum = 0; - double minSample = 1e9; - double maxSample = 0; - - // Track loop - size_t data_buffer_size = 0; - double *data_buffer = 0; - lseek(dbfid, dbH->dataOffset, SEEK_SET); - - for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++) { - - trackOffset = trackOffsetTable[track]; // numDoubles offset - - // get trackID from file if using a control file - if(trackFile) { - trackFile->getline(nextKey,MAXSTR); - if(!trackFile->eof()) { - track = getKeyPos(nextKey); - trackOffset = trackOffsetTable[track]; - lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); - } else { - break; - } - } - - trackIndexOffset=trackOffset/dbH->dim; // numVectors offset - - if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { - if(data_buffer) { - free(data_buffer); - } - { - data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; - void *tmp = malloc(data_buffer_size); - if (tmp == NULL) { - error("error allocating data buffer"); - } - data_buffer = (double *) tmp; - } - } - - read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); - - if(sequenceLength<=trackTable[track]){ // test for short sequences - - if(verbosity>7) { - cerr << track << "." << trackIndexOffset << "." << trackTable[track] << " | ";cerr.flush(); - } - - // Sum products matrix - for(j=0; j<numVectors;j++){ - D[j]=new double[trackTable[track]]; - assert(D[j]); - - } - - // Matched filter matrix - for(j=0; j<numVectors;j++){ - DD[j]=new double[trackTable[track]]; - assert(DD[j]); - } - - // Dot product - for(j=0; j<numVectors; j++) - for(k=0; k<trackTable[track]; k++){ - qp=query+j*dbH->dim; - sp=data_buffer+k*dbH->dim; - DD[j][k]=0.0; // Initialize matched filter array - dp=&D[j][k]; // point to correlation cell j,k - *dp=0.0; // initialize correlation cell - l=dbH->dim; // size of vectors - while(l--) - *dp+=*qp++**sp++; - } - - // Matched Filter - // HOP SIZE == 1 - double* spd; - if(HOP_SIZE==1){ // HOP_SIZE = shingleHop - for(w=0; w<wL; w++) - for(j=0; j<numVectors-w; j++){ - sp=DD[j]; - spd=D[j+w]+w; - k=trackTable[track]-w; - while(k--) - *sp+++=*spd++; - } - } - - else{ // HOP_SIZE != 1 - for(w=0; w<wL; w++) - for(j=0; j<numVectors-w; j+=HOP_SIZE){ - sp=DD[j]; - spd=D[j+w]+w; - for(k=0; k<trackTable[track]-w; k+=HOP_SIZE){ - *sp+=*spd; - sp+=HOP_SIZE; - spd+=HOP_SIZE; - } - } - } - - if(verbosity>3 && usingTimes) { - cerr << "meanQdur=" << meanQdur << " meanDBdur=" << meanDBdur[track] << endl; - cerr.flush(); - } - - if(!usingTimes || - (usingTimes - && fabs(meanDBdur[track]-meanQdur)<meanQdur*timesTol)){ - - if(verbosity>3 && usingTimes) { - cerr << "within duration tolerance." << endl; - cerr.flush(); - } - - // Search for minimum distance by shingles (concatenated vectors) - for(j=0;j<=numVectors-wL;j+=HOP_SIZE) - for(k=0;k<=trackTable[track]-wL;k+=HOP_SIZE){ - thisDist=2-(2/(qnPtr[j]*sNorm[trackIndexOffset+k]))*DD[j][k]; - if(verbosity>9) { - cerr << thisDist << " " << qnPtr[j] << " " << sNorm[trackIndexOffset+k] << endl; - } - // Gather chi^2 statistics - if(thisDist<minSample) - minSample=thisDist; - else if(thisDist>maxSample) - maxSample=thisDist; - if(thisDist>1e-9){ - sampleCount++; - sampleSum+=thisDist; - logSampleSum+=log(thisDist); - } - - // diffL2 = fabs(qnPtr[j] - sNorm[trackIndexOffset+k]); - // Power test - if (usingPower) { - if (!(powers_acceptable(qpPtr[j], sPower[trackIndexOffset + k]))) { - thisDist = 1000000.0; - } - } - - // k-NN match algorithm - m=pointNN; - while(m--){ - if(thisDist<=distances[m]) - if(m==0 || thisDist>=distances[m-1]){ - // Shuffle distances up the list - for(l=pointNN-1; l>m; l--){ - distances[l]=distances[l-1]; - qIndexes[l]=qIndexes[l-1]; - sIndexes[l]=sIndexes[l-1]; - } - distances[m]=thisDist; - if(usingQueryPoint) - qIndexes[m]=queryPoint; - else - qIndexes[m]=j; - sIndexes[m]=k; - break; - } - } - } - // Calculate the mean of the N-Best matches - thisDist=0.0; - for(m=0; m<pointNN; m++) { - if (distances[m] == 1000000.0) break; - thisDist+=distances[m]; - } - thisDist/=m; - - // Let's see the distances then... - if(verbosity>3) { - cerr << fileTable+track*O2_FILETABLESIZE << " " << thisDist << endl; - } - - - // All the track stuff goes here - n=trackNN; - while(n--){ - if(thisDist<=trackDistances[n]){ - if((n==0 || thisDist>=trackDistances[n-1])){ - // Copy all values above up the queue - for( l=trackNN-1 ; l > n ; l--){ - trackDistances[l]=trackDistances[l-1]; - trackQIndexes[l]=trackQIndexes[l-1]; - trackSIndexes[l]=trackSIndexes[l-1]; - trackIDs[l]=trackIDs[l-1]; - } - trackDistances[n]=thisDist; - trackQIndexes[n]=qIndexes[0]; - trackSIndexes[n]=sIndexes[0]; - successfulTracks++; - trackIDs[n]=track; - break; - } - } - else - break; - } - } // Duration match - - // Clean up current track - if(D!=NULL){ - for(j=0; j<numVectors; j++) - delete[] D[j]; - } - - if(DD!=NULL){ - for(j=0; j<numVectors; j++) - delete[] DD[j]; - } - } - // per-track reset array values - for(unsigned k=0; k<pointNN; k++){ - distances[k]=1.0e6; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - } - - free(data_buffer); - - gettimeofday(&tv2,NULL); - if(verbosity>1) { - cerr << endl << "processed tracks :" << processedTracks << " matched tracks: " << successfulTracks << " elapsed time:" - << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << endl; - cerr << "sampleCount: " << sampleCount << " sampleSum: " << sampleSum << " logSampleSum: " << logSampleSum - << " minSample: " << minSample << " maxSample: " << maxSample << endl; - } - if(adbQueryResponse==0){ - if(verbosity>1) { - cerr<<endl; - } - // Output answer - // Loop over nearest neighbours - for(k=0; k < min(trackNN,successfulTracks); k++) - cout << fileTable+trackIDs[k]*O2_FILETABLESIZE << " " << trackDistances[k] << " " - << trackQIndexes[k] << " " << trackSIndexes[k] << endl; - } - else{ // Process Web Services Query - int listLen = min(trackNN, processedTracks); - adbQueryResponse->result.__sizeRlist=listLen; - adbQueryResponse->result.__sizeDist=listLen; - adbQueryResponse->result.__sizeQpos=listLen; - adbQueryResponse->result.__sizeSpos=listLen; - adbQueryResponse->result.Rlist= new char*[listLen]; - adbQueryResponse->result.Dist = new double[listLen]; - adbQueryResponse->result.Qpos = new unsigned int[listLen]; - adbQueryResponse->result.Spos = new unsigned int[listLen]; - for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ - adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; - adbQueryResponse->result.Dist[k]=trackDistances[k]; - adbQueryResponse->result.Qpos[k]=trackQIndexes[k]; - adbQueryResponse->result.Spos[k]=trackSIndexes[k]; - sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+trackIDs[k]*O2_FILETABLESIZE); - } - } - - // Clean up - if(trackOffsetTable) - delete[] trackOffsetTable; - if(queryCopy) - delete[] queryCopy; - if(qNorm) - delete[] qNorm; - if(sNorm) - delete[] sNorm; - if(qPower) - delete[] qPower; - if(sPower) - delete[] sPower; - if(D) - delete[] D; - if(DD) - delete[] DD; - if(timesdata) - delete[] timesdata; - if(querydurs) - delete[] querydurs; - if(meanDBdur) - delete[] meanDBdur; -} - -// Radius search between query and target tracks -// efficient implementation based on matched filter -// assumes normed shingles -// outputs count of retrieved shingles, max retreived = one shingle per query shingle per track -void audioDB::trackSequenceQueryRad(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ - - initTables(dbName, inFile); - - // For each input vector, find the closest pointNN matching output vectors and report - // we use stdout in this stub version - unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); - double* query = (double*)(indata+sizeof(int)); - double* queryCopy = 0; - - if(!(dbH->flags & O2_FLAG_L2NORM) ) - error("Database must be L2 normed for sequence query","use -l2norm"); - - if(verbosity>1) { - cerr << "performing norms ... "; cerr.flush(); - } - unsigned dbVectors = dbH->length/(sizeof(double)*dbH->dim); - - // Make a copy of the query - queryCopy = new double[numVectors*dbH->dim]; - memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); - qNorm = new double[numVectors]; - sNorm = new double[dbVectors]; - assert(qNorm&&sNorm&&queryCopy&&sequenceLength); - unitNorm(queryCopy, dbH->dim, numVectors, qNorm); - query = queryCopy; - - // Make norm measurements relative to sequenceLength - unsigned w = sequenceLength-1; - unsigned i,j; - - // Copy the L2 norm values to core to avoid disk random access later on - memcpy(sNorm, l2normTable, dbVectors*sizeof(double)); - double* snPtr = sNorm; - double* qnPtr = qNorm; - - double *sPower = 0, *qPower = 0; - double *spPtr = 0, *qpPtr = 0; - - if (usingPower) { - if(!(dbH->flags & O2_FLAG_POWER)) { - error("database not power-enabled", dbName); - } - sPower = new double[dbVectors]; - spPtr = sPower; - memcpy(sPower, powerTable, dbVectors * sizeof(double)); - } - - for(i=0; i<dbH->numFiles; i++){ - if(trackTable[i]>=sequenceLength) { - sequence_sum(snPtr, trackTable[i], sequenceLength); - sequence_sqrt(snPtr, trackTable[i], sequenceLength); - if (usingPower) { - sequence_sum(spPtr, trackTable[i], sequenceLength); - sequence_average(spPtr, trackTable[i], sequenceLength); - } - } - snPtr += trackTable[i]; - if (usingPower) { - spPtr += trackTable[i]; - } - } - - sequence_sum(qnPtr, numVectors, sequenceLength); - sequence_sqrt(qnPtr, numVectors, sequenceLength); - - if (usingPower) { - qPower = new double[numVectors]; - qpPtr = qPower; - if (lseek(powerfd, sizeof(int), SEEK_SET) == (off_t) -1) { - error("error seeking to data", powerFileName, "lseek"); - } - int count = read(powerfd, qPower, numVectors * sizeof(double)); - if (count == -1) { - error("error reading data", powerFileName, "read"); - } - if ((unsigned) count != numVectors * sizeof(double)) { - error("short read", powerFileName); - } - - sequence_sum(qpPtr, numVectors, sequenceLength); - sequence_average(qpPtr, numVectors, sequenceLength); - } - - if(verbosity>1) { - cerr << "done." << endl; - } - - if(verbosity>1) { - cerr << "matching tracks..." << endl; - } - - assert(pointNN>0 && pointNN<=O2_MAXNN); - assert(trackNN>0 && trackNN<=O2_MAXNN); - - // Make temporary dynamic memory for results - double trackDistances[trackNN]; - unsigned trackIDs[trackNN]; - unsigned trackQIndexes[trackNN]; - unsigned trackSIndexes[trackNN]; - - double distances[pointNN]; - unsigned qIndexes[pointNN]; - unsigned sIndexes[pointNN]; - - - unsigned k,l,n,track,trackOffset=0, HOP_SIZE=sequenceHop, wL=sequenceLength; - double thisDist; - - for(k=0; k<pointNN; k++){ - distances[k]=0.0; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - - for(k=0; k<trackNN; k++){ - trackDistances[k]=0.0; - trackQIndexes[k]=~0; - trackSIndexes[k]=~0; - trackIDs[k]=~0; - } - - // Timestamp and durations processing - double meanQdur = 0; - double *timesdata = 0; - double *querydurs = 0; - double *meanDBdur = 0; - - if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ - cerr << "warning: ignoring query timestamps for non-timestamped database" << endl; - usingTimes=0; - } - - else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) - cerr << "warning: no timestamps given for query. Ignoring database timestamps." << endl; - - else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ - timesdata = new double[2*numVectors]; - querydurs = new double[numVectors]; - - insertTimeStamps(numVectors, timesFile, timesdata); - // Calculate durations of points - for(k=0; k<numVectors-1; k++){ - querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; - meanQdur += querydurs[k]; - } - meanQdur/=k; - if(verbosity>1) { - cerr << "mean query file duration: " << meanQdur << endl; - } - meanDBdur = new double[dbH->numFiles]; - assert(meanDBdur); - for(k=0; k<dbH->numFiles; k++){ - meanDBdur[k]=0.0; - for(j=0; j<trackTable[k]-1 ; j++) { - meanDBdur[k]+=timesTable[2*j+1]-timesTable[2*j]; - } - meanDBdur[k]/=j; - } - } - - if(usingQueryPoint) - if(queryPoint>numVectors || queryPoint>numVectors-wL+1) - error("queryPoint > numVectors-wL+1 in query"); - else{ - if(verbosity>1) { - cerr << "query point: " << queryPoint << endl; cerr.flush(); - } - query = query + queryPoint*dbH->dim; - qnPtr = qnPtr + queryPoint; - if (usingPower) { - qpPtr = qpPtr + queryPoint; - } - numVectors=wL; - } - - double ** D = 0; // Differences query and target - double ** DD = 0; // Matched filter distance - - D = new double*[numVectors]; - assert(D); - DD = new double*[numVectors]; - assert(DD); - - gettimeofday(&tv1, NULL); - unsigned processedTracks = 0; - unsigned successfulTracks=0; - - double* qp; - double* sp; - double* dp; - - // build track offset table - off_t *trackOffsetTable = new off_t[dbH->numFiles]; - unsigned cumTrack=0; - off_t trackIndexOffset; - for(k=0; k<dbH->numFiles;k++){ - trackOffsetTable[k]=cumTrack; - cumTrack+=trackTable[k]*dbH->dim; - } - - char nextKey [MAXSTR]; - - // chi^2 statistics - double sampleCount = 0; - double sampleSum = 0; - double logSampleSum = 0; - double minSample = 1e9; - double maxSample = 0; - - // Track loop - size_t data_buffer_size = 0; - double *data_buffer = 0; - lseek(dbfid, dbH->dataOffset, SEEK_SET); - - for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ - - trackOffset = trackOffsetTable[track]; // numDoubles offset - - // get trackID from file if using a control file - if(trackFile) { - trackFile->getline(nextKey,MAXSTR); - if(!trackFile->eof()) { - track = getKeyPos(nextKey); - trackOffset = trackOffsetTable[track]; - lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); - } else { - break; - } - } - - trackIndexOffset=trackOffset/dbH->dim; // numVectors offset - - if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { - if(data_buffer) { - free(data_buffer); - } - { - data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; - void *tmp = malloc(data_buffer_size); - if (tmp == NULL) { - error("error allocating data buffer"); - } - data_buffer = (double *) tmp; - } - } - - read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); - - if(sequenceLength<=trackTable[track]){ // test for short sequences - - if(verbosity>7) { - cerr << track << "." << trackIndexOffset << "." << trackTable[track] << " | ";cerr.flush(); - } - - // Sum products matrix - for(j=0; j<numVectors;j++){ - D[j]=new double[trackTable[track]]; - assert(D[j]); - - } - - // Matched filter matrix - for(j=0; j<numVectors;j++){ - DD[j]=new double[trackTable[track]]; - assert(DD[j]); - } - - // Dot product - for(j=0; j<numVectors; j++) - for(k=0; k<trackTable[track]; k++){ - qp=query+j*dbH->dim; - sp=data_buffer+k*dbH->dim; - DD[j][k]=0.0; // Initialize matched filter array - dp=&D[j][k]; // point to correlation cell j,k - *dp=0.0; // initialize correlation cell - l=dbH->dim; // size of vectors - while(l--) - *dp+=*qp++**sp++; - } - - // Matched Filter - // HOP SIZE == 1 - double* spd; - if(HOP_SIZE==1){ // HOP_SIZE = shingleHop - for(w=0; w<wL; w++) - for(j=0; j<numVectors-w; j++){ - sp=DD[j]; - spd=D[j+w]+w; - k=trackTable[track]-w; - while(k--) - *sp+++=*spd++; - } - } - - else{ // HOP_SIZE != 1 - for(w=0; w<wL; w++) - for(j=0; j<numVectors-w; j+=HOP_SIZE){ - sp=DD[j]; - spd=D[j+w]+w; - for(k=0; k<trackTable[track]-w; k+=HOP_SIZE){ - *sp+=*spd; - sp+=HOP_SIZE; - spd+=HOP_SIZE; - } - } - } - - if(verbosity>3 && usingTimes) { - cerr << "meanQdur=" << meanQdur << " meanDBdur=" << meanDBdur[track] << endl; - cerr.flush(); - } - - if(!usingTimes || - (usingTimes - && fabs(meanDBdur[track]-meanQdur)<meanQdur*timesTol)){ - - if(verbosity>3 && usingTimes) { - cerr << "within duration tolerance." << endl; - cerr.flush(); - } - - // Search for minimum distance by shingles (concatenated vectors) - for(j=0;j<=numVectors-wL;j+=HOP_SIZE) - for(k=0;k<=trackTable[track]-wL;k+=HOP_SIZE){ - thisDist=2-(2/(qnPtr[j]*sNorm[trackIndexOffset+k]))*DD[j][k]; - if(verbosity>9) { - cerr << thisDist << " " << qnPtr[j] << " " << sNorm[trackIndexOffset+k] << endl; - } - // Gather chi^2 statistics - if(thisDist<minSample) - minSample=thisDist; - else if(thisDist>maxSample) - maxSample=thisDist; - if(thisDist>1e-9){ - sampleCount++; - sampleSum+=thisDist; - logSampleSum+=log(thisDist); - } - - // diffL2 = fabs(qnPtr[j] - sNorm[trackIndexOffset+k]); - // Power test - if (usingPower) { - if (!(powers_acceptable(qpPtr[j], sPower[trackIndexOffset + k]))) { - thisDist = 1000000.0; - } - } - - if(thisDist>=0 && thisDist<=radius){ - distances[0]++; // increment count - break; // only need one track point per query point - } - } - // How many points were below threshold ? - thisDist=distances[0]; - - // Let's see the distances then... - if(verbosity>3) { - cerr << fileTable+track*O2_FILETABLESIZE << " " << thisDist << endl; - } - - // All the track stuff goes here - n=trackNN; - while(n--){ - if(thisDist>trackDistances[n]){ - if((n==0 || thisDist<=trackDistances[n-1])){ - // Copy all values above up the queue - for( l=trackNN-1 ; l > n ; l--){ - trackDistances[l]=trackDistances[l-1]; - trackQIndexes[l]=trackQIndexes[l-1]; - trackSIndexes[l]=trackSIndexes[l-1]; - trackIDs[l]=trackIDs[l-1]; - } - trackDistances[n]=thisDist; - trackQIndexes[n]=qIndexes[0]; - trackSIndexes[n]=sIndexes[0]; - successfulTracks++; - trackIDs[n]=track; - break; - } - } - else - break; - } - } // Duration match - - // Clean up current track - if(D!=NULL){ - for(j=0; j<numVectors; j++) - delete[] D[j]; - } - - if(DD!=NULL){ - for(j=0; j<numVectors; j++) - delete[] DD[j]; - } - } - // per-track reset array values - for(unsigned k=0; k<pointNN; k++){ - distances[k]=0.0; - qIndexes[k]=~0; - sIndexes[k]=~0; - } - } - - free(data_buffer); - - gettimeofday(&tv2,NULL); - if(verbosity>1) { - cerr << endl << "processed tracks :" << processedTracks << " matched tracks: " << successfulTracks << " elapsed time:" - << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << endl; - cerr << "sampleCount: " << sampleCount << " sampleSum: " << sampleSum << " logSampleSum: " << logSampleSum - << " minSample: " << minSample << " maxSample: " << maxSample << endl; - } - - if(adbQueryResponse==0){ - if(verbosity>1) { - cerr<<endl; - } - // Output answer - // Loop over nearest neighbours - for(k=0; k < min(trackNN,successfulTracks); k++) - cout << fileTable+trackIDs[k]*O2_FILETABLESIZE << " " << trackDistances[k] << endl; - } - else{ // Process Web Services Query - int listLen = min(trackNN, processedTracks); - adbQueryResponse->result.__sizeRlist=listLen; - adbQueryResponse->result.__sizeDist=listLen; - adbQueryResponse->result.__sizeQpos=listLen; - adbQueryResponse->result.__sizeSpos=listLen; - adbQueryResponse->result.Rlist= new char*[listLen]; - adbQueryResponse->result.Dist = new double[listLen]; - adbQueryResponse->result.Qpos = new unsigned int[listLen]; - adbQueryResponse->result.Spos = new unsigned int[listLen]; - for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ - adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; - adbQueryResponse->result.Dist[k]=trackDistances[k]; - adbQueryResponse->result.Qpos[k]=trackQIndexes[k]; - adbQueryResponse->result.Spos[k]=trackSIndexes[k]; - sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+trackIDs[k]*O2_FILETABLESIZE); - } - } - - // Clean up - if(trackOffsetTable) - delete[] trackOffsetTable; - if(queryCopy) - delete[] queryCopy; - if(qNorm) - delete[] qNorm; - if(sNorm) - delete[] sNorm; - if(qPower) - delete[] qPower; - if(sPower) - delete[] sPower; - if(D) - delete[] D; - if(DD) - delete[] DD; - if(timesdata) - delete[] timesdata; - if(querydurs) - delete[] querydurs; - if(meanDBdur) - delete[] meanDBdur; -} - -// Unit norm block of features -void audioDB::unitNorm(double* X, unsigned dim, unsigned n, double* qNorm){ - unsigned d; - double L2, *p; - if(verbosity>2) { - cerr << "norming " << n << " vectors...";cerr.flush(); - } - while(n--){ - p=X; - L2=0.0; - d=dim; - while(d--){ - L2+=*p**p; - p++; - } - /* L2=sqrt(L2);*/ - if(qNorm) - *qNorm++=L2; - /* - oneOverL2 = 1.0/L2; - d=dim; - while(d--){ - *X*=oneOverL2; - X++; - */ - X+=dim; - } - if(verbosity>2) { - cerr << "done..." << endl; - } -} - -// Unit norm block of features +/* FIXME: in fact this does not unit norm a block of features, it just + records the L2 norms somewhere. unitNorm() does in fact unit norm + a block of features. */ void audioDB::unitNormAndInsertL2(double* X, unsigned dim, unsigned n, unsigned append=0){ unsigned d; double *p; @@ -2778,9 +427,7 @@ if( !append && (dbH->flags & O2_FLAG_L2NORM) ) error("Database is already L2 normed", "automatic norm on insert is enabled"); - if(verbosity>2) { - cerr << "norming " << n << " vectors...";cerr.flush(); - } + VERB_LOG(2, "norming %u vectors...", n); double* l2buf = new double[n]; double* l2ptr = l2buf; @@ -2811,194 +458,7 @@ memcpy(l2normTable+offset, l2buf, n*sizeof(double)); if(l2buf) delete[] l2buf; - if(verbosity>2) { - cerr << "done..." << endl; - } -} - - -// Start an audioDB server on the host -void audioDB::startServer(){ - struct soap soap; - int m, s; // master and slave sockets - soap_init(&soap); - // FIXME: largely this use of SO_REUSEADDR is to make writing (and - // running) test cases more convenient, so that multiple test runs - // in close succession don't fail because of a bin() error. - // Investigate whether there are any potential drawbacks in this, - // and also whether there's a better way to write the tests. -- - // CSR, 2007-10-03 - soap.bind_flags |= SO_REUSEADDR; - m = soap_bind(&soap, NULL, port, 100); - if (m < 0) - soap_print_fault(&soap, stderr); - else - { - fprintf(stderr, "Socket connection successful: master socket = %d\n", m); - for (int i = 1; ; i++) - { - s = soap_accept(&soap); - if (s < 0) - { - soap_print_fault(&soap, stderr); - break; - } - fprintf(stderr, "%d: accepted connection from IP=%lu.%lu.%lu.%lu socket=%d\n", i, - (soap.ip >> 24)&0xFF, (soap.ip >> 16)&0xFF, (soap.ip >> 8)&0xFF, soap.ip&0xFF, s); - if (soap_serve(&soap) != SOAP_OK) // process RPC request - soap_print_fault(&soap, stderr); // print error - fprintf(stderr, "request served\n"); - soap_destroy(&soap); // clean up class instances - soap_end(&soap); // clean up everything and close socket - } - } - soap_done(&soap); // close master socket and detach environment -} - - -// web services - -// SERVER SIDE -int adb__status(struct soap* soap, xsd__string dbName, adb__statusResponse &adbStatusResponse){ - char* const argv[]={"audioDB",COM_STATUS,"-d",dbName}; - const unsigned argc = 4; - try { - audioDB(argc, argv, &adbStatusResponse); - return SOAP_OK; - } catch(char *err) { - soap_receiver_fault(soap, err, ""); - return SOAP_FAULT; - } -} - -// Literal translation of command line to web service - -int adb__query(struct soap* soap, xsd__string dbName, xsd__string qKey, xsd__string keyList, xsd__string timesFileName, xsd__int qType, xsd__int qPos, xsd__int pointNN, xsd__int trackNN, xsd__int seqLen, adb__queryResponse &adbQueryResponse){ - char queryType[256]; - for(int k=0; k<256; k++) - queryType[k]='\0'; - if(qType == O2_POINT_QUERY) - strncpy(queryType, "point", strlen("point")); - else if (qType == O2_SEQUENCE_QUERY) - strncpy(queryType, "sequence", strlen("sequence")); - else if(qType == O2_TRACK_QUERY) - strncpy(queryType,"track", strlen("track")); - else - strncpy(queryType, "", strlen("")); - - if(pointNN==0) - pointNN=10; - if(trackNN==0) - trackNN=10; - if(seqLen==0) - seqLen=16; - - char qPosStr[256]; - sprintf(qPosStr, "%d", qPos); - char pointNNStr[256]; - sprintf(pointNNStr,"%d",pointNN); - char trackNNStr[256]; - sprintf(trackNNStr,"%d",trackNN); - char seqLenStr[256]; - sprintf(seqLenStr,"%d",seqLen); - - const char* argv[] ={ - "./audioDB", - COM_QUERY, - queryType, // Need to pass a parameter - COM_DATABASE, - ENSURE_STRING(dbName), - COM_FEATURES, - ENSURE_STRING(qKey), - COM_KEYLIST, - ENSURE_STRING(keyList), - COM_TIMES, - ENSURE_STRING(timesFileName), - COM_QPOINT, - qPosStr, - COM_POINTNN, - pointNNStr, - COM_TRACKNN, - trackNNStr, // Need to pass a parameter - COM_SEQLEN, - seqLenStr - }; - - const unsigned argc = 19; - try { - audioDB(argc, (char* const*)argv, &adbQueryResponse); - return SOAP_OK; - } catch (char *err) { - soap_receiver_fault(soap, err, ""); - return SOAP_FAULT; - } -} - -int adb__sequenceQuery(struct soap* soap, xsd__string dbName, xsd__string qKey, - adb__sequenceQueryParms *parms, - adb__queryResponse &adbQueryResponse) { - - char qPosStr[256]; - char pointNNStr[256]; - char trackNNStr[256]; - char seqLenStr[256]; - char relative_thresholdStr[256]; - char absolute_thresholdStr[256]; - - /* When the branch is merged, move this to a header and use it - elsewhere */ -#define INTSTRINGIFY(val, str) \ - snprintf(str, 256, "%d", val); -#define DOUBLESTRINGIFY(val, str) \ - snprintf(str, 256, "%f", val); - - INTSTRINGIFY(parms->qPos, qPosStr); - INTSTRINGIFY(parms->pointNN, pointNNStr); - INTSTRINGIFY(parms->segNN, trackNNStr); - /* FIXME: decide which of segLen and seqLen should live */ - INTSTRINGIFY(parms->segLen, seqLenStr); - - DOUBLESTRINGIFY(parms->relative_threshold, relative_thresholdStr); - DOUBLESTRINGIFY(parms->absolute_threshold, absolute_thresholdStr); - - const char *argv[] = { - "./audioDB", - COM_QUERY, - "sequence", - COM_DATABASE, - dbName, - COM_FEATURES, - qKey, - COM_KEYLIST, - /* FIXME: when this branch is merged, use ENSURE_STRING */ - parms->keyList==0?"":parms->keyList, - COM_TIMES, - parms->timesFileName==0?"":parms->timesFileName, - COM_QUERYPOWER, - parms->powerFileName==0?"":parms->powerFileName, - COM_QPOINT, - qPosStr, - COM_POINTNN, - pointNNStr, - COM_TRACKNN, - trackNNStr, - COM_SEQLEN, - seqLenStr, - COM_RELATIVE_THRESH, - relative_thresholdStr, - COM_ABSOLUTE_THRESH, - absolute_thresholdStr - }; - - const unsigned argc = 25; - - try { - audioDB(argc, (char* const*)argv, &adbQueryResponse); - return SOAP_OK; - } catch (char *err) { - soap_receiver_fault(soap, err, ""); - return SOAP_FAULT; - } + VERB_LOG(2, " done."); } int main(const unsigned argc, char* const argv[]){
--- a/audioDB.h Wed Dec 05 14:11:04 2007 +0000 +++ b/audioDB.h Fri Dec 14 14:41:37 2007 +0000 @@ -15,7 +15,6 @@ // includes for web services #include "soapH.h" -#include "adb.nsmap" #include "cmdline.h" #define MAXSTR 512 @@ -56,10 +55,8 @@ #define O2_DEFAULT_TRACKNN (10U) #define O2_DEFAULTDBSIZE (2000000000) // 2GB table size -//#define O2_DEFAULTDBSIZE (1000000000U) // 1GB table size -//#define O2_MAXFILES (1000000) -#define O2_MAXFILES (20000U) // 10,000 files +#define O2_MAXFILES (20000U) #define O2_MAXFILESTR (256U) #define O2_FILETABLESIZE (O2_MAXFILESTR) #define O2_TRACKTABLESIZE (sizeof(unsigned)) @@ -93,7 +90,19 @@ #define ENSURE_STRING(x) ((x) ? (x) : "") -using namespace std; +#define CHECKED_MMAP(type, var, start, length) \ + { void *tmp = mmap(0, length, (PROT_READ | (forWrite ? PROT_WRITE : 0)), MAP_SHARED, dbfid, (start)); \ + if(tmp == (void *) -1) { \ + error("mmap error for db table", #var, "mmap"); \ + } \ + var = (type) tmp; \ + } + +#define VERB_LOG(vv, ...) \ + if(verbosity > vv) { \ + fprintf(stderr, __VA_ARGS__); \ + fflush(stderr); \ + } typedef struct dbTableHeader { uint32_t magic; @@ -112,6 +121,7 @@ off_t dbSize; } dbTableHeaderT, *dbTableHeaderPtr; +class Reporter; class audioDB{ @@ -123,13 +133,13 @@ const char *hostport; const char *key; const char* trackFileName; - ifstream *trackFile; + std::ifstream *trackFile; const char *command; const char *output; const char *timesFileName; - ifstream *timesFile; + std::ifstream *timesFile; const char *powerFileName; - ifstream *powerFile; + std::ifstream *powerFile; int powerfd; int dbfid; @@ -145,8 +155,6 @@ double* dataBuf; double* inBuf; double* l2normTable; - double* qNorm; - double* sNorm; double* timesTable; double* powerTable; @@ -165,6 +173,7 @@ unsigned trackNN; // how many track NNs ? unsigned sequenceLength; unsigned sequenceHop; + bool normalizedDistance; unsigned queryPoint; unsigned usingQueryPoint; unsigned usingTimes; @@ -187,21 +196,23 @@ // private methods void error(const char* a, const char* b = "", const char *sysFunc = 0); - void pointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); - void trackPointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); void sequence_sum(double *buffer, int length, int seqlen); void sequence_sqrt(double *buffer, int length, int seqlen); void sequence_average(double *buffer, int length, int seqlen); - void trackSequenceQueryNN(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); - void trackSequenceQueryRad(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); + void initialize_arrays(int track, unsigned int numVectors, double *query, double *data_buffer, double **D, double **DD); + void delete_arrays(int track, unsigned int numVectors, double **D, double **DD); + void read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p); + void set_up_query(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned int *nvp); + void set_up_db(double **snp, double **vsnp, double **spp, double **vspp, double **mddp, unsigned int *dvp); + void query_loop(const char* dbName, const char* inFile, Reporter *reporter); void initDBHeader(const char *dbName); void initInputFile(const char *inFile); void initTables(const char* dbName, const char* inFile); void unitNorm(double* X, unsigned d, unsigned n, double* qNorm); void unitNormAndInsertL2(double* X, unsigned dim, unsigned n, unsigned append); - void insertTimeStamps(unsigned n, ifstream* timesFile, double* timesdata); + void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata); void insertPowerData(unsigned n, int powerfd, double *powerdata); unsigned getKeyPos(char* key); public: @@ -258,7 +269,6 @@ trackTable(0), \ dataBuf(0), \ l2normTable(0), \ - qNorm(0), \ timesTable(0), \ fileTableLength(0), \ trackTableLength(0), \ @@ -273,6 +283,7 @@ trackNN(O2_DEFAULT_TRACKNN), \ sequenceLength(16), \ sequenceHop(1), \ + normalizedDistance(true), \ queryPoint(0), \ usingQueryPoint(0), \ usingTimes(0), \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/common.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,184 @@ +#include "audioDB.h" + +#if defined(O2_DEBUG) +void sigterm_action(int signal, siginfo_t *info, void *context) { + exit(128+signal); +} + +void sighup_action(int signal, siginfo_t *info, void *context) { + // FIXME: reread any configuration files +} +#endif + +void audioDB::get_lock(int fd, bool exclusive) { + struct flock lock; + int status; + + lock.l_type = exclusive ? F_WRLCK : F_RDLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; /* "the whole file" */ + + retry: + do { + status = fcntl(fd, F_SETLKW, &lock); + } while (status != 0 && errno == EINTR); + + if (status) { + if (errno == EAGAIN) { + sleep(1); + goto retry; + } else { + error("fcntl lock error", "", "fcntl"); + } + } +} + +void audioDB::release_lock(int fd) { + struct flock lock; + int status; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + + status = fcntl(fd, F_SETLKW, &lock); + + if (status) + error("fcntl unlock error", "", "fcntl"); +} + +void audioDB::error(const char* a, const char* b, const char *sysFunc) { + if(isServer) { + /* FIXME: I think this is leaky -- we never delete err. actually + deleting it is tricky, though; it gets placed into some + soap-internal struct with uncertain extent... -- CSR, + 2007-10-01 */ + char *err = new char[256]; /* FIXME: overflows */ + snprintf(err, 255, "%s: %s\n%s", a, b, sysFunc ? strerror(errno) : ""); + /* FIXME: actually we could usefully do with a properly structured + type, so that we can throw separate faultstring and details. + -- CSR, 2007-10-01 */ + throw(err); + } else { + std::cerr << a << ": " << b << std::endl; + if (sysFunc) { + perror(sysFunc); + } + exit(1); + } +} + +void audioDB::initDBHeader(const char* dbName) { + if ((dbfid = open(dbName, forWrite ? O_RDWR : O_RDONLY)) < 0) { + error("Can't open database file", dbName, "open"); + } + + get_lock(dbfid, forWrite); + // Get the database header info + dbH = new dbTableHeaderT(); + assert(dbH); + + if(read(dbfid, (char *) dbH, O2_HEADERSIZE) != O2_HEADERSIZE) { + error("error reading db header", dbName, "read"); + } + + if(dbH->magic == O2_OLD_MAGIC) { + // FIXME: if anyone ever complains, write the program to convert + // from the old audioDB format to the new... + error("database file has old O2 header", dbName); + } + + if(dbH->magic != O2_MAGIC) { + std::cerr << "expected: " << O2_MAGIC << ", got: " << dbH->magic << std::endl; + error("database file has incorrect header", dbName); + } + + if(dbH->version != O2_FORMAT_VERSION) { + error("database file has incorrect version", dbName); + } + + if(dbH->headerSize != O2_HEADERSIZE) { + error("sizeof(dbTableHeader) unexpected: platform ABI mismatch?", dbName); + } + + CHECKED_MMAP(char *, db, 0, getpagesize()); + + // Make some handy tables with correct types + if(forWrite || (dbH->length > 0)) { + if(forWrite) { + fileTableLength = dbH->trackTableOffset - dbH->fileTableOffset; + trackTableLength = dbH->dataOffset - dbH->trackTableOffset; + dataBufLength = dbH->timesTableOffset - dbH->dataOffset; + timesTableLength = dbH->powerTableOffset - dbH->timesTableOffset; + powerTableLength = dbH->l2normTableOffset - dbH->powerTableOffset; + l2normTableLength = dbH->dbSize - dbH->l2normTableOffset; + } else { + fileTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_FILETABLESIZE); + trackTableLength = ALIGN_PAGE_UP(dbH->numFiles * O2_TRACKTABLESIZE); + dataBufLength = ALIGN_PAGE_UP(dbH->length); + timesTableLength = ALIGN_PAGE_UP(2*(dbH->length / dbH->dim)); + powerTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + l2normTableLength = ALIGN_PAGE_UP(dbH->length / dbH->dim); + } + CHECKED_MMAP(char *, fileTable, dbH->fileTableOffset, fileTableLength); + CHECKED_MMAP(unsigned *, trackTable, dbH->trackTableOffset, trackTableLength); + /* + * No more mmap() for dataBuf + * + * FIXME: Actually we do do the mmap() in the two cases where it's + * still "needed": in pointQuery and in l2norm if dbH->length is + * non-zero. Removing those cases too (and deleting the dataBuf + * variable completely) would be cool. -- CSR, 2007-11-19 + * + * CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); + */ + CHECKED_MMAP(double *, timesTable, dbH->timesTableOffset, timesTableLength); + CHECKED_MMAP(double *, powerTable, dbH->powerTableOffset, powerTableLength); + CHECKED_MMAP(double *, l2normTable, dbH->l2normTableOffset, l2normTableLength); + } +} + +void audioDB::initInputFile (const char *inFile) { + if (inFile) { + if ((infid = open(inFile, O_RDONLY)) < 0) { + error("can't open input file for reading", inFile, "open"); + } + + if (fstat(infid, &statbuf) < 0) { + error("fstat error finding size of input", inFile, "fstat"); + } + + if(dbH->dim == 0 && dbH->length == 0) { // empty database + // initialize with input dimensionality + if(read(infid, &dbH->dim, sizeof(unsigned)) != sizeof(unsigned)) { + error("short read of input file", inFile); + } + if(dbH->dim == 0) { + error("dimensionality of zero in input file", inFile); + } + } else { + unsigned test; + if(read(infid, &test, sizeof(unsigned)) != sizeof(unsigned)) { + error("short read of input file", inFile); + } + if(dbH->dim == 0) { + error("dimensionality of zero in input file", inFile); + } + if(dbH->dim != test) { + std::cerr << "error: expected dimension: " << dbH->dim << ", got : " << test <<std::endl; + error("feature dimensions do not match database table dimensions", inFile); + } + } + + if ((indata = (char *) mmap(0, statbuf.st_size, PROT_READ, MAP_SHARED, infid, 0)) == (caddr_t) -1) { + error("mmap error for input", inFile, "mmap"); + } + } +} + +void audioDB::initTables(const char* dbName, const char* inFile = 0) { + initDBHeader(dbName); + initInputFile(inFile); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,61 @@ +#include "audioDB.h" + +/* Make a new database. + + The database consists of: + + * a header (see dbTableHeader struct definition); + * keyTable: list of keys of tracks; + * trackTable: Maps implicit feature index to a feature vector + matrix (sizes of tracks) + * featureTable: Lots of doubles; + * timesTable: (start,end) time points for each feature vector; + * powerTable: associated power for each feature vector; + * l2normTable: squared l2norms for each feature vector. +*/ + +void audioDB::create(const char* dbName){ + if ((dbfid = open (dbName, O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) + error("Can't create database file", dbName, "open"); + get_lock(dbfid, 1); + + VERB_LOG(0, "header size: %ju\n", (intmax_t) O2_HEADERSIZE); + + dbH = new dbTableHeaderT(); + assert(dbH); + + unsigned int maxfiles = (unsigned int) rint((double) O2_MAXFILES * (double) size / (double) O2_DEFAULTDBSIZE); + + // Initialize header + dbH->magic = O2_MAGIC; + dbH->version = O2_FORMAT_VERSION; + dbH->numFiles = 0; + dbH->dim = 0; + dbH->flags = 0; + dbH->headerSize = O2_HEADERSIZE; + dbH->length = 0; + dbH->fileTableOffset = ALIGN_PAGE_UP(O2_HEADERSIZE); + dbH->trackTableOffset = ALIGN_PAGE_UP(dbH->fileTableOffset + O2_FILETABLESIZE*maxfiles); + dbH->dataOffset = ALIGN_PAGE_UP(dbH->trackTableOffset + O2_TRACKTABLESIZE*maxfiles); + dbH->l2normTableOffset = ALIGN_PAGE_DOWN(size - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); + dbH->powerTableOffset = ALIGN_PAGE_DOWN(dbH->l2normTableOffset - maxfiles*O2_MEANNUMVECTORS*sizeof(double)); + dbH->timesTableOffset = ALIGN_PAGE_DOWN(dbH->powerTableOffset - 2*maxfiles*O2_MEANNUMVECTORS*sizeof(double)); + dbH->dbSize = size; + + write(dbfid, dbH, O2_HEADERSIZE); + + // go to the location corresponding to the last byte + if (lseek (dbfid, size - 1, SEEK_SET) == -1) + error("lseek error in db file", "", "lseek"); + + // write a dummy byte at the last location + if (write (dbfid, "", 1) != 1) + error("write error", "", "write"); + + VERB_LOG(0, "%s %s\n", COM_CREATE, dbName); +} + +void audioDB::drop(){ + // FIXME: drop something? Should we even allow this? +} +
--- a/debian/changelog Wed Dec 05 14:11:04 2007 +0000 +++ b/debian/changelog Fri Dec 14 14:41:37 2007 +0000 @@ -1,3 +1,10 @@ +audiodb (1.0-20) unstable; urgency=low + + * Updated to svn version #313 + * Includes refactoring branch changes -r267:310 + + -- Christophe Rhodes <c.rhodes@gold.ac.uk> Thu, 13 Dec 2007 14:42:17 +0000 + audiodb (1.0-19) unstable; urgency=low * Updated to svn version #289
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dump.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,177 @@ +#include "audioDB.h" + +void audioDB::dump(const char* dbName){ + if(!dbH) { + initTables(dbName, 0); + } + + if((mkdir(output, S_IRWXU|S_IRWXG|S_IRWXO)) < 0) { + error("error making output directory", output, "mkdir"); + } + + char *cwd = new char[PATH_MAX]; + + if ((getcwd(cwd, PATH_MAX)) == 0) { + error("error getting working directory", "", "getcwd"); + } + + if((chdir(output)) < 0) { + error("error changing working directory", output, "chdir"); + } + + int fLfd, tLfd = 0, pLfd = 0, kLfd; + FILE *fLFile, *tLFile = 0, *pLFile = 0, *kLFile; + + if ((fLfd = open("featureList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { + error("error creating featureList file", "featureList.txt", "open"); + } + + int times = dbH->flags & O2_FLAG_TIMES; + if (times) { + if ((tLfd = open("timesList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { + error("error creating timesList file", "timesList.txt", "open"); + } + } + + int power = dbH->flags & O2_FLAG_POWER; + if (power) { + if ((pLfd = open("powerList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { + error("error creating powerList file", "powerList.txt", "open"); + } + } + + if ((kLfd = open("keyList.txt", O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { + error("error creating keyList file", "keyList.txt", "open"); + } + + /* can these fail? I sincerely hope not. */ + fLFile = fdopen(fLfd, "w"); + if (times) { + tLFile = fdopen(tLfd, "w"); + } + if (power) { + pLFile = fdopen(pLfd, "w"); + } + kLFile = fdopen(kLfd, "w"); + + char *fName = new char[256]; + int ffd, pfd; + FILE *tFile; + unsigned pos = 0; + lseek(dbfid, dbH->dataOffset, SEEK_SET); + double *data_buffer; + size_t data_buffer_size; + for(unsigned k = 0; k < dbH->numFiles; k++) { + fprintf(kLFile, "%s\n", fileTable + k*O2_FILETABLESIZE); + snprintf(fName, 256, "%05d.features", k); + if ((ffd = open(fName, O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { + error("error creating feature file", fName, "open"); + } + if ((write(ffd, &dbH->dim, sizeof(uint32_t))) < 0) { + error("error writing dimensions", fName, "write"); + } + + /* FIXME: this repeated malloc()/free() of data buffers is + inefficient. */ + data_buffer_size = trackTable[k] * dbH->dim * sizeof(double); + + { + void *tmp = malloc(data_buffer_size); + if (tmp == NULL) { + error("error allocating data buffer"); + } + data_buffer = (double *) tmp; + } + + if ((read(dbfid, data_buffer, data_buffer_size)) != (ssize_t) data_buffer_size) { + error("error reading data", fName, "read"); + } + + if ((write(ffd, data_buffer, data_buffer_size)) < 0) { + error("error writing data", fName, "write"); + } + + free(data_buffer); + + fprintf(fLFile, "%s\n", fName); + close(ffd); + + if (times) { + snprintf(fName, 256, "%05d.times", k); + tFile = fopen(fName, "w"); + for(unsigned i = 0; i < trackTable[k]; i++) { + // KLUDGE: specifying 16 digits of precision after the decimal + // point is (but check this!) sufficient to uniquely identify + // doubles; however, that will cause ugliness, as that's + // vastly too many for most values of interest. Moving to %a + // here and scanf() in the timesFile reading might fix this. + // -- CSR, 2007-10-19 + fprintf(tFile, "%.16e\n", *(timesTable + 2*pos + 2*i)); + } + fprintf(tFile, "%.16e\n", *(timesTable + 2*pos + 2*trackTable[k]-1)); + + fprintf(tLFile, "%s\n", fName); + } + + if (power) { + uint32_t one = 1; + snprintf(fName, 256, "%05d.power", k); + if ((pfd = open(fName, O_CREAT|O_RDWR|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH)) < 0) { + error("error creating power file", fName, "open"); + } + if ((write(pfd, &one, sizeof(uint32_t))) < 0) { + error("error writing one", fName, "write"); + } + if ((write(pfd, powerTable + pos, trackTable[k] * sizeof(double))) < 0) { + error("error writing data", fName, "write"); + } + fprintf(pLFile, "%s\n", fName); + close(pfd); + } + + pos += trackTable[k]; + std::cout << fileTable+k*O2_FILETABLESIZE << " " << trackTable[k] << std::endl; + } + + FILE *scriptFile; + scriptFile = fopen("restore.sh", "w"); + fprintf(scriptFile, "\ +#! /bin/sh\n\ +#\n\ +# usage: AUDIODB=/path/to/audioDB sh ./restore.sh <newdb>\n\ +\n\ +if [ -z \"${AUDIODB}\" ]; then echo set AUDIODB variable; exit 1; fi\n\ +if [ -z \"$1\" ]; then echo usage: $0 newdb; exit 1; fi\n\n\ +\"${AUDIODB}\" -d \"$1\" -N --size=%d\n", (int) (dbH->dbSize / 1000000)); + if(dbH->flags & O2_FLAG_L2NORM) { + fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -L\n"); + } + if(power) { + fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -P\n"); + } + fprintf(scriptFile, "\"${AUDIODB}\" -d \"$1\" -B -F featureList.txt -K keyList.txt"); + if(times) { + fprintf(scriptFile, " -T timesList.txt"); + } + if(power) { + fprintf(scriptFile, " -W powerList.txt"); + } + fprintf(scriptFile, "\n"); + fclose(scriptFile); + + if((chdir(cwd)) < 0) { + error("error changing working directory", cwd, "chdir"); + } + + fclose(fLFile); + if(times) { + fclose(tLFile); + } + if(power) { + fclose(pLFile); + } + fclose(kLFile); + delete[] fName; + + status(dbName); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/insert.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,304 @@ +#include "audioDB.h" + +bool audioDB::enough_data_space_free(off_t size) { + return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); +} + +void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { + lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); + write(dbfid, buffer, size); +} + +void audioDB::insert(const char* dbName, const char* inFile) { + forWrite = true; + initTables(dbName, inFile); + + if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) + error("Must use timestamps with timestamped database","use --times"); + + if(!usingPower && (dbH->flags & O2_FLAG_POWER)) + error("Must use power with power-enabled database", dbName); + + if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { + error("Insert failed: no more room in database", inFile); + } + + if(!key) + key=inFile; + // Linear scan of filenames check for pre-existing feature + unsigned alreadyInserted=0; + for(unsigned k=0; k<dbH->numFiles; k++) + if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key)+1)==0){ + alreadyInserted=1; + break; + } + + if(alreadyInserted) { + VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); + return; + } + + // Make a track index table of features to file indexes + unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); + if(!numVectors) { + VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", key); + + // CLEAN UP + munmap(indata,statbuf.st_size); + munmap(db,dbH->dbSize); + close(infid); + return; + } + + strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, key, strlen(key)); + + off_t insertoffset = dbH->length;// Store current state + + // Check times status and insert times from file + unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); + double *timesdata = timesTable + 2*indexoffset; + + if(2*(indexoffset + numVectors) > timesTableLength) { + error("out of space for times", key); + } + + if (usingTimes) { + insertTimeStamps(numVectors, timesFile, timesdata); + } + + double *powerdata = powerTable + indexoffset; + insertPowerData(numVectors, powerfd, powerdata); + + // Increment file count + dbH->numFiles++; + + // Update Header information + dbH->length+=(statbuf.st_size-sizeof(int)); + + // Update track to file index map + memcpy(trackTable + dbH->numFiles - 1, &numVectors, sizeof(unsigned)); + + insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); + + // Norm the vectors on input if the database is already L2 normed + if(dbH->flags & O2_FLAG_L2NORM) + unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append + + // Report status + status(dbName); + VERB_LOG(0, "%s %s %u vectors %jd bytes.\n", COM_INSERT, dbName, numVectors, (intmax_t) (statbuf.st_size - sizeof(int))); + + // Copy the header back to the database + memcpy (db, dbH, sizeof(dbTableHeaderT)); + + // CLEAN UP + munmap(indata,statbuf.st_size); + close(infid); +} + +void audioDB::insertTimeStamps(unsigned numVectors, std::ifstream *timesFile, double *timesdata) { + assert(usingTimes); + + unsigned numtimes = 0; + + if(!(dbH->flags & O2_FLAG_TIMES) && !dbH->numFiles) { + dbH->flags=dbH->flags|O2_FLAG_TIMES; + } else if(!(dbH->flags & O2_FLAG_TIMES)) { + error("Timestamp file used with non-timestamped database", timesFileName); + } + + if(!timesFile->is_open()) { + error("problem opening times file on timestamped database", timesFileName); + } + + double timepoint, next; + *timesFile >> timepoint; + if (timesFile->eof()) { + error("no entries in times file", timesFileName); + } + numtimes++; + do { + *timesFile >> next; + if (timesFile->eof()) { + break; + } + numtimes++; + timesdata[0] = timepoint; + timepoint = (timesdata[1] = next); + timesdata += 2; + } while (numtimes < numVectors + 1); + + if (numtimes < numVectors + 1) { + error("too few timepoints in times file", timesFileName); + } + + *timesFile >> next; + if (!timesFile->eof()) { + error("too many timepoints in times file", timesFileName); + } +} + +void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { + if (usingPower) { + if (!(dbH->flags & O2_FLAG_POWER)) { + error("Cannot insert power data on non-power DB", dbName); + } + + int one; + unsigned int count; + + count = read(powerfd, &one, sizeof(unsigned int)); + if (count != sizeof(unsigned int)) { + error("powerfd read failed", "int", "read"); + } + if (one != 1) { + error("dimensionality of power file not 1", powerFileName); + } + + // FIXME: should check that the powerfile is the right size for + // this. -- CSR, 2007-10-30 + count = read(powerfd, powerdata, numVectors * sizeof(double)); + if (count != numVectors * sizeof(double)) { + error("powerfd read failed", "double", "read"); + } + } +} + +void audioDB::batchinsert(const char* dbName, const char* inFile) { + + forWrite = true; + initDBHeader(dbName); + + if(!key) + key=inFile; + std::ifstream *filesIn = 0; + std::ifstream *keysIn = 0; + std::ifstream* thisTimesFile = 0; + int thispowerfd = 0; + + if(!(filesIn = new std::ifstream(inFile))) + error("Could not open batch in file", inFile); + if(key && key!=inFile) + if(!(keysIn = new std::ifstream(key))) + error("Could not open batch key file",key); + + if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) + error("Must use timestamps with timestamped database","use --times"); + + if(!usingPower && (dbH->flags & O2_FLAG_POWER)) + error("Must use power with power-enabled database", dbName); + + unsigned totalVectors=0; + char *thisKey = new char[MAXSTR]; + char *thisFile = new char[MAXSTR]; + char *thisTimesFileName = new char[MAXSTR]; + char *thisPowerFileName = new char[MAXSTR]; + + do{ + filesIn->getline(thisFile,MAXSTR); + if(key && key!=inFile) + keysIn->getline(thisKey,MAXSTR); + else + thisKey = thisFile; + if(usingTimes) + timesFile->getline(thisTimesFileName,MAXSTR); + if(usingPower) + powerFile->getline(thisPowerFileName, MAXSTR); + + if(filesIn->eof()) + break; + + initInputFile(thisFile); + + if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { + error("batchinsert failed: no more room in database", thisFile); + } + + // Linear scan of filenames check for pre-existing feature + unsigned alreadyInserted=0; + + for(unsigned k=0; k<dbH->numFiles; k++) + if(strncmp(fileTable + k*O2_FILETABLESIZE, thisKey, strlen(thisKey)+1)==0){ + alreadyInserted=1; + break; + } + + if(alreadyInserted) { + VERB_LOG(0, "key already exists in database: %s\n", thisKey); + } else { + // Make a track index table of features to file indexes + unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); + if(!numVectors) { + VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); + } + else{ + if(usingTimes){ + if(timesFile->eof()) { + error("not enough timestamp files in timesList", timesFileName); + } + thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); + if(!thisTimesFile->is_open()) { + error("Cannot open timestamp file", thisTimesFileName); + } + off_t insertoffset = dbH->length; + unsigned indexoffset = insertoffset / (dbH->dim*sizeof(double)); + double *timesdata = timesTable + 2*indexoffset; + if(2*(indexoffset + numVectors) > timesTableLength) { + error("out of space for times", key); + } + insertTimeStamps(numVectors, thisTimesFile, timesdata); + if(thisTimesFile) + delete thisTimesFile; + } + + if (usingPower) { + if(powerFile->eof()) { + error("not enough power files in powerList", powerFileName); + } + thispowerfd = open(thisPowerFileName, O_RDONLY); + if (thispowerfd < 0) { + error("failed to open power file", thisPowerFileName); + } + off_t insertoffset = dbH->length; + unsigned poweroffset = insertoffset / (dbH->dim * sizeof(double)); + double *powerdata = powerTable + poweroffset; + insertPowerData(numVectors, thispowerfd, powerdata); + if (0 < thispowerfd) { + close(thispowerfd); + } + } + strncpy(fileTable + dbH->numFiles*O2_FILETABLESIZE, thisKey, strlen(thisKey)); + + off_t insertoffset = dbH->length;// Store current state + + // Increment file count + dbH->numFiles++; + + // Update Header information + dbH->length+=(statbuf.st_size-sizeof(int)); + + // Update track to file index map + memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); + + insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); + + // Norm the vectors on input if the database is already L2 normed + if(dbH->flags & O2_FLAG_L2NORM) + unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append + + totalVectors+=numVectors; + + // Copy the header back to the database + memcpy (db, dbH, sizeof(dbTableHeaderT)); + } + } + // CLEAN UP + munmap(indata,statbuf.st_size); + close(infid); + } while(!filesIn->eof()); + + VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); + + // Report status + status(dbName); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,481 @@ +#include "audioDB.h" + +#include "reporter.h" + +bool audioDB::powers_acceptable(double p1, double p2) { + if (use_absolute_threshold) { + if ((p1 < absolute_threshold) || (p2 < absolute_threshold)) { + return false; + } + } + if (use_relative_threshold) { + if (fabs(p1-p2) > fabs(relative_threshold)) { + return false; + } + } + return true; +} + +void audioDB::query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { + initTables(dbName, inFile); + Reporter *r = 0; + switch (queryType) { + case O2_POINT_QUERY: + sequenceLength = 1; + normalizedDistance = false; + r = new pointQueryReporter<std::greater < NNresult > >(pointNN); + break; + case O2_TRACK_QUERY: + sequenceLength = 1; + normalizedDistance = false; + r = new trackAveragingReporter<std::greater < NNresult > >(pointNN, trackNN, dbH->numFiles); + break; + case O2_SEQUENCE_QUERY: + if(radius == 0) { + r = new trackAveragingReporter<std::less < NNresult > >(pointNN, trackNN, dbH->numFiles); + } else { + r = new trackSequenceQueryRadReporter(trackNN, dbH->numFiles); + } + break; + default: + error("unrecognized queryType in query()"); + } + query_loop(dbName, inFile, r); + r->report(fileTable, adbQueryResponse); + delete r; +} + +// return ordinal position of key in keyTable +unsigned audioDB::getKeyPos(char* key){ + for(unsigned k=0; k<dbH->numFiles; k++) + if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key))==0) + return k; + error("Key not found",key); + return O2_ERR_KEYNOTFOUND; +} + +// This is a common pattern in sequence queries: what we are doing is +// taking a window of length seqlen over a buffer of length length, +// and placing the sum of the elements in that window in the first +// element of the window: thus replacing all but the last seqlen +// elements in the buffer with the corresponding windowed sum. +void audioDB::sequence_sum(double *buffer, int length, int seqlen) { + double tmp1, tmp2, *ps; + int j, w; + + tmp1 = *buffer; + j = 1; + w = seqlen - 1; + while(w--) { + *buffer += buffer[j++]; + } + ps = buffer + 1; + w = length - seqlen; // +1 - 1 + while(w--) { + tmp2 = *ps; + if(isfinite(tmp1)) { + *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1); + } else { + for(int i = 1; i < seqlen; i++) { + *ps += *(ps + i); + } + } + tmp1 = tmp2; + ps++; + } +} + +// In contrast to sequence_sum() above, sequence_sqrt() and +// sequence_average() below are simple mappers across the sequence. +void audioDB::sequence_sqrt(double *buffer, int length, int seqlen) { + int w = length - seqlen + 1; + while(w--) { + *buffer = sqrt(*buffer); + buffer++; + } +} + +void audioDB::sequence_average(double *buffer, int length, int seqlen) { + int w = length - seqlen + 1; + while(w--) { + *buffer /= seqlen; + buffer++; + } +} + +void audioDB::initialize_arrays(int track, unsigned int numVectors, double *query, double *data_buffer, double **D, double **DD) { + unsigned int j, k, l, w; + double *dp, *qp, *sp; + + const unsigned HOP_SIZE = sequenceHop; + const unsigned wL = sequenceLength; + + for(j = 0; j < numVectors; j++) { + // Sum products matrix + D[j] = new double[trackTable[track]]; + assert(D[j]); + // Matched filter matrix + DD[j]=new double[trackTable[track]]; + assert(DD[j]); + } + + // Dot product + for(j = 0; j < numVectors; j++) + for(k = 0; k < trackTable[track]; k++){ + qp = query + j * dbH->dim; + sp = data_buffer + k * dbH->dim; + DD[j][k] = 0.0; // Initialize matched filter array + dp = &D[j][k]; // point to correlation cell j,k + *dp = 0.0; // initialize correlation cell + l = dbH->dim; // size of vectors + while(l--) + *dp += *qp++ * *sp++; + } + + // Matched Filter + // HOP SIZE == 1 + double* spd; + if(HOP_SIZE == 1) { // HOP_SIZE = shingleHop + for(w = 0; w < wL; w++) { + for(j = 0; j < numVectors - w; j++) { + sp = DD[j]; + spd = D[j+w] + w; + k = trackTable[track] - w; + while(k--) + *sp++ += *spd++; + } + } + } else { // HOP_SIZE != 1 + for(w = 0; w < wL; w++) { + for(j = 0; j < numVectors - w; j += HOP_SIZE) { + sp = DD[j]; + spd = D[j+w]+w; + for(k = 0; k < trackTable[track] - w; k += HOP_SIZE) { + *sp += *spd; + sp += HOP_SIZE; + spd += HOP_SIZE; + } + } + } + } +} + +void audioDB::delete_arrays(int track, unsigned int numVectors, double **D, double **DD) { + if(D != NULL) { + for(unsigned int j = 0; j < numVectors; j++) { + delete[] D[j]; + } + } + if(DD != NULL) { + for(unsigned int j = 0; j < numVectors; j++) { + delete[] DD[j]; + } + } +} + +void audioDB::read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p) { + if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) { + if(*data_buffer_p) { + free(*data_buffer_p); + } + { + *data_buffer_size_p = trackTable[track] * sizeof(double) * dbH->dim; + void *tmp = malloc(*data_buffer_size_p); + if (tmp == NULL) { + error("error allocating data buffer"); + } + *data_buffer_p = (double *) tmp; + } + } + + read(dbfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); +} + +// These names deserve some unpicking. The names starting with a "q" +// are pointers to the query, norm and power vectors; the names +// starting with "v" are things that will end up pointing to the +// actual query point's information. -- CSR, 2007-12-05 +void audioDB::set_up_query(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned *nvp) { + *nvp = (statbuf.st_size - sizeof(int)) / (dbH->dim * sizeof(double)); + + if(!(dbH->flags & O2_FLAG_L2NORM)) { + error("Database must be L2 normed for sequence query","use -L2NORM"); + } + + if(*nvp < sequenceLength) { + error("Query shorter than requested sequence length", "maybe use -l"); + } + + VERB_LOG(1, "performing norms... "); + + *qp = new double[*nvp * dbH->dim]; + memcpy(*qp, indata+sizeof(int), *nvp * dbH->dim * sizeof(double)); + *qnp = new double[*nvp]; + unitNorm(*qp, dbH->dim, *nvp, *qnp); + + sequence_sum(*qnp, *nvp, sequenceLength); + sequence_sqrt(*qnp, *nvp, sequenceLength); + + if (usingPower) { + *qpp = new double[*nvp]; + if (lseek(powerfd, sizeof(int), SEEK_SET) == (off_t) -1) { + error("error seeking to data", powerFileName, "lseek"); + } + int count = read(powerfd, *qpp, *nvp * sizeof(double)); + if (count == -1) { + error("error reading data", powerFileName, "read"); + } + if ((unsigned) count != *nvp * sizeof(double)) { + error("short read", powerFileName); + } + + sequence_sum(*qpp, *nvp, sequenceLength); + sequence_average(*qpp, *nvp, sequenceLength); + } + + if (usingTimes) { + unsigned int k; + *mqdp = 0.0; + double *querydurs = new double[*nvp]; + double *timesdata = new double[*nvp*2]; + insertTimeStamps(*nvp, timesFile, timesdata); + for(k = 0; k < *nvp; k++) { + querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; + *mqdp += querydurs[k]; + } + *mqdp /= k; + + VERB_LOG(1, "mean query file duration: %f\n", *mqdp); + + delete [] querydurs; + delete [] timesdata; + } + + // Defaults, for exhaustive search (!usingQueryPoint) + *vqp = *qp; + *vqnp = *qnp; + *vqpp = *qpp; + + if(usingQueryPoint) { + if(queryPoint > *nvp || queryPoint > *nvp - sequenceLength + 1) { + error("queryPoint > numVectors-wL+1 in query"); + } else { + VERB_LOG(1, "query point: %u\n", queryPoint); + *vqp = *qp + queryPoint * dbH->dim; + *vqnp = *qnp + queryPoint; + if (usingPower) { + *vqpp = *qpp + queryPoint; + } + *nvp = sequenceLength; + } + } +} + +// FIXME: this is not the right name; we're not actually setting up +// the database, but copying various bits of it out of mmap()ed tables +// in order to reduce seeks. +void audioDB::set_up_db(double **snp, double **vsnp, double **spp, double **vspp, double **mddp, unsigned int *dvp) { + *dvp = dbH->length / (dbH->dim * sizeof(double)); + *snp = new double[*dvp]; + + double *snpp = *snp, *sppp = 0; + memcpy(*snp, l2normTable, *dvp * sizeof(double)); + + if (usingPower) { + if (!(dbH->flags & O2_FLAG_POWER)) { + error("database not power-enabled", dbName); + } + *spp = new double[*dvp]; + sppp = *spp; + memcpy(*spp, powerTable, *dvp * sizeof(double)); + } + + for(unsigned int i = 0; i < dbH->numFiles; i++){ + if(trackTable[i] >= sequenceLength) { + sequence_sum(snpp, trackTable[i], sequenceLength); + sequence_sqrt(snpp, trackTable[i], sequenceLength); + + if (usingPower) { + sequence_sum(sppp, trackTable[i], sequenceLength); + sequence_average(sppp, trackTable[i], sequenceLength); + } + } + snpp += trackTable[i]; + if (usingPower) { + sppp += trackTable[i]; + } + } + + if (usingTimes) { + if(!(dbH->flags & O2_FLAG_TIMES)) { + error("query timestamps provided for non-timed database", dbName); + } + + *mddp = new double[dbH->numFiles]; + + for(unsigned int k = 0; k < dbH->numFiles; k++) { + unsigned int j; + (*mddp)[k] = 0.0; + for(j = 0; j < trackTable[k]; j++) { + (*mddp)[k] += timesTable[2*j+1] - timesTable[2*j]; + } + (*mddp)[k] /= j; + } + } + + *vsnp = *snp; + *vspp = *spp; +} + +void audioDB::query_loop(const char* dbName, const char* inFile, Reporter *reporter) { + + unsigned int numVectors; + double *query, *query_data; + double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; + double meanQdur; + + set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); + + unsigned int dbVectors; + double *sNorm, *snPtr, *sPower = 0, *spPtr = 0; + double *meanDBdur = 0; + + set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); + + VERB_LOG(1, "matching tracks..."); + + assert(pointNN>0 && pointNN<=O2_MAXNN); + assert(trackNN>0 && trackNN<=O2_MAXNN); + + unsigned j,k,track,trackOffset=0, HOP_SIZE=sequenceHop, wL=sequenceLength; + double **D = 0; // Differences query and target + double **DD = 0; // Matched filter distance + + D = new double*[numVectors]; + DD = new double*[numVectors]; + + gettimeofday(&tv1, NULL); + unsigned processedTracks = 0; + + // build track offset table + off_t *trackOffsetTable = new off_t[dbH->numFiles]; + unsigned cumTrack=0; + off_t trackIndexOffset; + for(k = 0; k < dbH->numFiles; k++){ + trackOffsetTable[k] = cumTrack; + cumTrack += trackTable[k] * dbH->dim; + } + + char nextKey[MAXSTR]; + + // Track loop + size_t data_buffer_size = 0; + double *data_buffer = 0; + lseek(dbfid, dbH->dataOffset, SEEK_SET); + + for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++) { + + trackOffset = trackOffsetTable[track]; // numDoubles offset + + // get trackID from file if using a control file + if(trackFile) { + trackFile->getline(nextKey,MAXSTR); + if(!trackFile->eof()) { + track = getKeyPos(nextKey); + trackOffset = trackOffsetTable[track]; + lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); + } else { + break; + } + } + + trackIndexOffset=trackOffset/dbH->dim; // numVectors offset + + read_data(track, &data_buffer, &data_buffer_size); + if(sequenceLength <= trackTable[track]) { // test for short sequences + + VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]); + + initialize_arrays(track, numVectors, query, data_buffer, D, DD); + + if(usingTimes) { + VERB_LOG(3,"meanQdur=%f meanDBdur=%f\n", meanQdur, meanDBdur[track]); + } + + if((!usingTimes) || fabs(meanDBdur[track]-meanQdur) < meanQdur*timesTol) { + if(usingTimes) { + VERB_LOG(3,"within duration tolerance.\n"); + } + + // Search for minimum distance by shingles (concatenated vectors) + for(j = 0; j <= numVectors - wL; j += HOP_SIZE) { + for(k = 0; k <= trackTable[track] - wL; k += HOP_SIZE) { + double thisDist; + if(normalizedDistance) { + thisDist = 2-(2/(qnPtr[j]*sNorm[trackIndexOffset+k]))*DD[j][k]; + } else { + thisDist = DD[j][k]; + } + // Power test + if ((!usingPower) || powers_acceptable(qpPtr[j], sPower[trackIndexOffset + k])) { + // radius test + if((!radius) || thisDist < radius) { + reporter->add_point(track, usingQueryPoint ? queryPoint : j, k, thisDist); + } + } + } + } + } // Duration match + delete_arrays(track, numVectors, D, DD); + } + } + + free(data_buffer); + + gettimeofday(&tv2,NULL); + VERB_LOG(1,"elapsed time: %ld msec\n", + (tv2.tv_sec*1000 + tv2.tv_usec/1000) - + (tv1.tv_sec*1000 + tv1.tv_usec/1000)) + + // Clean up + if(trackOffsetTable) + delete[] trackOffsetTable; + if(query_data) + delete[] query_data; + if(qNorm) + delete[] qNorm; + if(sNorm) + delete[] sNorm; + if(qPower) + delete[] qPower; + if(sPower) + delete[] sPower; + if(D) + delete[] D; + if(DD) + delete[] DD; + if(meanDBdur) + delete[] meanDBdur; +} + +// Unit norm block of features +void audioDB::unitNorm(double* X, unsigned dim, unsigned n, double* qNorm){ + unsigned d; + double L2, *p; + + VERB_LOG(2, "norming %u vectors...", n); + while(n--) { + p = X; + L2 = 0.0; + d = dim; + while(d--) { + L2 += *p * *p; + p++; + } + if(qNorm) { + *qNorm++=L2; + } + X += dim; + } + VERB_LOG(2, "done.\n"); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/reporter.h Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,283 @@ +#include <utility> +#include <queue> +#include <set> +#include <functional> + +typedef struct nnresult { + unsigned int trackID; + double dist; + unsigned int qpos; + unsigned int spos; +} NNresult; + +typedef struct radresult { + unsigned int trackID; + unsigned int count; +} Radresult; + +bool operator< (const NNresult &a, const NNresult &b) { + return a.dist < b.dist; +} + +bool operator> (const NNresult &a, const NNresult &b) { + return a.dist > b.dist; +} + +bool operator< (const Radresult &a, const Radresult &b) { + return a.count < b.count; +} + +class Reporter { +public: + virtual ~Reporter() {}; + virtual void add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist) = 0; + // FIXME: this interface is a bit wacky: a relic of previous, more + // confused times. Really it might make sense to have separate + // reporter classes for WS and for stdout, rather than passing this + // adbQueryResponse thing everywhere; the fileTable argument is + // there solely for convertion trackIDs into names. -- CSR, + // 2007-12-10. + virtual void report(char *fileTable, adb__queryResponse *adbQueryResponse) = 0; +}; + +template <class T> class pointQueryReporter : public Reporter { +public: + pointQueryReporter(unsigned int pointNN); + ~pointQueryReporter(); + void add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist); + void report(char *fileTable, adb__queryResponse *adbQueryResponse); +private: + unsigned int pointNN; + std::priority_queue< NNresult, std::vector< NNresult >, T> *queue; +}; + +template <class T> pointQueryReporter<T>::pointQueryReporter(unsigned int pointNN) + : pointNN(pointNN) { + queue = new std::priority_queue< NNresult, std::vector< NNresult >, T>; +} + +template <class T> pointQueryReporter<T>::~pointQueryReporter() { + delete queue; +} + +template <class T> void pointQueryReporter<T>::add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist) { + NNresult r; + r.trackID = trackID; + r.qpos = qpos; + r.spos = spos; + r.dist = dist; + queue->push(r); + if(queue->size() > pointNN) { + queue->pop(); + } +} + +template <class T> void pointQueryReporter<T>::report(char *fileTable, adb__queryResponse *adbQueryResponse) { + NNresult r; + std::vector<NNresult> v; + unsigned int size = queue->size(); + for(unsigned int k = 0; k < size; k++) { + r = queue->top(); + v.push_back(r); + queue->pop(); + } + std::vector<NNresult>::reverse_iterator rit; + + if(adbQueryResponse==0) { + for(rit = v.rbegin(); rit < v.rend(); rit++) { + r = *rit; + std::cout << fileTable + r.trackID*O2_FILETABLESIZE << " "; + std::cout << r.dist << " " << r.qpos << " " << r.spos << std::endl; + } + } else { + adbQueryResponse->result.__sizeRlist=size; + adbQueryResponse->result.__sizeDist=size; + adbQueryResponse->result.__sizeQpos=size; + adbQueryResponse->result.__sizeSpos=size; + adbQueryResponse->result.Rlist= new char*[size]; + adbQueryResponse->result.Dist = new double[size]; + adbQueryResponse->result.Qpos = new unsigned int[size]; + adbQueryResponse->result.Spos = new unsigned int[size]; + unsigned int k = 0; + for(rit = v.rbegin(); rit < v.rend(); rit++, k++) { + r = *rit; + adbQueryResponse->result.Rlist[k] = new char[O2_MAXFILESTR]; + adbQueryResponse->result.Dist[k] = r.dist; + adbQueryResponse->result.Qpos[k] = r.qpos; + adbQueryResponse->result.Spos[k] = r.spos; + snprintf(adbQueryResponse->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLESIZE); + } + } +} + +template <class T> class trackAveragingReporter : public Reporter { + public: + trackAveragingReporter(unsigned int pointNN, unsigned int trackNN, unsigned int numFiles); + ~trackAveragingReporter(); + void add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist); + void report(char *fileTable, adb__queryResponse *adbQueryResponse); + private: + unsigned int pointNN; + unsigned int trackNN; + unsigned int numFiles; + std::priority_queue< NNresult, std::vector< NNresult>, T > *queues; +}; + +template <class T> trackAveragingReporter<T>::trackAveragingReporter(unsigned int pointNN, unsigned int trackNN, unsigned int numFiles) + : pointNN(pointNN), trackNN(trackNN), numFiles(numFiles) { + queues = new std::priority_queue< NNresult, std::vector< NNresult>, T >[numFiles]; +} + +template <class T> trackAveragingReporter<T>::~trackAveragingReporter() { + delete [] queues; +} + +template <class T> void trackAveragingReporter<T>::add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist) { + NNresult r; + r.trackID = trackID; + r.qpos = qpos; + r.spos = spos; + r.dist = dist; + queues[trackID].push(r); + if(queues[trackID].size() > pointNN) { + queues[trackID].pop(); + } +} + +template <class T> void trackAveragingReporter<T>::report(char *fileTable, adb__queryResponse *adbQueryResponse) { + std::priority_queue < NNresult, std::vector< NNresult>, T> result; + for (int i = numFiles-1; i >= 0; i--) { + unsigned int size = queues[i].size(); + if (size > 0) { + NNresult r; + double dist = 0; + NNresult oldr = queues[i].top(); + for (unsigned int j = 0; j < size; j++) { + r = queues[i].top(); + dist += r.dist; + queues[i].pop(); + if (r.dist == oldr.dist) { + r.qpos = oldr.qpos; + r.spos = oldr.spos; + } else { + oldr = r; + } + } + dist /= size; + r.dist = dist; // trackID, qpos and spos are magically right already. + result.push(r); + if (result.size() > trackNN) { + result.pop(); + } + } + } + + NNresult r; + std::vector<NNresult> v; + unsigned int size = result.size(); + for(unsigned int k = 0; k < size; k++) { + r = result.top(); + v.push_back(r); + result.pop(); + } + std::vector<NNresult>::reverse_iterator rit; + + if(adbQueryResponse==0) { + for(rit = v.rbegin(); rit < v.rend(); rit++) { + r = *rit; + std::cout << fileTable + r.trackID*O2_FILETABLESIZE << " "; + std::cout << r.dist << " " << r.qpos << " " << r.spos << std::endl; + } + } else { + adbQueryResponse->result.__sizeRlist=size; + adbQueryResponse->result.__sizeDist=size; + adbQueryResponse->result.__sizeQpos=size; + adbQueryResponse->result.__sizeSpos=size; + adbQueryResponse->result.Rlist= new char*[size]; + adbQueryResponse->result.Dist = new double[size]; + adbQueryResponse->result.Qpos = new unsigned int[size]; + adbQueryResponse->result.Spos = new unsigned int[size]; + unsigned int k = 0; + for(rit = v.rbegin(); rit < v.rend(); rit++, k++) { + r = *rit; + adbQueryResponse->result.Rlist[k] = new char[O2_MAXFILESTR]; + adbQueryResponse->result.Dist[k] = r.dist; + adbQueryResponse->result.Qpos[k] = r.qpos; + adbQueryResponse->result.Spos[k] = r.spos; + snprintf(adbQueryResponse->result.Rlist[k], O2_MAXFILESTR, "%s", fileTable+r.trackID*O2_FILETABLESIZE); + } + } +} + +class trackSequenceQueryRadReporter : public Reporter { +public: + trackSequenceQueryRadReporter(unsigned int trackNN, unsigned int numFiles); + ~trackSequenceQueryRadReporter(); + void add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist); + void report(char *fileTable, adb__queryResponse *adbQueryResponse); +private: + unsigned int trackNN; + unsigned int numFiles; + std::set<std::pair<unsigned int, unsigned int> > *set; + unsigned int *count; +}; + +trackSequenceQueryRadReporter::trackSequenceQueryRadReporter(unsigned int trackNN, unsigned int numFiles): + trackNN(trackNN), numFiles(numFiles) { + set = new std::set<std::pair<unsigned int, unsigned int> >; + count = new unsigned int[numFiles]; + for (unsigned i = 0; i < numFiles; i++) { + count[i] = 0; + } +} + +trackSequenceQueryRadReporter::~trackSequenceQueryRadReporter() { + delete set; + delete [] count; +} + +void trackSequenceQueryRadReporter::add_point(unsigned int trackID, unsigned int qpos, unsigned int spos, double dist) { + std::set<std::pair<unsigned int, unsigned int> >::iterator it; + std::pair<unsigned int, unsigned int> pair = std::make_pair(trackID, qpos); + it = set->find(pair); + if (it == set->end()) { + set->insert(pair); + count[trackID]++; + } +} + +void trackSequenceQueryRadReporter::report(char *fileTable, adb__queryResponse *adbQueryResponse) { + std::priority_queue < Radresult > result; + // KLUDGE: doing this backwards in an attempt to get the same + // tiebreak behaviour as before. + for (int i = numFiles-1; i >= 0; i--) { + Radresult r; + r.trackID = i; + r.count = count[i]; + if(r.count > 0) { + result.push(r); + if (result.size() > trackNN) { + result.pop(); + } + } + } + + Radresult r; + std::vector<Radresult> v; + unsigned int size = result.size(); + for(unsigned int k = 0; k < size; k++) { + r = result.top(); + v.push_back(r); + result.pop(); + } + std::vector<Radresult>::reverse_iterator rit; + + if(adbQueryResponse==0) { + for(rit = v.rbegin(); rit < v.rend(); rit++) { + r = *rit; + std::cout << fileTable + r.trackID*O2_FILETABLESIZE << " " << r.count << std::endl; + } + } else { + // FIXME + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/soap.cpp Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,233 @@ +#include "audioDB.h" +#include "adb.nsmap" + +/* Command-line client definitions */ + +// FIXME: this can't propagate the sequence length argument (used for +// dudCount). See adb__status() definition for the other half of +// this. -- CSR, 2007-10-01 +void audioDB::ws_status(const char*dbName, char* hostport){ + struct soap soap; + adb__statusResponse adbStatusResponse; + + // Query an existing adb database + soap_init(&soap); + if(soap_call_adb__status(&soap,hostport,NULL,(char*)dbName,adbStatusResponse)==SOAP_OK) { + std::cout << "numFiles = " << adbStatusResponse.result.numFiles << std::endl; + std::cout << "dim = " << adbStatusResponse.result.dim << std::endl; + std::cout << "length = " << adbStatusResponse.result.length << std::endl; + std::cout << "dudCount = " << adbStatusResponse.result.dudCount << std::endl; + std::cout << "nullCount = " << adbStatusResponse.result.nullCount << std::endl; + std::cout << "flags = " << adbStatusResponse.result.flags << std::endl; + } else { + soap_print_fault(&soap,stderr); + } + + soap_destroy(&soap); + soap_end(&soap); + soap_done(&soap); +} + +void audioDB::ws_query(const char*dbName, const char *trackKey, const char* hostport){ + struct soap soap; + adb__queryResponse adbQueryResponse; + + soap_init(&soap); + if(soap_call_adb__query(&soap,hostport,NULL, + (char*)dbName,(char*)trackKey,(char*)trackFileName,(char*)timesFileName, + queryType, queryPoint, pointNN, trackNN, sequenceLength, adbQueryResponse)==SOAP_OK){ + //std::std::cerr << "result list length:" << adbQueryResponse.result.__sizeRlist << std::std::endl; + for(int i=0; i<adbQueryResponse.result.__sizeRlist; i++) + std::cout << adbQueryResponse.result.Rlist[i] << " " << adbQueryResponse.result.Dist[i] + << " " << adbQueryResponse.result.Qpos[i] << " " << adbQueryResponse.result.Spos[i] << std::endl; + } + else + soap_print_fault(&soap,stderr); + + soap_destroy(&soap); + soap_end(&soap); + soap_done(&soap); +} + +/* Server definitions */ +int adb__status(struct soap* soap, xsd__string dbName, adb__statusResponse &adbStatusResponse){ + char* const argv[]={"audioDB",COM_STATUS,"-d",dbName}; + const unsigned argc = 4; + try { + audioDB(argc, argv, &adbStatusResponse); + return SOAP_OK; + } catch(char *err) { + soap_receiver_fault(soap, err, ""); + return SOAP_FAULT; + } +} + +// Literal translation of command line to web service + +int adb__query(struct soap* soap, xsd__string dbName, xsd__string qKey, xsd__string keyList, xsd__string timesFileName, xsd__int qType, xsd__int qPos, xsd__int pointNN, xsd__int trackNN, xsd__int seqLen, adb__queryResponse &adbQueryResponse){ + char queryType[256]; + for(int k=0; k<256; k++) + queryType[k]='\0'; + if(qType == O2_POINT_QUERY) + strncpy(queryType, "point", strlen("point")); + else if (qType == O2_SEQUENCE_QUERY) + strncpy(queryType, "sequence", strlen("sequence")); + else if(qType == O2_TRACK_QUERY) + strncpy(queryType,"track", strlen("track")); + else + strncpy(queryType, "", strlen("")); + + if(pointNN==0) + pointNN=10; + if(trackNN==0) + trackNN=10; + if(seqLen==0) + seqLen=16; + + char qPosStr[256]; + sprintf(qPosStr, "%d", qPos); + char pointNNStr[256]; + sprintf(pointNNStr,"%d",pointNN); + char trackNNStr[256]; + sprintf(trackNNStr,"%d",trackNN); + char seqLenStr[256]; + sprintf(seqLenStr,"%d",seqLen); + + const char* argv[] ={ + "./audioDB", + COM_QUERY, + queryType, // Need to pass a parameter + COM_DATABASE, + ENSURE_STRING(dbName), + COM_FEATURES, + ENSURE_STRING(qKey), + COM_KEYLIST, + ENSURE_STRING(keyList), + COM_TIMES, + ENSURE_STRING(timesFileName), + COM_QPOINT, + qPosStr, + COM_POINTNN, + pointNNStr, + COM_TRACKNN, + trackNNStr, // Need to pass a parameter + COM_SEQLEN, + seqLenStr + }; + + const unsigned argc = 19; + try { + audioDB(argc, (char* const*)argv, &adbQueryResponse); + return SOAP_OK; + } catch (char *err) { + soap_receiver_fault(soap, err, ""); + return SOAP_FAULT; + } +} + +int adb__sequenceQuery(struct soap* soap, xsd__string dbName, xsd__string qKey, + adb__sequenceQueryParms *parms, + adb__queryResponse &adbQueryResponse) { + + char qPosStr[256]; + char pointNNStr[256]; + char trackNNStr[256]; + char seqLenStr[256]; + char relative_thresholdStr[256]; + char absolute_thresholdStr[256]; + + /* When the branch is merged, move this to a header and use it + elsewhere */ +#define INTSTRINGIFY(val, str) \ + snprintf(str, 256, "%d", val); +#define DOUBLESTRINGIFY(val, str) \ + snprintf(str, 256, "%f", val); + + INTSTRINGIFY(parms->qPos, qPosStr); + INTSTRINGIFY(parms->pointNN, pointNNStr); + INTSTRINGIFY(parms->segNN, trackNNStr); + /* FIXME: decide which of segLen and seqLen should live */ + INTSTRINGIFY(parms->segLen, seqLenStr); + + DOUBLESTRINGIFY(parms->relative_threshold, relative_thresholdStr); + DOUBLESTRINGIFY(parms->absolute_threshold, absolute_thresholdStr); + + const char *argv[] = { + "./audioDB", + COM_QUERY, + "sequence", + COM_DATABASE, + dbName, + COM_FEATURES, + qKey, + COM_KEYLIST, + /* FIXME: when this branch is merged, use ENSURE_STRING */ + parms->keyList==0?"":parms->keyList, + COM_TIMES, + parms->timesFileName==0?"":parms->timesFileName, + COM_QUERYPOWER, + parms->powerFileName==0?"":parms->powerFileName, + COM_QPOINT, + qPosStr, + COM_POINTNN, + pointNNStr, + COM_TRACKNN, + trackNNStr, + COM_SEQLEN, + seqLenStr, + COM_RELATIVE_THRESH, + relative_thresholdStr, + COM_ABSOLUTE_THRESH, + absolute_thresholdStr + }; + + const unsigned argc = 25; + + try { + audioDB(argc, (char* const*)argv, &adbQueryResponse); + return SOAP_OK; + } catch (char *err) { + soap_receiver_fault(soap, err, ""); + return SOAP_FAULT; + } +} + +/* Server loop */ +void audioDB::startServer(){ + struct soap soap; + int m, s; // master and slave sockets + soap_init(&soap); + // FIXME: largely this use of SO_REUSEADDR is to make writing (and + // running) test cases more convenient, so that multiple test runs + // in close succession don't fail because of a bin() error. + // Investigate whether there are any potential drawbacks in this, + // and also whether there's a better way to write the tests. -- + // CSR, 2007-10-03 + soap.bind_flags |= SO_REUSEADDR; + m = soap_bind(&soap, NULL, port, 100); + if (m < 0) + soap_print_fault(&soap, stderr); + else + { + fprintf(stderr, "Socket connection successful: master socket = %d\n", m); + for (int i = 1; ; i++) + { + s = soap_accept(&soap); + if (s < 0) + { + soap_print_fault(&soap, stderr); + break; + } + /* FIXME: find a way to play nice with logging when run from + /etc/init.d scripts: at present this just goes nowhere */ + fprintf(stderr, "%d: accepted connection from IP=%lu.%lu.%lu.%lu socket=%d\n", i, + (soap.ip >> 24)&0xFF, (soap.ip >> 16)&0xFF, (soap.ip >> 8)&0xFF, soap.ip&0xFF, s); + if (soap_serve(&soap) != SOAP_OK) // process RPC request + soap_print_fault(&soap, stderr); // print error + fprintf(stderr, "request served\n"); + soap_destroy(&soap); // clean up class instances + soap_end(&soap); // clean up everything and close socket + } + } + soap_done(&soap); // close master socket and detach environment +}
--- a/tests/0003/run-test.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/0003/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -6,6 +6,9 @@ ${AUDIODB} -d testdb -N +# point query now implemented as sequence search +${AUDIODB} -d testdb -L + # We could contemplate putting the test feature (and the expected # query output) under svn control if we trust its binary file # handling.
--- a/tests/0004/run-test.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/0004/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -6,6 +6,8 @@ ${AUDIODB} -d testdb -N +${AUDIODB} -d testdb -L + intstring 2 > testfeature floatstring 0 1 >> testfeature floatstring 1 0 >> testfeature
--- a/tests/0009/run-test.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/0009/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -6,6 +6,8 @@ ${AUDIODB} -d testdb -N +${AUDIODB} -d testdb -L + intstring 2 > testfeature01 floatstring 0 1 >> testfeature01 intstring 2 > testfeature10
--- a/tests/0014/run-test.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/0014/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -6,6 +6,8 @@ ${AUDIODB} -d testdb -N +${AUDIODB} -d testdb -L + intstring 2 > testfeature floatstring 0 1 >> testfeature floatstring 1 0 >> testfeature
--- a/tests/0019/run-test.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/0019/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -6,6 +6,8 @@ ${AUDIODB} -d testdb -N +${AUDIODB} -d testdb -L + intstring 2 > testfeature01 floatstring 0 1 >> testfeature01 intstring 2 > testfeature10
--- a/tests/0032/run-test.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/0032/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -6,6 +6,8 @@ ${AUDIODB} -d testdb -N +${AUDIODB} -d testdb -L + intstring 2 > testfeature01 floatstring 0 1 >> testfeature01 intstring 2 > testfeature10
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0035/run-test.sh Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,31 @@ +#! /bin/sh + +. ../test-utils.sh + +if [ -f testdb ]; then rm -f testdb; fi + +${AUDIODB} -d testdb -N + +intstring 2 > testfeature1 +floatstring 0 1 >> testfeature1 +intstring 2 > testfeature3 +floatstring 1 0 >> testfeature3 +floatstring 0 1 >> testfeature3 +floatstring 1 0 >> testfeature3 + +${AUDIODB} -d testdb -I -f testfeature1 +${AUDIODB} -d testdb -I -f testfeature3 + +# sequence queries require L2NORM +${AUDIODB} -d testdb -L + +echo "query point (0 1, 1 0)" +intstring 2 > testquery +floatstring 0 1 >> testquery +floatstring 1 0 >> testquery + +${AUDIODB} -d testdb -Q sequence -l 2 -f testquery -n 1 > testoutput +wc -l testoutput | grep "1 testoutput" +grep "^testfeature3 .* 0 1$" testoutput + +exit 104
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0035/short-description Fri Dec 14 14:41:37 2007 +0000 @@ -0,0 +1,1 @@ +-l 2 search with short tracks \ No newline at end of file
--- a/tests/run-tests.sh Wed Dec 05 14:11:04 2007 +0000 +++ b/tests/run-tests.sh Fri Dec 14 14:41:37 2007 +0000 @@ -26,8 +26,10 @@ echo -n : (cd ${file} && sh ./run-test.sh > test.out 2> test.err) EXIT_STATUS=$? - if [ ${EXIT_STATUS} -ne 104 ]; then - echo " failed (exit status ${EXIT_STATUS})". + if [ ${EXIT_STATUS} -eq 14 ]; then + echo " n/a." + elif [ ${EXIT_STATUS} -ne 104 ]; then + echo " failed (exit status ${EXIT_STATUS})." FAILED=true else echo " success."