Mercurial > hg > audiodb
changeset 302:74824093c1c4
Implement O((M+N) log(M+N)) duplicate key detection in batchinsert,
rather than naive O(N^2).
Note that I haven't measured the constants in those complexity
expressions; I am anticipating that 40000 is a sufficiently large N for
this to matter.
Write a test case for duplicate keys, too. Use 0037, since no-one else
seems to be writing tests, and everything is merged onto the trunk these
days anyway.
author | mas01cr |
---|---|
date | Mon, 04 Aug 2008 10:00:34 +0000 |
parents | 8764d114ce80 |
children | 9f9b8b5f35f2 |
files | audioDB.h insert.cpp tests/0037/run-test.sh tests/0037/short-description |
diffstat | 4 files changed, 97 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/audioDB.h Fri Aug 01 16:21:51 2008 +0000 +++ b/audioDB.h Mon Aug 04 10:00:34 2008 +0000 @@ -10,6 +10,8 @@ #include <string.h> #include <iostream> #include <fstream> +#include <set> +#include <string> #include <math.h> #include <sys/time.h> #include <assert.h>
--- a/insert.cpp Fri Aug 01 16:21:51 2008 +0000 +++ b/insert.cpp Mon Aug 04 10:00:34 2008 +0000 @@ -210,8 +210,14 @@ } char *thisTimesFileName = new char[MAXSTR]; char *thisPowerFileName = new char[MAXSTR]; - - do{ + + std::set<std::string> s; + + for (unsigned k = 0; k < dbH->numFiles; k++) { + s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); + } + + do { filesIn->getline(thisFile,MAXSTR); if(key && key!=inFile) { keysIn->getline(thisKey,MAXSTR); @@ -238,18 +244,10 @@ error("batchinsert failed: no more room in database", thisFile); } - // Linear scan of filenames check for pre-existing feature - unsigned alreadyInserted=0; - - for(unsigned k=0; k<dbH->numFiles; k++) - if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){ - alreadyInserted=1; - break; - } - - if(alreadyInserted) { + if(s.count(thisKey)) { VERB_LOG(0, "key already exists in database: %s\n", thisKey); } else { + s.insert(thisKey); // Make a track index table of features to file indexes unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); if(!numVectors) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/0037/run-test.sh Mon Aug 04 10:00:34 2008 +0000 @@ -0,0 +1,84 @@ +#! /bin/bash + +. ../test-utils.sh + +if [ -f testdb ]; then rm -f testdb; fi + +${AUDIODB} -d testdb -N + +intstring 2 > testfeature01 +floatstring 0 1 >> testfeature01 +floatstring 1 0 >> testfeature01 +intstring 2 > testfeature10 +floatstring 1 0 >> testfeature10 +floatstring 0 1 >> testfeature10 + +cat > testfeaturefiles <<EOF +testfeature01 +testfeature01 +testfeature10 +testfeature10 +testfeature01 +testfeature10 +EOF + +${AUDIODB} -d testdb -B -F testfeaturefiles + +${AUDIODB} -d testdb -S | grep "num files:2" + +# sequence queries require L2NORM +${AUDIODB} -d testdb -L + +echo "query point (0.0,0.5)" +intstring 2 > testquery +floatstring 0 0.5 >> testquery + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery > testoutput +echo testfeature01 1 > test-expected-output +echo 0 0 0 >> test-expected-output +echo 2 0 1 >> test-expected-output +echo testfeature10 1 >> test-expected-output +echo 0 0 1 >> test-expected-output +echo 2 0 0 >> test-expected-output +cmp testoutput test-expected-output + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 2 > testoutput +cmp testoutput test-expected-output + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 5 > testoutput +cmp testoutput test-expected-output + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 1 > testoutput +echo testfeature01 0 > test-expected-output +echo 0 0 0 >> test-expected-output +echo testfeature10 0 >> test-expected-output +echo 0 0 1 >> test-expected-output +cmp testoutput test-expected-output + +echo "query point (0.5,0.0)" +intstring 2 > testquery +floatstring 0.5 0 >> testquery + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery > testoutput +echo testfeature01 1 > test-expected-output +echo 0 0 1 >> test-expected-output +echo 2 0 0 >> test-expected-output +echo testfeature10 1 >> test-expected-output +echo 0 0 0 >> test-expected-output +echo 2 0 1 >> test-expected-output +cmp testoutput test-expected-output + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 2 > testoutput +cmp testoutput test-expected-output + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 5 > testoutput +cmp testoutput test-expected-output + +${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 1 > testoutput +echo testfeature01 0 > test-expected-output +echo 0 0 1 >> test-expected-output +echo testfeature10 0 >> test-expected-output +echo 0 0 0 >> test-expected-output +cmp testoutput test-expected-output + +exit 104