changeset 302:74824093c1c4

Implement O((M+N) log(M+N)) duplicate key detection in batchinsert, rather than naive O(N^2). Note that I haven't measured the constants in those complexity expressions; I am anticipating that 40000 is a sufficiently large N for this to matter. Write a test case for duplicate keys, too. Use 0037, since no-one else seems to be writing tests, and everything is merged onto the trunk these days anyway.
author mas01cr
date Mon, 04 Aug 2008 10:00:34 +0000
parents 8764d114ce80
children 9f9b8b5f35f2
files audioDB.h insert.cpp tests/0037/run-test.sh tests/0037/short-description
diffstat 4 files changed, 97 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/audioDB.h	Fri Aug 01 16:21:51 2008 +0000
+++ b/audioDB.h	Mon Aug 04 10:00:34 2008 +0000
@@ -10,6 +10,8 @@
 #include <string.h>
 #include <iostream>
 #include <fstream>
+#include <set>
+#include <string>
 #include <math.h>
 #include <sys/time.h>
 #include <assert.h>
--- a/insert.cpp	Fri Aug 01 16:21:51 2008 +0000
+++ b/insert.cpp	Mon Aug 04 10:00:34 2008 +0000
@@ -210,8 +210,14 @@
   }
   char *thisTimesFileName = new char[MAXSTR];
   char *thisPowerFileName = new char[MAXSTR];
-  
-  do{
+
+  std::set<std::string> s;
+
+  for (unsigned k = 0; k < dbH->numFiles; k++) {
+    s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
+  }
+
+  do {
     filesIn->getline(thisFile,MAXSTR);
     if(key && key!=inFile) {
       keysIn->getline(thisKey,MAXSTR);
@@ -238,18 +244,10 @@
       error("batchinsert failed: no more room in database", thisFile);
     }
     
-    // Linear scan of filenames check for pre-existing feature
-    unsigned alreadyInserted=0;
-  
-    for(unsigned k=0; k<dbH->numFiles; k++)
-      if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){
-	alreadyInserted=1;
-	break;
-      }
-  
-    if(alreadyInserted) {
+    if(s.count(thisKey)) {
       VERB_LOG(0, "key already exists in database: %s\n", thisKey);
     } else {
+      s.insert(thisKey);
       // Make a track index table of features to file indexes
       unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
       if(!numVectors) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/0037/run-test.sh	Mon Aug 04 10:00:34 2008 +0000
@@ -0,0 +1,84 @@
+#! /bin/bash
+
+. ../test-utils.sh
+
+if [ -f testdb ]; then rm -f testdb; fi
+
+${AUDIODB} -d testdb -N
+
+intstring 2 > testfeature01
+floatstring 0 1 >> testfeature01
+floatstring 1 0 >> testfeature01
+intstring 2 > testfeature10
+floatstring 1 0 >> testfeature10
+floatstring 0 1 >> testfeature10
+
+cat > testfeaturefiles <<EOF
+testfeature01
+testfeature01
+testfeature10
+testfeature10
+testfeature01
+testfeature10
+EOF
+
+${AUDIODB} -d testdb -B -F testfeaturefiles
+
+${AUDIODB} -d testdb -S | grep "num files:2"
+
+# sequence queries require L2NORM
+${AUDIODB} -d testdb -L
+
+echo "query point (0.0,0.5)"
+intstring 2 > testquery
+floatstring 0 0.5 >> testquery
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery > testoutput
+echo testfeature01 1 > test-expected-output
+echo 0 0 0 >> test-expected-output
+echo 2 0 1 >> test-expected-output
+echo testfeature10 1 >> test-expected-output
+echo 0 0 1 >> test-expected-output
+echo 2 0 0 >> test-expected-output
+cmp testoutput test-expected-output
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 2 > testoutput
+cmp testoutput test-expected-output
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 5 > testoutput
+cmp testoutput test-expected-output
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 1 > testoutput
+echo testfeature01 0 > test-expected-output
+echo 0 0 0 >> test-expected-output
+echo testfeature10 0 >> test-expected-output
+echo 0 0 1 >> test-expected-output
+cmp testoutput test-expected-output
+
+echo "query point (0.5,0.0)"
+intstring 2 > testquery
+floatstring 0.5 0 >> testquery
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery > testoutput
+echo testfeature01 1 > test-expected-output
+echo 0 0 1 >> test-expected-output
+echo 2 0 0 >> test-expected-output
+echo testfeature10 1 >> test-expected-output
+echo 0 0 0 >> test-expected-output
+echo 2 0 1 >> test-expected-output
+cmp testoutput test-expected-output
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 2 > testoutput
+cmp testoutput test-expected-output
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 5 > testoutput
+cmp testoutput test-expected-output
+
+${AUDIODB} -d testdb -Q nsequence -l 1 -f testquery -n 1 > testoutput
+echo testfeature01 0 > test-expected-output
+echo 0 0 1 >> test-expected-output
+echo testfeature10 0 >> test-expected-output
+echo 0 0 0 >> test-expected-output
+cmp testoutput test-expected-output
+
+exit 104
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/0037/short-description	Mon Aug 04 10:00:34 2008 +0000
@@ -0,0 +1,1 @@
+batchinsert repeated key handling
\ No newline at end of file