diff insert.cpp @ 302:74824093c1c4

Implement O((M+N) log(M+N)) duplicate key detection in batchinsert, rather than the naive O(N^2) scan. Note that I haven't measured the constants in those complexity expressions; I am anticipating that 40000 is a sufficiently large N for this to matter. Write a test case for duplicate keys, too. Use test 0037, since no-one else seems to be writing tests, and everything is merged onto the trunk these days anyway. (A standalone sketch of the set-based check appears after the diff below.)
author mas01cr
date Mon, 04 Aug 2008 10:00:34 +0000
parents 34ce7f7a177d
children 25572f1bd37f c93be2f3a674
line diff
--- a/insert.cpp	Fri Aug 01 16:21:51 2008 +0000
+++ b/insert.cpp	Mon Aug 04 10:00:34 2008 +0000
@@ -210,8 +210,14 @@
   }
   char *thisTimesFileName = new char[MAXSTR];
   char *thisPowerFileName = new char[MAXSTR];
-  
-  do{
+
+  std::set<std::string> s;
+
+  for (unsigned k = 0; k < dbH->numFiles; k++) {
+    s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
+  }
+
+  do {
     filesIn->getline(thisFile,MAXSTR);
     if(key && key!=inFile) {
       keysIn->getline(thisKey,MAXSTR);
@@ -238,18 +244,10 @@
       error("batchinsert failed: no more room in database", thisFile);
     }
     
-    // Linear scan of filenames check for pre-existing feature
-    unsigned alreadyInserted=0;
-  
-    for(unsigned k=0; k<dbH->numFiles; k++)
-      if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){
-	alreadyInserted=1;
-	break;
-      }
-  
-    if(alreadyInserted) {
+    if(s.count(thisKey)) {
       VERB_LOG(0, "key already exists in database: %s\n", thisKey);
     } else {
+      s.insert(thisKey);
       // Make a track index table of features to file indexes
       unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
       if(!numVectors) {
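
A minimal, self-contained sketch of the same set-based duplicate check follows, for readers skimming the diff out of context. The container names existing and incoming are hypothetical stand-ins for the fileTable entries and the keys read during batchinsert; they are not code from insert.cpp.

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  // "existing" stands in for the M keys already stored in fileTable;
  // "incoming" stands in for the N keys read during batchinsert.
  std::vector<std::string> existing = {"trackA", "trackB"};
  std::vector<std::string> incoming = {"trackB", "trackC", "trackC"};

  // Seed the set with the keys already in the database: O(M log M),
  // analogous to the loop over dbH->numFiles in the changeset.
  std::set<std::string> seen(existing.begin(), existing.end());

  // Each incoming key now costs one O(log(M+N)) membership test,
  // replacing the old O(M) linear scan of the file table per key.
  for (const std::string &key : incoming) {
    if (seen.count(key)) {
      std::cout << "key already exists in database: " << key << "\n";
    } else {
      seen.insert(key);  // so later duplicates within the same batch are caught too
      std::cout << "inserting: " << key << "\n";
    }
  }
  return 0;
}

An unordered hash container (std::unordered_set) would give expected O(1) lookups instead of O(log(M+N)), but std::set is what the changeset itself uses and is more than sufficient at the N of roughly 40000 mentioned in the commit message.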