Mercurial > hg > audiodb
diff insert.cpp @ 302:74824093c1c4
Implement O((M+N) log(M+N)) duplicate key detection in batchinsert,
rather than naive O(N^2).
Note that I haven't measured the constants in those complexity
expressions; I am anticipating that 40000 is a sufficiently large N for
this to matter.
Write a test case for duplicate keys, too. Use 0037, since no-one else
seems to be writing tests, and everything is merged onto the trunk these
days anyway.
author | mas01cr |
---|---|
date | Mon, 04 Aug 2008 10:00:34 +0000 |
parents | 34ce7f7a177d |
children | 25572f1bd37f c93be2f3a674 |
line wrap: on
line diff
--- a/insert.cpp Fri Aug 01 16:21:51 2008 +0000 +++ b/insert.cpp Mon Aug 04 10:00:34 2008 +0000 @@ -210,8 +210,14 @@ } char *thisTimesFileName = new char[MAXSTR]; char *thisPowerFileName = new char[MAXSTR]; - - do{ + + std::set<std::string> s; + + for (unsigned k = 0; k < dbH->numFiles; k++) { + s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); + } + + do { filesIn->getline(thisFile,MAXSTR); if(key && key!=inFile) { keysIn->getline(thisKey,MAXSTR); @@ -238,18 +244,10 @@ error("batchinsert failed: no more room in database", thisFile); } - // Linear scan of filenames check for pre-existing feature - unsigned alreadyInserted=0; - - for(unsigned k=0; k<dbH->numFiles; k++) - if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){ - alreadyInserted=1; - break; - } - - if(alreadyInserted) { + if(s.count(thisKey)) { VERB_LOG(0, "key already exists in database: %s\n", thisKey); } else { + s.insert(thisKey); // Make a track index table of features to file indexes unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); if(!numVectors) {