Mercurial > hg > audiodb
comparison insert.cpp @ 302:74824093c1c4
Implement O((M+N) log(M+N)) duplicate key detection in batchinsert,
rather than naive O(N^2).
Note that I haven't measured the constants in those complexity
expressions; I am anticipating that 40000 is a sufficiently large N for
this to matter.
Write a test case for duplicate keys, too. Use 0037, since no-one else
seems to be writing tests, and everything is merged onto the trunk these
days anyway.
| author | mas01cr |
|---|---|
| date | Mon, 04 Aug 2008 10:00:34 +0000 |
| parents | 34ce7f7a177d |
| children | 25572f1bd37f c93be2f3a674 |
comparison
equal
deleted
inserted
replaced
| 301:8764d114ce80 | 302:74824093c1c4 |
|---|---|
| 208 if (key && (key != inFile)) { | 208 if (key && (key != inFile)) { |
| 209 thisKey = new char[MAXSTR]; | 209 thisKey = new char[MAXSTR]; |
| 210 } | 210 } |
| 211 char *thisTimesFileName = new char[MAXSTR]; | 211 char *thisTimesFileName = new char[MAXSTR]; |
| 212 char *thisPowerFileName = new char[MAXSTR]; | 212 char *thisPowerFileName = new char[MAXSTR]; |
| 213 | 213 |
| 214 do{ | 214 std::set<std::string> s; |
| 215 | |
| 216 for (unsigned k = 0; k < dbH->numFiles; k++) { | |
| 217 s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); | |
| 218 } | |
| 219 | |
| 220 do { | |
| 215 filesIn->getline(thisFile,MAXSTR); | 221 filesIn->getline(thisFile,MAXSTR); |
| 216 if(key && key!=inFile) { | 222 if(key && key!=inFile) { |
| 217 keysIn->getline(thisKey,MAXSTR); | 223 keysIn->getline(thisKey,MAXSTR); |
| 218 } else { | 224 } else { |
| 219 thisKey = thisFile; | 225 thisKey = thisFile; |
| 236 | 242 |
| 237 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { | 243 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { |
| 238 error("batchinsert failed: no more room in database", thisFile); | 244 error("batchinsert failed: no more room in database", thisFile); |
| 239 } | 245 } |
| 240 | 246 |
| 241 // Linear scan of filenames check for pre-existing feature | 247 if(s.count(thisKey)) { |
| 242 unsigned alreadyInserted=0; | |
| 243 | |
| 244 for(unsigned k=0; k<dbH->numFiles; k++) | |
| 245 if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){ | |
| 246 alreadyInserted=1; | |
| 247 break; | |
| 248 } | |
| 249 | |
| 250 if(alreadyInserted) { | |
| 251 VERB_LOG(0, "key already exists in database: %s\n", thisKey); | 248 VERB_LOG(0, "key already exists in database: %s\n", thisKey); |
| 252 } else { | 249 } else { |
| 250 s.insert(thisKey); | |
| 253 // Make a track index table of features to file indexes | 251 // Make a track index table of features to file indexes |
| 254 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | 252 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); |
| 255 if(!numVectors) { | 253 if(!numVectors) { |
| 256 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); | 254 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); |
| 257 } | 255 } |
