comparison insert.cpp @ 302:74824093c1c4

Implement O((M+N) log(M+N)) duplicate key detection in batchinsert, rather than naive O(N^2). Note that I haven't measured the constants in those complexity expressions; I am anticipating that 40000 is a sufficiently large N for this to matter. Write a test case for duplicate keys, too. Use 0037, since no-one else seems to be writing tests, and everything is merged onto the trunk these days anyway.
author mas01cr
date Mon, 04 Aug 2008 10:00:34 +0000
parents 34ce7f7a177d
children 25572f1bd37f c93be2f3a674
comparison
equal deleted inserted replaced
301:8764d114ce80 302:74824093c1c4
208 if (key && (key != inFile)) { 208 if (key && (key != inFile)) {
209 thisKey = new char[MAXSTR]; 209 thisKey = new char[MAXSTR];
210 } 210 }
211 char *thisTimesFileName = new char[MAXSTR]; 211 char *thisTimesFileName = new char[MAXSTR];
212 char *thisPowerFileName = new char[MAXSTR]; 212 char *thisPowerFileName = new char[MAXSTR];
213 213
214 do{ 214 std::set<std::string> s;
215
216 for (unsigned k = 0; k < dbH->numFiles; k++) {
217 s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
218 }
219
220 do {
215 filesIn->getline(thisFile,MAXSTR); 221 filesIn->getline(thisFile,MAXSTR);
216 if(key && key!=inFile) { 222 if(key && key!=inFile) {
217 keysIn->getline(thisKey,MAXSTR); 223 keysIn->getline(thisKey,MAXSTR);
218 } else { 224 } else {
219 thisKey = thisFile; 225 thisKey = thisFile;
236 242
237 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) { 243 if(!enough_data_space_free(statbuf.st_size - sizeof(int))) {
238 error("batchinsert failed: no more room in database", thisFile); 244 error("batchinsert failed: no more room in database", thisFile);
239 } 245 }
240 246
241 // Linear scan of filenames check for pre-existing feature 247 if(s.count(thisKey)) {
242 unsigned alreadyInserted=0;
243
244 for(unsigned k=0; k<dbH->numFiles; k++)
245 if(strncmp(fileTable + k*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)+1)==0){
246 alreadyInserted=1;
247 break;
248 }
249
250 if(alreadyInserted) {
251 VERB_LOG(0, "key already exists in database: %s\n", thisKey); 248 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
252 } else { 249 } else {
250 s.insert(thisKey);
253 // Make a track index table of features to file indexes 251 // Make a track index table of features to file indexes
254 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); 252 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
255 if(!numVectors) { 253 if(!numVectors) {
256 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); 254 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
257 } 255 }