Mercurial > hg > audiodb
comparison insert.cpp @ 316:25572f1bd37f large_adb
Adding large_adb support (up to 1M tracks)
author | mas01mc |
---|---|
date | Tue, 19 Aug 2008 14:27:21 +0000 |
parents | 74824093c1c4 |
children | c270d9e4659a |
comparison legend: equal | deleted | inserted | replaced
315:d2c56d4f841e | 316:25572f1bd37f |
---|---|
9 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles; | 9 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles; |
10 return(dbH->numFiles < maxfiles); | 10 return(dbH->numFiles < maxfiles); |
11 } | 11 } |
12 | 12 |
13 bool audioDB::enough_data_space_free(off_t size) { | 13 bool audioDB::enough_data_space_free(off_t size) { |
14 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); | 14 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); |
15 } | 15 } |
16 | 16 |
17 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { | 17 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { |
18 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); | 18 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); |
19 write(dbfid, buffer, size); | 19 write(dbfid, buffer, size); |
20 } | 20 } |
21 | 21 |
22 void audioDB::insert(const char* dbName, const char* inFile) { | 22 void audioDB::insert(const char* dbName, const char* inFile) { |
23 forWrite = true; | 23 forWrite = true; |
24 initTables(dbName, inFile); | 24 initTables(dbName, inFile); |
25 | |
26 if(dbH->flags & O2_FLAG_LARGE_ADB) | |
27 error("Single-feature inserts not allowed with LARGE audioDB instances"); | |
25 | 28 |
26 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) | 29 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) |
27 error("Must use timestamps with timestamped database","use --times"); | 30 error("Must use timestamps with timestamped database","use --times"); |
28 | 31 |
29 if(!usingPower && (dbH->flags & O2_FLAG_POWER)) | 32 if(!usingPower && (dbH->flags & O2_FLAG_POWER)) |
47 break; | 50 break; |
48 } | 51 } |
49 | 52 |
50 if(alreadyInserted) { | 53 if(alreadyInserted) { |
51 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); | 54 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); |
55 // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08 | |
52 return; | 56 return; |
53 } | 57 } |
54 | 58 |
55 // Make a track index table of features to file indexes | 59 // Make a track index table of features to file indexes |
56 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | 60 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); |
62 munmap(db,dbH->dbSize); | 66 munmap(db,dbH->dbSize); |
63 close(infid); | 67 close(infid); |
64 return; | 68 return; |
65 } | 69 } |
66 | 70 |
67 strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)); | 71 INSERT_FILETABLE_STRING(fileTable, key); |
68 | 72 |
69 off_t insertoffset = dbH->length;// Store current state | 73 off_t insertoffset = dbH->length;// Store current state |
70 | 74 |
71 // Check times status and insert times from file | 75 // Check times status and insert times from file |
72 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); | 76 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); |
181 void audioDB::batchinsert(const char* dbName, const char* inFile) { | 185 void audioDB::batchinsert(const char* dbName, const char* inFile) { |
182 | 186 |
183 forWrite = true; | 187 forWrite = true; |
184 initDBHeader(dbName); | 188 initDBHeader(dbName); |
185 | 189 |
190 // Treat large ADB instances differently | |
191 if( dbH->flags & O2_FLAG_LARGE_ADB ){ | |
192 batchinsert_large_adb(dbName, inFile) ; | |
193 return; | |
194 } | |
195 | |
186 if(!key) | 196 if(!key) |
187 key=inFile; | 197 key=inFile; |
188 std::ifstream *filesIn = 0; | 198 std::ifstream *filesIn = 0; |
189 std::ifstream *keysIn = 0; | 199 std::ifstream *keysIn = 0; |
190 std::ifstream* thisTimesFile = 0; | 200 std::ifstream* thisTimesFile = 0; |
287 insertPowerData(numVectors, thispowerfd, powerdata); | 297 insertPowerData(numVectors, thispowerfd, powerdata); |
288 if (0 < thispowerfd) { | 298 if (0 < thispowerfd) { |
289 close(thispowerfd); | 299 close(thispowerfd); |
290 } | 300 } |
291 } | 301 } |
292 strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)); | 302 |
293 | 303 INSERT_FILETABLE_STRING(fileTable, thisKey); |
304 | |
294 off_t insertoffset = dbH->length;// Store current state | 305 off_t insertoffset = dbH->length;// Store current state |
295 | 306 |
296 // Increment file count | 307 // Increment file count |
297 dbH->numFiles++; | 308 dbH->numFiles++; |
298 | 309 |
299 // Update Header information | 310 // Update Header information |
300 dbH->length+=(statbuf.st_size-sizeof(int)); | 311 dbH->length+=(statbuf.st_size-sizeof(int)); |
301 | 312 |
302 // Update track to file index map | 313 // Update track to file index map |
303 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); | 314 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); |
304 | 315 |
305 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); | 316 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); |
306 | 317 |
307 // Norm the vectors on input if the database is already L2 normed | 318 // Norm the vectors on input if the database is already L2 normed |
308 if(dbH->flags & O2_FLAG_L2NORM) | 319 if(dbH->flags & O2_FLAG_L2NORM) |
309 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append | 320 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append |
332 delete keysIn; | 343 delete keysIn; |
333 | 344 |
334 // Report status | 345 // Report status |
335 status(dbName); | 346 status(dbName); |
336 } | 347 } |
348 | |
349 | |
350 // BATCHINSERT_LARGE_ADB | |
351 // | |
352 // This method inserts file pointers into the ADB instance rather than the actual feature data | |
353 // | |
354 // This method is intended for databases that are large enough to only support indexed query | |
355 // So exhaustive searching across all feature vectors will not be performed | |
356 // | |
357 // We insert featureFileName, [powerFileName], [timesFileName] | |
358 // | |
359 // l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time | |
360 // | |
361 // LIMITS: | |
362 // | |
363 // We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles | |
364 // | |
365 void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) { | |
366 | |
367 if(!key) | |
368 key=inFile; | |
369 std::ifstream *filesIn = 0; | |
370 std::ifstream *keysIn = 0; | |
371 std::ifstream* thisTimesFile = 0; | |
372 int thispowerfd = 0; | |
373 | |
374 if(!(filesIn = new std::ifstream(inFile))) | |
375 error("Could not open batch in file", inFile); | |
376 if(key && key!=inFile) | |
377 if(!(keysIn = new std::ifstream(key))) | |
378 error("Could not open batch key file",key); | |
379 | |
380 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) | |
381 error("Must use timestamps with timestamped database","use --times"); | |
382 | |
383 if(!usingPower && (dbH->flags & O2_FLAG_POWER)) | |
384 error("Must use power with power-enabled database", dbName); | |
385 | |
386 unsigned totalVectors=0; | |
387 char *thisFile = new char[MAXSTR]; | |
388 char *thisKey = 0; | |
389 if (key && (key != inFile)) { | |
390 thisKey = new char[MAXSTR]; | |
391 } | |
392 char *thisTimesFileName = new char[MAXSTR]; | |
393 char *thisPowerFileName = new char[MAXSTR]; | |
394 | |
395 std::set<std::string> s; | |
396 | |
397 for (unsigned k = 0; k < dbH->numFiles; k++) { | |
398 s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE); | |
399 } | |
400 | |
401 do { | |
402 filesIn->getline(thisFile,MAXSTR); | |
403 if(key && key!=inFile) { | |
404 keysIn->getline(thisKey,MAXSTR); | |
405 } else { | |
406 thisKey = thisFile; | |
407 } | |
408 if(usingTimes) { | |
409 timesFile->getline(thisTimesFileName,MAXSTR); | |
410 } | |
411 if(usingPower) { | |
412 powerFile->getline(thisPowerFileName, MAXSTR); | |
413 } | |
414 | |
415 if(filesIn->eof()) { | |
416 break; | |
417 } | |
418 | |
419 initInputFile(thisFile, false); | |
420 | |
421 if(!enough_per_file_space_free()) { | |
422 error("batchinsert failed: no more room for metadata", thisFile); | |
423 } | |
424 | |
425 if(s.count(thisKey)) { | |
426 VERB_LOG(0, "key already exists in database: %s\n", thisKey); | |
427 } else { | |
428 s.insert(thisKey); | |
429 // Make a track index table of features to file indexes | |
430 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | |
431 if(!numVectors) { | |
432 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey); | |
433 } | |
434 else{ | |
435 // Check that time-stamp file exists | |
436 if(usingTimes){ | |
437 if(timesFile->eof()) { | |
438 error("not enough timestamp files in timesList", timesFileName); | |
439 } | |
440 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in); | |
441 if(!thisTimesFile->is_open()) { | |
442 error("Cannot open timestamp file", thisTimesFileName); | |
443 } | |
444 if(thisTimesFile) | |
445 delete thisTimesFile; | |
446 } | |
447 | |
448 // Check that power file exists | |
449 if (usingPower) { | |
450 if(powerFile->eof()) { | |
451 error("not enough power files in powerList", powerFileName); | |
452 } | |
453 thispowerfd = open(thisPowerFileName, O_RDONLY); | |
454 if (thispowerfd < 0) { | |
455 error("failed to open power file", thisPowerFileName); | |
456 } | |
457 if (0 < thispowerfd) { | |
458 close(thispowerfd); | |
459 } | |
460 } | |
461 | |
462 // persist links to the feature files for reading from filesystem later | |
463 | |
464 // Primary Keys | |
465 INSERT_FILETABLE_STRING(fileTable, thisKey); | |
466 | |
467 // Feature Vector fileNames | |
468 INSERT_FILETABLE_STRING(dbH->dataOffset, thisFile); | |
469 | |
470 // Time Stamp fileNames | |
471 if(usingTimes) | |
472 INSERT_FILETABLE_STRING(dbH->timesTableOffset, thisTimesFileName); | |
473 | |
474 | |
475 // Power fileNames | |
476 if(usingPower) | |
477 INSERT_FILETABLE_STRING(dbH->powerTableOffset, thisPowerFileName); | |
478 | |
479 // Increment file count | |
480 dbH->numFiles++; | |
481 | |
482 // Update Header information | |
483 dbH->length+=(statbuf.st_size-sizeof(int)); | |
484 | |
485 // Update track to file index map | |
486 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); | |
487 | |
488 totalVectors+=numVectors; | |
489 | |
490 // Copy the header back to the database | |
491 memcpy (db, dbH, sizeof(dbTableHeaderT)); | |
492 } | |
493 } | |
494 // CLEAN UP | |
495 munmap(indata,statbuf.st_size); | |
496 close(infid); | |
497 } while(!filesIn->eof()); | |
498 | |
499 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double))); | |
500 | |
501 delete [] thisPowerFileName; | |
502 if(key && (key != inFile)) { | |
503 delete [] thisKey; | |
504 } | |
505 delete [] thisFile; | |
506 delete [] thisTimesFileName; | |
507 | |
508 delete filesIn; | |
509 delete keysIn; | |
510 | |
511 // Report status | |
512 status(dbName); | |
513 } |