comparison insert.cpp @ 316:25572f1bd37f large_adb

Adding large_adb support (up to 1M tracks)
author mas01mc
date Tue, 19 Aug 2008 14:27:21 +0000
parents 74824093c1c4
children c270d9e4659a
comparison
equal deleted inserted replaced
315:d2c56d4f841e 316:25572f1bd37f
9 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles; 9 maxfiles = fmaxfiles > tmaxfiles ? tmaxfiles : fmaxfiles;
10 return(dbH->numFiles < maxfiles); 10 return(dbH->numFiles < maxfiles);
11 } 11 }
12 12
13 bool audioDB::enough_data_space_free(off_t size) { 13 bool audioDB::enough_data_space_free(off_t size) {
14 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size); 14 return(dbH->timesTableOffset > dbH->dataOffset + dbH->length + size);
15 } 15 }
16 16
17 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) { 17 void audioDB::insert_data_vectors(off_t offset, void *buffer, size_t size) {
18 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET); 18 lseek(dbfid, dbH->dataOffset + offset, SEEK_SET);
19 write(dbfid, buffer, size); 19 write(dbfid, buffer, size);
20 } 20 }
21 21
22 void audioDB::insert(const char* dbName, const char* inFile) { 22 void audioDB::insert(const char* dbName, const char* inFile) {
23 forWrite = true; 23 forWrite = true;
24 initTables(dbName, inFile); 24 initTables(dbName, inFile);
25
26 if(dbH->flags & O2_FLAG_LARGE_ADB)
27 error("Single-feature inserts not allowed with LARGE audioDB instances");
25 28
26 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) 29 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
27 error("Must use timestamps with timestamped database","use --times"); 30 error("Must use timestamps with timestamped database","use --times");
28 31
29 if(!usingPower && (dbH->flags & O2_FLAG_POWER)) 32 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
47 break; 50 break;
48 } 51 }
49 52
50 if(alreadyInserted) { 53 if(alreadyInserted) {
51 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile); 54 VERB_LOG(0, "key already exists in database; ignoring: %s\n", inFile);
55 // FIXME: Do we need to munmap here (see below) ? MKC 18/08/08
52 return; 56 return;
53 } 57 }
54 58
55 // Make a track index table of features to file indexes 59 // Make a track index table of features to file indexes
56 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); 60 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
62 munmap(db,dbH->dbSize); 66 munmap(db,dbH->dbSize);
63 close(infid); 67 close(infid);
64 return; 68 return;
65 } 69 }
66 70
67 strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, key, strlen(key)); 71 INSERT_FILETABLE_STRING(fileTable, key);
68 72
69 off_t insertoffset = dbH->length;// Store current state 73 off_t insertoffset = dbH->length;// Store current state
70 74
71 // Check times status and insert times from file 75 // Check times status and insert times from file
72 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double)); 76 unsigned indexoffset = insertoffset/(dbH->dim*sizeof(double));
181 void audioDB::batchinsert(const char* dbName, const char* inFile) { 185 void audioDB::batchinsert(const char* dbName, const char* inFile) {
182 186
183 forWrite = true; 187 forWrite = true;
184 initDBHeader(dbName); 188 initDBHeader(dbName);
185 189
190 // Treat large ADB instances differently
191 if( dbH->flags & O2_FLAG_LARGE_ADB ){
192 batchinsert_large_adb(dbName, inFile) ;
193 return;
194 }
195
186 if(!key) 196 if(!key)
187 key=inFile; 197 key=inFile;
188 std::ifstream *filesIn = 0; 198 std::ifstream *filesIn = 0;
189 std::ifstream *keysIn = 0; 199 std::ifstream *keysIn = 0;
190 std::ifstream* thisTimesFile = 0; 200 std::ifstream* thisTimesFile = 0;
287 insertPowerData(numVectors, thispowerfd, powerdata); 297 insertPowerData(numVectors, thispowerfd, powerdata);
288 if (0 < thispowerfd) { 298 if (0 < thispowerfd) {
289 close(thispowerfd); 299 close(thispowerfd);
290 } 300 }
291 } 301 }
292 strncpy(fileTable + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE, thisKey, strlen(thisKey)); 302
293 303 INSERT_FILETABLE_STRING(fileTable, thisKey);
304
294 off_t insertoffset = dbH->length;// Store current state 305 off_t insertoffset = dbH->length;// Store current state
295 306
296 // Increment file count 307 // Increment file count
297 dbH->numFiles++; 308 dbH->numFiles++;
298 309
299 // Update Header information 310 // Update Header information
300 dbH->length+=(statbuf.st_size-sizeof(int)); 311 dbH->length+=(statbuf.st_size-sizeof(int));
301 312
302 // Update track to file index map 313 // Update track to file index map
303 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned)); 314 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
304 315
305 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int)); 316 insert_data_vectors(insertoffset, indata + sizeof(int), statbuf.st_size - sizeof(int));
306 317
307 // Norm the vectors on input if the database is already L2 normed 318 // Norm the vectors on input if the database is already L2 normed
308 if(dbH->flags & O2_FLAG_L2NORM) 319 if(dbH->flags & O2_FLAG_L2NORM)
309 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append 320 unitNormAndInsertL2((double *)(indata + sizeof(int)), dbH->dim, numVectors, 1); // append
332 delete keysIn; 343 delete keysIn;
333 344
334 // Report status 345 // Report status
335 status(dbName); 346 status(dbName);
336 } 347 }
348
349
350 // BATCHINSERT_LARGE_ADB
351 //
352 // This method inserts file pointers into the ADB instance rather than the actual feature data
353 //
354 // This method is intended for databases that are large enough to only support indexed query
355 // So exhaustive searching across all feature vectors will not be performed
356 //
357 // We insert featureFileName, [powerFileName], [timesFileName]
358 //
359 // l2norms and power sequence sums are calculated on-the-fly at INDEX and --lsh_exact QUERY time
360 //
361 // LIMITS:
362 //
363 // We impose an upper limit of 1M keys, 1M featureFiles, 1M powerFiles and 1M timesFiles
364 //
365 void audioDB::batchinsert_large_adb(const char* dbName, const char* inFile) {
366
367 if(!key)
368 key=inFile;
369 std::ifstream *filesIn = 0;
370 std::ifstream *keysIn = 0;
371 std::ifstream* thisTimesFile = 0;
372 int thispowerfd = 0;
373
374 if(!(filesIn = new std::ifstream(inFile)))
375 error("Could not open batch in file", inFile);
376 if(key && key!=inFile)
377 if(!(keysIn = new std::ifstream(key)))
378 error("Could not open batch key file",key);
379
380 if(!usingTimes && (dbH->flags & O2_FLAG_TIMES))
381 error("Must use timestamps with timestamped database","use --times");
382
383 if(!usingPower && (dbH->flags & O2_FLAG_POWER))
384 error("Must use power with power-enabled database", dbName);
385
386 unsigned totalVectors=0;
387 char *thisFile = new char[MAXSTR];
388 char *thisKey = 0;
389 if (key && (key != inFile)) {
390 thisKey = new char[MAXSTR];
391 }
392 char *thisTimesFileName = new char[MAXSTR];
393 char *thisPowerFileName = new char[MAXSTR];
394
395 std::set<std::string> s;
396
397 for (unsigned k = 0; k < dbH->numFiles; k++) {
398 s.insert(fileTable + k*O2_FILETABLE_ENTRY_SIZE);
399 }
400
401 do {
402 filesIn->getline(thisFile,MAXSTR);
403 if(key && key!=inFile) {
404 keysIn->getline(thisKey,MAXSTR);
405 } else {
406 thisKey = thisFile;
407 }
408 if(usingTimes) {
409 timesFile->getline(thisTimesFileName,MAXSTR);
410 }
411 if(usingPower) {
412 powerFile->getline(thisPowerFileName, MAXSTR);
413 }
414
415 if(filesIn->eof()) {
416 break;
417 }
418
419 initInputFile(thisFile, false);
420
421 if(!enough_per_file_space_free()) {
422 error("batchinsert failed: no more room for metadata", thisFile);
423 }
424
425 if(s.count(thisKey)) {
426 VERB_LOG(0, "key already exists in database: %s\n", thisKey);
427 } else {
428 s.insert(thisKey);
429 // Make a track index table of features to file indexes
430 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim);
431 if(!numVectors) {
432 VERB_LOG(0, "ignoring zero-length feature vector file: %s\n", thisKey);
433 }
434 else{
435 // Check that time-stamp file exists
436 if(usingTimes){
437 if(timesFile->eof()) {
438 error("not enough timestamp files in timesList", timesFileName);
439 }
440 thisTimesFile = new std::ifstream(thisTimesFileName,std::ios::in);
441 if(!thisTimesFile->is_open()) {
442 error("Cannot open timestamp file", thisTimesFileName);
443 }
444 if(thisTimesFile)
445 delete thisTimesFile;
446 }
447
448 // Check that power file exists
449 if (usingPower) {
450 if(powerFile->eof()) {
451 error("not enough power files in powerList", powerFileName);
452 }
453 thispowerfd = open(thisPowerFileName, O_RDONLY);
454 if (thispowerfd < 0) {
455 error("failed to open power file", thisPowerFileName);
456 }
457 if (0 < thispowerfd) {
458 close(thispowerfd);
459 }
460 }
461
462 // persist links to the feature files for reading from filesystem later
463
464 // Primary Keys
465 INSERT_FILETABLE_STRING(fileTable, thisKey);
466
467 // Feature Vector fileNames
468 INSERT_FILETABLE_STRING(dbH->dataOffset, thisFile);
469
470 // Time Stamp fileNames
471 if(usingTimes)
472 INSERT_FILETABLE_STRING(dbH->timesTableOffset, thisTimesFileName);
473
474
475 // Power fileNames
476 if(usingPower)
477 INSERT_FILETABLE_STRING(dbH->powerTableOffset, thisPowerFileName);
478
479 // Increment file count
480 dbH->numFiles++;
481
482 // Update Header information
483 dbH->length+=(statbuf.st_size-sizeof(int));
484
485 // Update track to file index map
486 memcpy (trackTable+dbH->numFiles-1, &numVectors, sizeof(unsigned));
487
488 totalVectors+=numVectors;
489
490 // Copy the header back to the database
491 memcpy (db, dbH, sizeof(dbTableHeaderT));
492 }
493 }
494 // CLEAN UP
495 munmap(indata,statbuf.st_size);
496 close(infid);
497 } while(!filesIn->eof());
498
499 VERB_LOG(0, "%s %s %u vectors %ju bytes.\n", COM_BATCHINSERT, dbName, totalVectors, (intmax_t) (totalVectors * dbH->dim * sizeof(double)));
500
501 delete [] thisPowerFileName;
502 if(key && (key != inFile)) {
503 delete [] thisKey;
504 }
505 delete [] thisFile;
506 delete [] thisTimesFileName;
507
508 delete filesIn;
509 delete keysIn;
510
511 // Report status
512 status(dbName);
513 }