Mercurial > hg > audiodb
comparison query.cpp @ 324:c93be2f3a674
Merge of branches/large_adb -r 514:524 onto the trunk. No conflicts. Added LARGE_ADB support. Turn on with --ntracks 20001 or greater. Use --adb_feature_root to locate feature files at QUERY time. A bug fix in LSH indexing that was incorrectly thresholding large numbers of shingles.
author | mas01mc |
---|---|
date | Thu, 21 Aug 2008 21:28:33 +0000 |
parents | d2c56d4f841e |
children | 8f11ea4d9cd2 |
comparison
equal
deleted
inserted
replaced
315:d2c56d4f841e | 324:c93be2f3a674 |
---|---|
44 reporter = new trackAveragingReporter< std::less< NNresult > >(pointNN, trackNN, dbH->numFiles); | 44 reporter = new trackAveragingReporter< std::less< NNresult > >(pointNN, trackNN, dbH->numFiles); |
45 } else { | 45 } else { |
46 if(index_exists(dbName, radius, sequenceLength)){ | 46 if(index_exists(dbName, radius, sequenceLength)){ |
47 char* indexName = index_get_name(dbName, radius, sequenceLength); | 47 char* indexName = index_get_name(dbName, radius, sequenceLength); |
48 lsh = index_allocate(indexName, false); | 48 lsh = index_allocate(indexName, false); |
49 reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp())+1); | 49 reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1); |
50 delete[] indexName; | 50 delete[] indexName; |
51 } | 51 } |
52 else | 52 else |
53 reporter = new trackSequenceQueryRadReporter(trackNN, dbH->numFiles); | 53 reporter = new trackSequenceQueryRadReporter(trackNN, dbH->numFiles); |
54 } | 54 } |
60 reporter = new trackSequenceQueryNNReporter< std::less < NNresult > >(pointNN, trackNN, dbH->numFiles); | 60 reporter = new trackSequenceQueryNNReporter< std::less < NNresult > >(pointNN, trackNN, dbH->numFiles); |
61 } else { | 61 } else { |
62 if(index_exists(dbName, radius, sequenceLength)){ | 62 if(index_exists(dbName, radius, sequenceLength)){ |
63 char* indexName = index_get_name(dbName, radius, sequenceLength); | 63 char* indexName = index_get_name(dbName, radius, sequenceLength); |
64 lsh = index_allocate(indexName, false); | 64 lsh = index_allocate(indexName, false); |
65 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp())+1); | 65 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1); |
66 delete[] indexName; | 66 delete[] indexName; |
67 } | 67 } |
68 else | 68 else |
69 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, dbH->numFiles); | 69 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, dbH->numFiles); |
70 } | 70 } |
218 delete[] DD[j]; | 218 delete[] DD[j]; |
219 } | 219 } |
220 } | 220 } |
221 } | 221 } |
222 | 222 |
223 void audioDB::read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p) { | 223 void audioDB::read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p) { |
224 if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) { | 224 if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) { |
225 if(*data_buffer_p) { | 225 if(*data_buffer_p) { |
226 free(*data_buffer_p); | 226 free(*data_buffer_p); |
227 } | 227 } |
228 { | 228 { |
233 } | 233 } |
234 *data_buffer_p = (double *) tmp; | 234 *data_buffer_p = (double *) tmp; |
235 } | 235 } |
236 } | 236 } |
237 | 237 |
238 read(dbfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); | 238 read(trkfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); |
239 } | 239 } |
240 | 240 |
241 // These names deserve some unpicking. The names starting with a "q" | 241 // These names deserve some unpicking. The names starting with a "q" |
242 // are pointers to the query, norm and power vectors; the names | 242 // are pointers to the query, norm and power vectors; the names |
243 // starting with "v" are things that will end up pointing to the | 243 // starting with "v" are things that will end up pointing to the |
339 error("Query shorter than requested sequence length", "maybe use -l"); | 339 error("Query shorter than requested sequence length", "maybe use -l"); |
340 } | 340 } |
341 | 341 |
342 VERB_LOG(1, "performing norms... "); | 342 VERB_LOG(1, "performing norms... "); |
343 | 343 |
344 // Read query feature vectors from database | 344 // For LARGE_ADB load query features from file |
345 *qp = NULL; | 345 if( dbH->flags & O2_FLAG_LARGE_ADB ){ |
346 lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); | 346 if(infid>0) |
347 size_t allocatedSize = 0; | 347 close(infid); |
348 read_data(queryIndex, qp, &allocatedSize); | 348 char* prefixedString = new char[O2_MAXFILESTR]; |
349 // Consistency check on allocated memory and query feature size | 349 char* tmpStr = prefixedString; |
350 if(*nvp*sizeof(double)*dbH->dim != allocatedSize) | 350 strncpy(prefixedString, featureFileNameTable+queryIndex*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); |
351 error("Query memory allocation failed consitency check","set_up_query_from_key"); | 351 prefix_name(&prefixedString, adb_feature_root); |
352 | 352 if(tmpStr!=prefixedString) |
353 Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors | 353 delete[] tmpStr; |
354 // Copy L2 norm partial-sum coefficients | 354 initInputFile(prefixedString, false); // nommap, file pointer at correct position |
355 assert(*qnp = new double[*nvp]); | 355 size_t allocatedSize = 0; |
356 memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); | 356 read_data(infid, queryIndex, qp, &allocatedSize); // over-writes qp and allocatedSize |
357 sequence_sum(*qnp, *nvp, sequenceLength); | 357 // Consistency check on allocated memory and query feature size |
358 sequence_sqrt(*qnp, *nvp, sequenceLength); | 358 if(*nvp*sizeof(double)*dbH->dim != allocatedSize) |
359 | 359 error("Query memory allocation failed consitency check","set_up_query_from_key"); |
360 if( usingPower ){ | 360 // Allocated and calculate auxillary sequences: l2norm and power |
361 // Copy Power partial-sum coefficients | 361 init_track_aux_data(queryIndex, *qp, qnp, vqnp, qpp, vqpp); |
362 assert(*qpp = new double[*nvp]); | 362 } |
363 memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); | 363 else{ // Load from self-contained ADB database |
364 sequence_sum(*qpp, *nvp, sequenceLength); | 364 // Read query feature vectors from database |
365 sequence_average(*qpp, *nvp, sequenceLength); | 365 *qp = NULL; |
366 } | 366 lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); |
367 | 367 size_t allocatedSize = 0; |
368 if (usingTimes) { | 368 read_data(dbfid, queryIndex, qp, &allocatedSize); |
369 unsigned int k; | 369 // Consistency check on allocated memory and query feature size |
370 *mqdp = 0.0; | 370 if(*nvp*sizeof(double)*dbH->dim != allocatedSize) |
371 double *querydurs = new double[*nvp]; | 371 error("Query memory allocation failed consitency check","set_up_query_from_key"); |
372 double *timesdata = new double[*nvp*2]; | |
373 assert(querydurs && timesdata); | |
374 memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double)); | |
375 for(k = 0; k < *nvp; k++) { | |
376 querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; | |
377 *mqdp += querydurs[k]; | |
378 } | |
379 *mqdp /= k; | |
380 | 372 |
381 VERB_LOG(1, "mean query file duration: %f\n", *mqdp); | 373 Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors |
374 // Copy L2 norm partial-sum coefficients | |
375 assert(*qnp = new double[*nvp]); | |
376 memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); | |
377 sequence_sum(*qnp, *nvp, sequenceLength); | |
378 sequence_sqrt(*qnp, *nvp, sequenceLength); | |
382 | 379 |
383 delete [] querydurs; | 380 if( usingPower ){ |
384 delete [] timesdata; | 381 // Copy Power partial-sum coefficients |
385 } | 382 assert(*qpp = new double[*nvp]); |
383 memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); | |
384 sequence_sum(*qpp, *nvp, sequenceLength); | |
385 sequence_average(*qpp, *nvp, sequenceLength); | |
386 } | |
387 | |
388 if (usingTimes) { | |
389 unsigned int k; | |
390 *mqdp = 0.0; | |
391 double *querydurs = new double[*nvp]; | |
392 double *timesdata = new double[*nvp*2]; | |
393 assert(querydurs && timesdata); | |
394 memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double)); | |
395 for(k = 0; k < *nvp; k++) { | |
396 querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; | |
397 *mqdp += querydurs[k]; | |
398 } | |
399 *mqdp /= k; | |
400 | |
401 VERB_LOG(1, "mean query file duration: %f\n", *mqdp); | |
402 | |
403 delete [] querydurs; | |
404 delete [] timesdata; | |
405 } | |
406 } | |
407 | |
386 // Defaults, for exhaustive search (!usingQueryPoint) | 408 // Defaults, for exhaustive search (!usingQueryPoint) |
387 *vqp = *qp; | 409 *vqp = *qp; |
388 *vqnp = *qnp; | 410 *vqnp = *qnp; |
389 *vqpp = *qpp; | 411 *vqpp = *qpp; |
390 | 412 |
485 return; | 507 return; |
486 | 508 |
487 // Compute database info | 509 // Compute database info |
488 // FIXME: we more than likely don't need very much of the database | 510 // FIXME: we more than likely don't need very much of the database |
489 // so make a new method to build these values per-track or, even better, per-point | 511 // so make a new method to build these values per-track or, even better, per-point |
490 set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); | 512 if( !( dbH->flags & O2_FLAG_LARGE_ADB) ) |
513 set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); | |
491 | 514 |
492 VERB_LOG(1, "matching points..."); | 515 VERB_LOG(1, "matching points..."); |
493 | 516 |
494 assert(pointNN>0 && pointNN<=O2_MAXNN); | 517 assert(pointNN>0 && pointNN<=O2_MAXNN); |
495 assert(trackNN>0 && trackNN<=O2_MAXNN); | 518 assert(trackNN>0 && trackNN<=O2_MAXNN); |
496 | 519 |
497 // We are guaranteed that the order of points is sorted by: | 520 // We are guaranteed that the order of points is sorted by: |
498 // qpos, trackID, spos | 521 // trackID, spos, qpos |
499 // so we can be relatively efficient in initialization of track data. | 522 // so we can be relatively efficient in initialization of track data. |
500 // Here we assume that points don't overlap, so we will use exhaustive dot | 523 // Here we assume that points don't overlap, so we will use exhaustive dot |
501 // product evaluation over the sequence | 524 // product evaluation instead of memoization of partial sums which is used |
525 // for exhaustive brute-force evaluation from smaller databases: e.g. query_loop() | |
502 double dist; | 526 double dist; |
503 size_t data_buffer_size = 0; | 527 size_t data_buffer_size = 0; |
504 double *data_buffer = 0; | 528 double *data_buffer = 0; |
505 Uns32T trackOffset; | 529 Uns32T trackOffset = 0; |
506 Uns32T trackIndexOffset; | 530 Uns32T trackIndexOffset = 0; |
507 Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range | 531 Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range |
508 Uns32T npairs = exact_evaluation_queue->size(); | 532 Uns32T npairs = exact_evaluation_queue->size(); |
509 while(npairs--){ | 533 while(npairs--){ |
510 PointPair pp = exact_evaluation_queue->top(); | 534 PointPair pp = exact_evaluation_queue->top(); |
511 trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset | 535 // Large ADB track data must be loaded here for sPower |
512 trackIndexOffset=trackOffset/dbH->dim; // num vectors offset | 536 if(dbH->flags & O2_FLAG_LARGE_ADB){ |
513 if((!(usingPower) || powers_acceptable(qpPtr[usingQueryPoint?0:pp.qpos], sPower[trackIndexOffset+pp.spos])) && | 537 trackOffset=0; |
514 ((usingQueryPoint?0:pp.qpos) < numVectors-sequenceLength+1 && pp.spos < trackTable[pp.trackID]-sequenceLength+1)){ | 538 trackIndexOffset=0; |
515 if(currentTrack!=pp.trackID){ | 539 if(currentTrack!=pp.trackID){ |
540 char* prefixedString = new char[O2_MAXFILESTR]; | |
541 char* tmpStr = prefixedString; | |
542 // On currentTrack change, allocate and load track data | |
516 currentTrack=pp.trackID; | 543 currentTrack=pp.trackID; |
517 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); | 544 SAFE_DELETE_ARRAY(sNorm); |
518 read_data(currentTrack, &data_buffer, &data_buffer_size); | 545 SAFE_DELETE_ARRAY(sPower); |
519 } | 546 if(infid>0) |
520 dist = dot_product_points(query+(usingQueryPoint?0:pp.qpos*dbH->dim), data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); | 547 close(infid); |
548 // Open and check dimensions of feature file | |
549 strncpy(prefixedString, featureFileNameTable+pp.trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); | |
550 prefix_name((char ** const) &prefixedString, adb_feature_root); | |
551 if (prefixedString!=tmpStr) | |
552 delete[] tmpStr; | |
553 initInputFile(prefixedString, false); // nommap, file pointer at correct position | |
554 // Load the feature vector data for current track into data_buffer | |
555 read_data(infid, pp.trackID, &data_buffer, &data_buffer_size); | |
556 // Load power and calculate power and l2norm sequence sums | |
557 init_track_aux_data(pp.trackID, data_buffer, &sNorm, &snPtr, &sPower, &spPtr); | |
558 } | |
559 } | |
560 else{ | |
561 // These offsets are w.r.t. the entire database of feature vectors and auxillary variables | |
562 trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset | |
563 trackIndexOffset=trackOffset/dbH->dim; // num vectors offset | |
564 } | |
565 Uns32T qPos = usingQueryPoint?0:pp.qpos;// index for query point | |
566 Uns32T sPos = trackIndexOffset+pp.spos; // index into l2norm table | |
567 // Test power thresholds before computing distance | |
568 if( ( !usingPower || powers_acceptable(qpPtr[qPos], sPower[sPos])) && | |
569 ( qPos<numVectors-sequenceLength+1 && pp.spos<trackTable[pp.trackID]-sequenceLength+1 ) ){ | |
570 // Non-large ADB track data is loaded inside power test for efficiency | |
571 if( !(dbH->flags & O2_FLAG_LARGE_ADB) && (currentTrack!=pp.trackID) ){ | |
572 // On currentTrack change, allocate and load track data | |
573 currentTrack=pp.trackID; | |
574 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); | |
575 read_data(dbfid, currentTrack, &data_buffer, &data_buffer_size); | |
576 } | |
577 // Compute distance | |
578 dist = dot_product_points(query+qPos*dbH->dim, data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); | |
579 double qn = qnPtr[qPos]; | |
580 double sn = sNorm[sPos]; | |
521 if(normalizedDistance) | 581 if(normalizedDistance) |
522 dist = 2-(2/(qnPtr[usingQueryPoint?0:pp.qpos]*sNorm[trackIndexOffset+pp.spos]))*dist; | 582 dist = 2 - (2/(qn*sn))*dist; |
523 else | 583 else |
524 if(no_unit_norming) | 584 if(no_unit_norming) |
525 dist = qnPtr[usingQueryPoint?0:pp.qpos]*qnPtr[usingQueryPoint?0:pp.qpos]+sNorm[trackIndexOffset+pp.spos]*sNorm[trackIndexOffset+pp.spos] - 2*dist; | 585 dist = qn*qn + sn*sn - 2*dist; |
526 // else | 586 // else |
527 // dist = dist; | 587 // dist = dist; |
528 if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) | 588 if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) |
529 reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); | 589 reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); |
530 } | 590 } |
531 exact_evaluation_queue->pop(); | 591 exact_evaluation_queue->pop(); |
532 } | 592 } |
533 // Cleanup | 593 // Cleanup |
534 if(sNorm) | 594 SAFE_DELETE_ARRAY(sNorm); |
535 delete sNorm; | 595 SAFE_DELETE_ARRAY(sPower); |
536 if(sPower) | 596 SAFE_DELETE_ARRAY(meanDBdur); |
537 delete sPower; | |
538 if(meanDBdur) | |
539 delete meanDBdur; | |
540 } | 597 } |
541 | 598 |
542 // A completely unprotected dot-product method | 599 // A completely unprotected dot-product method |
543 // Caller is responsible for ensuring that memory is within bounds | 600 // Caller is responsible for ensuring that memory is within bounds |
544 inline double audioDB::dot_product_points(double* q, double* p, Uns32T L){ | 601 inline double audioDB::dot_product_points(double* q, double* p, Uns32T L){ |
552 | 609 |
553 unsigned int numVectors; | 610 unsigned int numVectors; |
554 double *query, *query_data; | 611 double *query, *query_data; |
555 double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; | 612 double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; |
556 double meanQdur; | 613 double meanQdur; |
614 | |
615 if( dbH->flags & O2_FLAG_LARGE_ADB ) | |
616 error("error: LARGE_ADB requires indexed query"); | |
557 | 617 |
558 if(query_from_key) | 618 if(query_from_key) |
559 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); | 619 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); |
560 else | 620 else |
561 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); | 621 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); |
616 } | 676 } |
617 } | 677 } |
618 | 678 |
619 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset | 679 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset |
620 | 680 |
621 read_data(track, &data_buffer, &data_buffer_size); | 681 read_data(dbfid, track, &data_buffer, &data_buffer_size); |
622 if(sequenceLength <= trackTable[track]) { // test for short sequences | 682 if(sequenceLength <= trackTable[track]) { // test for short sequences |
623 | 683 |
624 VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]); | 684 VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]); |
625 | 685 |
626 initialize_arrays(track, numVectors, query, data_buffer, D, DD); | 686 initialize_arrays(track, numVectors, query, data_buffer, D, DD); |