comparison query.cpp @ 324:c93be2f3a674

Merge of branches/large_adb -r 514:524 onto the trunk. No conflicts. Added LARGE_ADB support. Turn on with --ntracks 20001 or greater. Use --adb_feature_root to locate feature files at QUERY time. A bug fix in LSH indexing that was incorrectly thresholding large numbers of shingles.
author mas01mc
date Thu, 21 Aug 2008 21:28:33 +0000
parents d2c56d4f841e
children 8f11ea4d9cd2
comparison
equal deleted inserted replaced
315:d2c56d4f841e 324:c93be2f3a674
44 reporter = new trackAveragingReporter< std::less< NNresult > >(pointNN, trackNN, dbH->numFiles); 44 reporter = new trackAveragingReporter< std::less< NNresult > >(pointNN, trackNN, dbH->numFiles);
45 } else { 45 } else {
46 if(index_exists(dbName, radius, sequenceLength)){ 46 if(index_exists(dbName, radius, sequenceLength)){
47 char* indexName = index_get_name(dbName, radius, sequenceLength); 47 char* indexName = index_get_name(dbName, radius, sequenceLength);
48 lsh = index_allocate(indexName, false); 48 lsh = index_allocate(indexName, false);
49 reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp())+1); 49 reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1);
50 delete[] indexName; 50 delete[] indexName;
51 } 51 }
52 else 52 else
53 reporter = new trackSequenceQueryRadReporter(trackNN, dbH->numFiles); 53 reporter = new trackSequenceQueryRadReporter(trackNN, dbH->numFiles);
54 } 54 }
60 reporter = new trackSequenceQueryNNReporter< std::less < NNresult > >(pointNN, trackNN, dbH->numFiles); 60 reporter = new trackSequenceQueryNNReporter< std::less < NNresult > >(pointNN, trackNN, dbH->numFiles);
61 } else { 61 } else {
62 if(index_exists(dbName, radius, sequenceLength)){ 62 if(index_exists(dbName, radius, sequenceLength)){
63 char* indexName = index_get_name(dbName, radius, sequenceLength); 63 char* indexName = index_get_name(dbName, radius, sequenceLength);
64 lsh = index_allocate(indexName, false); 64 lsh = index_allocate(indexName, false);
65 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp())+1); 65 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1);
66 delete[] indexName; 66 delete[] indexName;
67 } 67 }
68 else 68 else
69 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, dbH->numFiles); 69 reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, dbH->numFiles);
70 } 70 }
218 delete[] DD[j]; 218 delete[] DD[j];
219 } 219 }
220 } 220 }
221 } 221 }
222 222
223 void audioDB::read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p) { 223 void audioDB::read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p) {
224 if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) { 224 if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) {
225 if(*data_buffer_p) { 225 if(*data_buffer_p) {
226 free(*data_buffer_p); 226 free(*data_buffer_p);
227 } 227 }
228 { 228 {
233 } 233 }
234 *data_buffer_p = (double *) tmp; 234 *data_buffer_p = (double *) tmp;
235 } 235 }
236 } 236 }
237 237
238 read(dbfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); 238 read(trkfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim);
239 } 239 }
240 240
241 // These names deserve some unpicking. The names starting with a "q" 241 // These names deserve some unpicking. The names starting with a "q"
242 // are pointers to the query, norm and power vectors; the names 242 // are pointers to the query, norm and power vectors; the names
243 // starting with "v" are things that will end up pointing to the 243 // starting with "v" are things that will end up pointing to the
339 error("Query shorter than requested sequence length", "maybe use -l"); 339 error("Query shorter than requested sequence length", "maybe use -l");
340 } 340 }
341 341
342 VERB_LOG(1, "performing norms... "); 342 VERB_LOG(1, "performing norms... ");
343 343
344 // Read query feature vectors from database 344 // For LARGE_ADB load query features from file
345 *qp = NULL; 345 if( dbH->flags & O2_FLAG_LARGE_ADB ){
346 lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); 346 if(infid>0)
347 size_t allocatedSize = 0; 347 close(infid);
348 read_data(queryIndex, qp, &allocatedSize); 348 char* prefixedString = new char[O2_MAXFILESTR];
349 // Consistency check on allocated memory and query feature size 349 char* tmpStr = prefixedString;
350 if(*nvp*sizeof(double)*dbH->dim != allocatedSize) 350 strncpy(prefixedString, featureFileNameTable+queryIndex*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
351 error("Query memory allocation failed consitency check","set_up_query_from_key"); 351 prefix_name(&prefixedString, adb_feature_root);
352 352 if(tmpStr!=prefixedString)
353 Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors 353 delete[] tmpStr;
354 // Copy L2 norm partial-sum coefficients 354 initInputFile(prefixedString, false); // nommap, file pointer at correct position
355 assert(*qnp = new double[*nvp]); 355 size_t allocatedSize = 0;
356 memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); 356 read_data(infid, queryIndex, qp, &allocatedSize); // over-writes qp and allocatedSize
357 sequence_sum(*qnp, *nvp, sequenceLength); 357 // Consistency check on allocated memory and query feature size
358 sequence_sqrt(*qnp, *nvp, sequenceLength); 358 if(*nvp*sizeof(double)*dbH->dim != allocatedSize)
359 359 error("Query memory allocation failed consitency check","set_up_query_from_key");
360 if( usingPower ){ 360 // Allocated and calculate auxillary sequences: l2norm and power
361 // Copy Power partial-sum coefficients 361 init_track_aux_data(queryIndex, *qp, qnp, vqnp, qpp, vqpp);
362 assert(*qpp = new double[*nvp]); 362 }
363 memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); 363 else{ // Load from self-contained ADB database
364 sequence_sum(*qpp, *nvp, sequenceLength); 364 // Read query feature vectors from database
365 sequence_average(*qpp, *nvp, sequenceLength); 365 *qp = NULL;
366 } 366 lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET);
367 367 size_t allocatedSize = 0;
368 if (usingTimes) { 368 read_data(dbfid, queryIndex, qp, &allocatedSize);
369 unsigned int k; 369 // Consistency check on allocated memory and query feature size
370 *mqdp = 0.0; 370 if(*nvp*sizeof(double)*dbH->dim != allocatedSize)
371 double *querydurs = new double[*nvp]; 371 error("Query memory allocation failed consitency check","set_up_query_from_key");
372 double *timesdata = new double[*nvp*2];
373 assert(querydurs && timesdata);
374 memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double));
375 for(k = 0; k < *nvp; k++) {
376 querydurs[k] = timesdata[2*k+1] - timesdata[2*k];
377 *mqdp += querydurs[k];
378 }
379 *mqdp /= k;
380 372
381 VERB_LOG(1, "mean query file duration: %f\n", *mqdp); 373 Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors
374 // Copy L2 norm partial-sum coefficients
375 assert(*qnp = new double[*nvp]);
376 memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double));
377 sequence_sum(*qnp, *nvp, sequenceLength);
378 sequence_sqrt(*qnp, *nvp, sequenceLength);
382 379
383 delete [] querydurs; 380 if( usingPower ){
384 delete [] timesdata; 381 // Copy Power partial-sum coefficients
385 } 382 assert(*qpp = new double[*nvp]);
383 memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double));
384 sequence_sum(*qpp, *nvp, sequenceLength);
385 sequence_average(*qpp, *nvp, sequenceLength);
386 }
387
388 if (usingTimes) {
389 unsigned int k;
390 *mqdp = 0.0;
391 double *querydurs = new double[*nvp];
392 double *timesdata = new double[*nvp*2];
393 assert(querydurs && timesdata);
394 memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double));
395 for(k = 0; k < *nvp; k++) {
396 querydurs[k] = timesdata[2*k+1] - timesdata[2*k];
397 *mqdp += querydurs[k];
398 }
399 *mqdp /= k;
400
401 VERB_LOG(1, "mean query file duration: %f\n", *mqdp);
402
403 delete [] querydurs;
404 delete [] timesdata;
405 }
406 }
407
386 // Defaults, for exhaustive search (!usingQueryPoint) 408 // Defaults, for exhaustive search (!usingQueryPoint)
387 *vqp = *qp; 409 *vqp = *qp;
388 *vqnp = *qnp; 410 *vqnp = *qnp;
389 *vqpp = *qpp; 411 *vqpp = *qpp;
390 412
485 return; 507 return;
486 508
487 // Compute database info 509 // Compute database info
488 // FIXME: we more than likely don't need very much of the database 510 // FIXME: we more than likely don't need very much of the database
489 // so make a new method to build these values per-track or, even better, per-point 511 // so make a new method to build these values per-track or, even better, per-point
490 set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); 512 if( !( dbH->flags & O2_FLAG_LARGE_ADB) )
513 set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors);
491 514
492 VERB_LOG(1, "matching points..."); 515 VERB_LOG(1, "matching points...");
493 516
494 assert(pointNN>0 && pointNN<=O2_MAXNN); 517 assert(pointNN>0 && pointNN<=O2_MAXNN);
495 assert(trackNN>0 && trackNN<=O2_MAXNN); 518 assert(trackNN>0 && trackNN<=O2_MAXNN);
496 519
497 // We are guaranteed that the order of points is sorted by: 520 // We are guaranteed that the order of points is sorted by:
498 // qpos, trackID, spos 521 // trackID, spos, qpos
499 // so we can be relatively efficient in initialization of track data. 522 // so we can be relatively efficient in initialization of track data.
500 // Here we assume that points don't overlap, so we will use exhaustive dot 523 // Here we assume that points don't overlap, so we will use exhaustive dot
501 // product evaluation over the sequence 524 // product evaluation instead of memoization of partial sums which is used
525 // for exhaustive brute-force evaluation from smaller databases: e.g. query_loop()
502 double dist; 526 double dist;
503 size_t data_buffer_size = 0; 527 size_t data_buffer_size = 0;
504 double *data_buffer = 0; 528 double *data_buffer = 0;
505 Uns32T trackOffset; 529 Uns32T trackOffset = 0;
506 Uns32T trackIndexOffset; 530 Uns32T trackIndexOffset = 0;
507 Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range 531 Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range
508 Uns32T npairs = exact_evaluation_queue->size(); 532 Uns32T npairs = exact_evaluation_queue->size();
509 while(npairs--){ 533 while(npairs--){
510 PointPair pp = exact_evaluation_queue->top(); 534 PointPair pp = exact_evaluation_queue->top();
511 trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset 535 // Large ADB track data must be loaded here for sPower
512 trackIndexOffset=trackOffset/dbH->dim; // num vectors offset 536 if(dbH->flags & O2_FLAG_LARGE_ADB){
513 if((!(usingPower) || powers_acceptable(qpPtr[usingQueryPoint?0:pp.qpos], sPower[trackIndexOffset+pp.spos])) && 537 trackOffset=0;
514 ((usingQueryPoint?0:pp.qpos) < numVectors-sequenceLength+1 && pp.spos < trackTable[pp.trackID]-sequenceLength+1)){ 538 trackIndexOffset=0;
515 if(currentTrack!=pp.trackID){ 539 if(currentTrack!=pp.trackID){
540 char* prefixedString = new char[O2_MAXFILESTR];
541 char* tmpStr = prefixedString;
542 // On currentTrack change, allocate and load track data
516 currentTrack=pp.trackID; 543 currentTrack=pp.trackID;
517 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); 544 SAFE_DELETE_ARRAY(sNorm);
518 read_data(currentTrack, &data_buffer, &data_buffer_size); 545 SAFE_DELETE_ARRAY(sPower);
519 } 546 if(infid>0)
520 dist = dot_product_points(query+(usingQueryPoint?0:pp.qpos*dbH->dim), data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); 547 close(infid);
548 // Open and check dimensions of feature file
549 strncpy(prefixedString, featureFileNameTable+pp.trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
550 prefix_name((char ** const) &prefixedString, adb_feature_root);
551 if (prefixedString!=tmpStr)
552 delete[] tmpStr;
553 initInputFile(prefixedString, false); // nommap, file pointer at correct position
554 // Load the feature vector data for current track into data_buffer
555 read_data(infid, pp.trackID, &data_buffer, &data_buffer_size);
556 // Load power and calculate power and l2norm sequence sums
557 init_track_aux_data(pp.trackID, data_buffer, &sNorm, &snPtr, &sPower, &spPtr);
558 }
559 }
560 else{
561 // These offsets are w.r.t. the entire database of feature vectors and auxillary variables
562 trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset
563 trackIndexOffset=trackOffset/dbH->dim; // num vectors offset
564 }
565 Uns32T qPos = usingQueryPoint?0:pp.qpos;// index for query point
566 Uns32T sPos = trackIndexOffset+pp.spos; // index into l2norm table
567 // Test power thresholds before computing distance
568 if( ( !usingPower || powers_acceptable(qpPtr[qPos], sPower[sPos])) &&
569 ( qPos<numVectors-sequenceLength+1 && pp.spos<trackTable[pp.trackID]-sequenceLength+1 ) ){
570 // Non-large ADB track data is loaded inside power test for efficiency
571 if( !(dbH->flags & O2_FLAG_LARGE_ADB) && (currentTrack!=pp.trackID) ){
572 // On currentTrack change, allocate and load track data
573 currentTrack=pp.trackID;
574 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET);
575 read_data(dbfid, currentTrack, &data_buffer, &data_buffer_size);
576 }
577 // Compute distance
578 dist = dot_product_points(query+qPos*dbH->dim, data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength);
579 double qn = qnPtr[qPos];
580 double sn = sNorm[sPos];
521 if(normalizedDistance) 581 if(normalizedDistance)
522 dist = 2-(2/(qnPtr[usingQueryPoint?0:pp.qpos]*sNorm[trackIndexOffset+pp.spos]))*dist; 582 dist = 2 - (2/(qn*sn))*dist;
523 else 583 else
524 if(no_unit_norming) 584 if(no_unit_norming)
525 dist = qnPtr[usingQueryPoint?0:pp.qpos]*qnPtr[usingQueryPoint?0:pp.qpos]+sNorm[trackIndexOffset+pp.spos]*sNorm[trackIndexOffset+pp.spos] - 2*dist; 585 dist = qn*qn + sn*sn - 2*dist;
526 // else 586 // else
527 // dist = dist; 587 // dist = dist;
528 if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) 588 if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE))
529 reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); 589 reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist);
530 } 590 }
531 exact_evaluation_queue->pop(); 591 exact_evaluation_queue->pop();
532 } 592 }
533 // Cleanup 593 // Cleanup
534 if(sNorm) 594 SAFE_DELETE_ARRAY(sNorm);
535 delete sNorm; 595 SAFE_DELETE_ARRAY(sPower);
536 if(sPower) 596 SAFE_DELETE_ARRAY(meanDBdur);
537 delete sPower;
538 if(meanDBdur)
539 delete meanDBdur;
540 } 597 }
541 598
542 // A completely unprotected dot-product method 599 // A completely unprotected dot-product method
543 // Caller is responsible for ensuring that memory is within bounds 600 // Caller is responsible for ensuring that memory is within bounds
544 inline double audioDB::dot_product_points(double* q, double* p, Uns32T L){ 601 inline double audioDB::dot_product_points(double* q, double* p, Uns32T L){
552 609
553 unsigned int numVectors; 610 unsigned int numVectors;
554 double *query, *query_data; 611 double *query, *query_data;
555 double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; 612 double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0;
556 double meanQdur; 613 double meanQdur;
614
615 if( dbH->flags & O2_FLAG_LARGE_ADB )
616 error("error: LARGE_ADB requires indexed query");
557 617
558 if(query_from_key) 618 if(query_from_key)
559 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); 619 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
560 else 620 else
561 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); 621 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors);
616 } 676 }
617 } 677 }
618 678
619 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset 679 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset
620 680
621 read_data(track, &data_buffer, &data_buffer_size); 681 read_data(dbfid, track, &data_buffer, &data_buffer_size);
622 if(sequenceLength <= trackTable[track]) { // test for short sequences 682 if(sequenceLength <= trackTable[track]) { // test for short sequences
623 683
624 VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]); 684 VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]);
625 685
626 initialize_arrays(track, numVectors, query, data_buffer, D, DD); 686 initialize_arrays(track, numVectors, query, data_buffer, D, DD);