Mercurial > hg > audiodb
diff query.cpp @ 324:c93be2f3a674
Merge of branches/large_adb -r 514:524 onto the trunk. No conflicts. Added LARGE_ADB support. Turn on with --ntracks 20001 or greater. Use --adb_feature_root to locate feature files at QUERY time. A bug fix in LSH indexing that was incorrectly thresholding large numbers of shingles.
author | mas01mc |
---|---|
date | Thu, 21 Aug 2008 21:28:33 +0000 |
parents | d2c56d4f841e |
children | 8f11ea4d9cd2 |
line wrap: on
line diff
--- a/query.cpp Tue Aug 12 14:25:51 2008 +0000 +++ b/query.cpp Thu Aug 21 21:28:33 2008 +0000 @@ -46,7 +46,7 @@ if(index_exists(dbName, radius, sequenceLength)){ char* indexName = index_get_name(dbName, radius, sequenceLength); lsh = index_allocate(indexName, false); - reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp())+1); + reporter = new trackSequenceQueryRadReporter(trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1); delete[] indexName; } else @@ -62,7 +62,7 @@ if(index_exists(dbName, radius, sequenceLength)){ char* indexName = index_get_name(dbName, radius, sequenceLength); lsh = index_allocate(indexName, false); - reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp())+1); + reporter = new trackSequenceQueryRadNNReporter(pointNN,trackNN, index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1); delete[] indexName; } else @@ -220,7 +220,7 @@ } } -void audioDB::read_data(int track, double **data_buffer_p, size_t *data_buffer_size_p) { +void audioDB::read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p) { if (trackTable[track] * sizeof(double) * dbH->dim > *data_buffer_size_p) { if(*data_buffer_p) { free(*data_buffer_p); @@ -235,7 +235,7 @@ } } - read(dbfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); + read(trkfid, *data_buffer_p, trackTable[track] * sizeof(double) * dbH->dim); } // These names deserve some unpicking. The names starting with a "q" @@ -341,48 +341,70 @@ VERB_LOG(1, "performing norms... "); - // Read query feature vectors from database - *qp = NULL; - lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); - size_t allocatedSize = 0; - read_data(queryIndex, qp, &allocatedSize); - // Consistency check on allocated memory and query feature size - if(*nvp*sizeof(double)*dbH->dim != allocatedSize) - error("Query memory allocation failed consitency check","set_up_query_from_key"); - - Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors - // Copy L2 norm partial-sum coefficients - assert(*qnp = new double[*nvp]); - memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); - sequence_sum(*qnp, *nvp, sequenceLength); - sequence_sqrt(*qnp, *nvp, sequenceLength); - - if( usingPower ){ - // Copy Power partial-sum coefficients - assert(*qpp = new double[*nvp]); - memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); - sequence_sum(*qpp, *nvp, sequenceLength); - sequence_average(*qpp, *nvp, sequenceLength); + // For LARGE_ADB load query features from file + if( dbH->flags & O2_FLAG_LARGE_ADB ){ + if(infid>0) + close(infid); + char* prefixedString = new char[O2_MAXFILESTR]; + char* tmpStr = prefixedString; + strncpy(prefixedString, featureFileNameTable+queryIndex*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); + prefix_name(&prefixedString, adb_feature_root); + if(tmpStr!=prefixedString) + delete[] tmpStr; + initInputFile(prefixedString, false); // nommap, file pointer at correct position + size_t allocatedSize = 0; + read_data(infid, queryIndex, qp, &allocatedSize); // over-writes qp and allocatedSize + // Consistency check on allocated memory and query feature size + if(*nvp*sizeof(double)*dbH->dim != allocatedSize) + error("Query memory allocation failed consitency check","set_up_query_from_key"); + // Allocated and calculate auxillary sequences: l2norm and power + init_track_aux_data(queryIndex, *qp, qnp, vqnp, qpp, vqpp); + } + else{ // Load from self-contained ADB database + // Read query feature vectors from database + *qp = NULL; + lseek(dbfid, dbH->dataOffset + trackOffsetTable[queryIndex] * sizeof(double), SEEK_SET); + size_t allocatedSize = 0; + read_data(dbfid, queryIndex, qp, &allocatedSize); + // Consistency check on allocated memory and query feature size + if(*nvp*sizeof(double)*dbH->dim != allocatedSize) + error("Query memory allocation failed consitency check","set_up_query_from_key"); + + Uns32T trackIndexOffset = trackOffsetTable[queryIndex]/dbH->dim; // Convert num data elements to num vectors + // Copy L2 norm partial-sum coefficients + assert(*qnp = new double[*nvp]); + memcpy(*qnp, l2normTable+trackIndexOffset, *nvp*sizeof(double)); + sequence_sum(*qnp, *nvp, sequenceLength); + sequence_sqrt(*qnp, *nvp, sequenceLength); + + if( usingPower ){ + // Copy Power partial-sum coefficients + assert(*qpp = new double[*nvp]); + memcpy(*qpp, powerTable+trackIndexOffset, *nvp*sizeof(double)); + sequence_sum(*qpp, *nvp, sequenceLength); + sequence_average(*qpp, *nvp, sequenceLength); + } + + if (usingTimes) { + unsigned int k; + *mqdp = 0.0; + double *querydurs = new double[*nvp]; + double *timesdata = new double[*nvp*2]; + assert(querydurs && timesdata); + memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double)); + for(k = 0; k < *nvp; k++) { + querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; + *mqdp += querydurs[k]; + } + *mqdp /= k; + + VERB_LOG(1, "mean query file duration: %f\n", *mqdp); + + delete [] querydurs; + delete [] timesdata; + } } - if (usingTimes) { - unsigned int k; - *mqdp = 0.0; - double *querydurs = new double[*nvp]; - double *timesdata = new double[*nvp*2]; - assert(querydurs && timesdata); - memcpy(timesdata, timesTable+trackIndexOffset, *nvp*sizeof(double)); - for(k = 0; k < *nvp; k++) { - querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; - *mqdp += querydurs[k]; - } - *mqdp /= k; - - VERB_LOG(1, "mean query file duration: %f\n", *mqdp); - - delete [] querydurs; - delete [] timesdata; - } // Defaults, for exhaustive search (!usingQueryPoint) *vqp = *qp; *vqnp = *qnp; @@ -487,7 +509,8 @@ // Compute database info // FIXME: we more than likely don't need very much of the database // so make a new method to build these values per-track or, even better, per-point - set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); + if( !( dbH->flags & O2_FLAG_LARGE_ADB) ) + set_up_db(&sNorm, &snPtr, &sPower, &spPtr, &meanDBdur, &dbVectors); VERB_LOG(1, "matching points..."); @@ -495,48 +518,82 @@ assert(trackNN>0 && trackNN<=O2_MAXNN); // We are guaranteed that the order of points is sorted by: - // qpos, trackID, spos + // trackID, spos, qpos // so we can be relatively efficient in initialization of track data. // Here we assume that points don't overlap, so we will use exhaustive dot - // product evaluation over the sequence + // product evaluation instead of memoization of partial sums which is used + // for exhaustive brute-force evaluation from smaller databases: e.g. query_loop() double dist; size_t data_buffer_size = 0; double *data_buffer = 0; - Uns32T trackOffset; - Uns32T trackIndexOffset; + Uns32T trackOffset = 0; + Uns32T trackIndexOffset = 0; Uns32T currentTrack = 0x80000000; // Initialize with a value outside of track index range Uns32T npairs = exact_evaluation_queue->size(); while(npairs--){ PointPair pp = exact_evaluation_queue->top(); - trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset - trackIndexOffset=trackOffset/dbH->dim; // num vectors offset - if((!(usingPower) || powers_acceptable(qpPtr[usingQueryPoint?0:pp.qpos], sPower[trackIndexOffset+pp.spos])) && - ((usingQueryPoint?0:pp.qpos) < numVectors-sequenceLength+1 && pp.spos < trackTable[pp.trackID]-sequenceLength+1)){ + // Large ADB track data must be loaded here for sPower + if(dbH->flags & O2_FLAG_LARGE_ADB){ + trackOffset=0; + trackIndexOffset=0; if(currentTrack!=pp.trackID){ + char* prefixedString = new char[O2_MAXFILESTR]; + char* tmpStr = prefixedString; + // On currentTrack change, allocate and load track data currentTrack=pp.trackID; - lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); - read_data(currentTrack, &data_buffer, &data_buffer_size); + SAFE_DELETE_ARRAY(sNorm); + SAFE_DELETE_ARRAY(sPower); + if(infid>0) + close(infid); + // Open and check dimensions of feature file + strncpy(prefixedString, featureFileNameTable+pp.trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); + prefix_name((char ** const) &prefixedString, adb_feature_root); + if (prefixedString!=tmpStr) + delete[] tmpStr; + initInputFile(prefixedString, false); // nommap, file pointer at correct position + // Load the feature vector data for current track into data_buffer + read_data(infid, pp.trackID, &data_buffer, &data_buffer_size); + // Load power and calculate power and l2norm sequence sums + init_track_aux_data(pp.trackID, data_buffer, &sNorm, &snPtr, &sPower, &spPtr); } - dist = dot_product_points(query+(usingQueryPoint?0:pp.qpos*dbH->dim), data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); + } + else{ + // These offsets are w.r.t. the entire database of feature vectors and auxillary variables + trackOffset=trackOffsetTable[pp.trackID]; // num data elements offset + trackIndexOffset=trackOffset/dbH->dim; // num vectors offset + } + Uns32T qPos = usingQueryPoint?0:pp.qpos;// index for query point + Uns32T sPos = trackIndexOffset+pp.spos; // index into l2norm table + // Test power thresholds before computing distance + if( ( !usingPower || powers_acceptable(qpPtr[qPos], sPower[sPos])) && + ( qPos<numVectors-sequenceLength+1 && pp.spos<trackTable[pp.trackID]-sequenceLength+1 ) ){ + // Non-large ADB track data is loaded inside power test for efficiency + if( !(dbH->flags & O2_FLAG_LARGE_ADB) && (currentTrack!=pp.trackID) ){ + // On currentTrack change, allocate and load track data + currentTrack=pp.trackID; + lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); + read_data(dbfid, currentTrack, &data_buffer, &data_buffer_size); + } + // Compute distance + dist = dot_product_points(query+qPos*dbH->dim, data_buffer+pp.spos*dbH->dim, dbH->dim*sequenceLength); + double qn = qnPtr[qPos]; + double sn = sNorm[sPos]; if(normalizedDistance) - dist = 2-(2/(qnPtr[usingQueryPoint?0:pp.qpos]*sNorm[trackIndexOffset+pp.spos]))*dist; + dist = 2 - (2/(qn*sn))*dist; else if(no_unit_norming) - dist = qnPtr[usingQueryPoint?0:pp.qpos]*qnPtr[usingQueryPoint?0:pp.qpos]+sNorm[trackIndexOffset+pp.spos]*sNorm[trackIndexOffset+pp.spos] - 2*dist; + dist = qn*qn + sn*sn - 2*dist; // else // dist = dist; if((!radius) || dist <= (O2_LSH_EXACT_MULT*radius+O2_DISTANCE_TOLERANCE)) - reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); + reporter->add_point(pp.trackID, pp.qpos, pp.spos, dist); } exact_evaluation_queue->pop(); } // Cleanup - if(sNorm) - delete sNorm; - if(sPower) - delete sPower; - if(meanDBdur) - delete meanDBdur; + SAFE_DELETE_ARRAY(sNorm); + SAFE_DELETE_ARRAY(sPower); + SAFE_DELETE_ARRAY(meanDBdur); } // A completely unprotected dot-product method @@ -555,6 +612,9 @@ double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0; double meanQdur; + if( dbH->flags & O2_FLAG_LARGE_ADB ) + error("error: LARGE_ADB requires indexed query"); + if(query_from_key) set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); else @@ -618,7 +678,7 @@ trackIndexOffset=trackOffset/dbH->dim; // numVectors offset - read_data(track, &data_buffer, &data_buffer_size); + read_data(dbfid, track, &data_buffer, &data_buffer_size); if(sequenceLength <= trackTable[track]) { // test for short sequences VERB_LOG(7,"%u.%jd.%u | ", track, (intmax_t) trackIndexOffset, trackTable[track]);