mas01mc@292: // LSH indexing mas01mc@292: // mas01mc@292: // Construct a persistent LSH table structure mas01mc@292: // Store at the same location as dbName mas01mc@292: // Naming convention: mas01mc@292: // dbName.lsh.${radius}.${sequenceLength} mas01mc@292: // mas01mc@292: // mas01mc@292: // Author: Michael Casey mas01mc@292: // Date: 23 June 2008 mas01mc@324: // mas01mc@324: // 19th August 2008 - added O2_FLAG_LARGE_ADB support mas01mc@292: mas01mc@292: #include "audioDB.h" mas01cr@426: #include "audioDB-internals.h" mas01mc@292: mas01cr@458: typedef struct adb_qcallback { mas01cr@458: adb_t *adb; mas01cr@458: adb_qstate_internal_t *qstate; mas01cr@458: } adb_qcallback_t; mas01mc@292: mas01mc@292: /************************* LSH indexing and query initialization *****************/ mas01mc@292: mas01cr@460: /* FIXME: there are several things wrong with this: the memory mas01cr@460: * discipline isn't ideal, the radius printing is a bit lame, the name mas01cr@460: * getting will succeed or fail depending on whether the path was mas01cr@460: * relative or absolute -- but most importantly encoding all that mas01cr@460: * information in a filename is going to lose: it's impossible to mas01cr@460: * maintain backwards-compatibility. Instead we should probably store mas01cr@460: * the index metadata inside the audiodb instance. */ mas01cr@460: char *audiodb_index_get_name(const char *dbName, double radius, Uns32T sequenceLength) { mas01cr@460: char *indexName; mas01cr@460: if(strlen(dbName) > (MAXSTR - 32)) { mas01cr@460: return NULL; mas01cr@460: } mas01cr@460: indexName = new char[MAXSTR]; mas01mc@292: strncpy(indexName, dbName, MAXSTR); mas01mc@292: sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength); mas01mc@292: return indexName; mas01mc@292: } mas01mc@292: mas01cr@460: bool audiodb_index_exists(const char *dbName, double radius, Uns32T sequenceLength) { mas01cr@460: char *indexName = audiodb_index_get_name(dbName, radius, sequenceLength); mas01cr@460: if(!indexName) { mas01cr@460: return false; mas01cr@460: } mas01cr@460: struct stat st; mas01cr@460: if(stat(indexName, &st)) { mas01cr@460: delete [] indexName; mas01cr@460: return false; mas01cr@460: } mas01cr@460: /* FIXME: other stat checks here? */ mas01cr@460: /* FIXME: is there any better way to check whether we can open a mas01cr@460: * file for reading than by opening a file for reading? */ mas01cr@460: int fd = open(indexName, O_RDONLY); mas01cr@460: delete [] indexName; mas01cr@460: if(fd < 0) { mas01cr@460: return false; mas01cr@460: } else { mas01cr@460: close(fd); mas01mc@292: return true; mas01cr@460: } mas01mc@292: } mas01mc@292: mas01cr@465: /* FIXME: the indexName arg should be "const char *", but the LSH mas01cr@465: * library doesn't like that. mas01cr@465: */ mas01cr@465: LSH *audiodb_index_allocate(adb_t *adb, char *indexName, bool load_tables) { mas01cr@465: LSH *lsh; mas01cr@465: if(adb->cached_lsh) { mas01cr@465: if(!strncmp(adb->cached_lsh->get_indexName(), indexName, MAXSTR)) { mas01cr@465: return adb->cached_lsh; mas01cr@465: } else { mas01cr@465: delete adb->cached_lsh; mas01cr@465: } mas01mc@308: } mas01cr@465: lsh = new LSH(indexName, load_tables); mas01cr@465: if(load_tables) { mas01cr@465: adb->cached_lsh = lsh; mas01cr@465: } mas01cr@465: return lsh; mas01mc@308: } mas01mc@308: mas01cr@459: vector > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) { mas01cr@459: std::vector > *vv = new vector >(sz); mas01cr@459: for(Uns32T i=0 ; i < sz ; i++) { mas01cr@459: (*vv)[i]=vector(dim * seqLen); mas01cr@459: } mas01mc@292: return vv; mas01mc@292: } mas01mc@292: mas01cr@459: void audiodb_index_delete_shingles(vector > *vv) { mas01cr@459: delete vv; mas01cr@459: } mas01cr@459: mas01mc@292: /******************** LSH indexing audioDB database access forall s \in {S} ***********************/ mas01mc@292: mas01mc@292: // Prepare the AudioDB database for read access and allocate auxillary memory mas01mc@292: void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { mas01mc@324: if (!(dbH->flags & O2_FLAG_POWER)) { mas01mc@324: error("INDEXed database must be power-enabled", dbName); mas01mc@324: } mas01mc@324: mas01mc@325: double *snpp = 0, *sppp = 0; mas01mc@324: mas01mc@292: *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors mas01mc@292: *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector mas01mc@325: snpp = *snp; mas01mc@292: *spp = new double[*dvp]; // song powertable pointer mas01mc@292: sppp = *spp; mas01mc@325: mas01mc@324: memcpy(*snp, l2normTable, *dvp * sizeof(double)); mas01mc@292: memcpy(*spp, powerTable, *dvp * sizeof(double)); mas01mc@324: mas01mc@324: mas01mc@292: for(Uns32T i = 0; i < dbH->numFiles; i++){ mas01mc@292: if(trackTable[i] >= sequenceLength) { mas01cr@427: audiodb_sequence_sum(snpp, trackTable[i], sequenceLength); mas01cr@427: audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength); mas01mc@292: mas01cr@427: audiodb_sequence_sum(sppp, trackTable[i], sequenceLength); mas01cr@427: audiodb_sequence_average(sppp, trackTable[i], sequenceLength); mas01mc@292: } mas01mc@292: snpp += trackTable[i]; mas01mc@292: sppp += trackTable[i]; mas01mc@292: } mas01mc@324: mas01mc@292: *vsnp = *snp; mas01mc@292: *vspp = *spp; mas01mc@324: mas01mc@292: // Move the feature vector read pointer to start of fetures in database mas01mc@292: lseek(dbfid, dbH->dataOffset, SEEK_SET); mas01mc@292: } mas01mc@292: mas01mc@292: mas01cr@456: /********************* LSH shingle construction ***************************/ mas01cr@456: mas01cr@456: // Construct shingles out of a feature matrix mas01cr@456: // inputs: mas01cr@456: // idx is vector index in feature matrix mas01cr@456: // fvp is base feature matrix pointer double* [numVecs x dbH->dim] mas01cr@456: // mas01cr@456: // pre-conditions: mas01cr@456: // dbH->dim mas01cr@456: // sequenceLength mas01cr@456: // idx < numVectors - sequenceLength + 1 mas01cr@456: // mas01cr@456: // post-conditions: mas01cr@456: // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values mas01cr@456: mas01cr@456: static void audiodb_index_make_shingle(vector >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){ mas01cr@456: assert(idx<(*vv).size()); mas01cr@456: vector::iterator ve = (*vv)[idx].end(); mas01cr@456: vector::iterator vi = (*vv)[idx].begin(); mas01cr@456: // First feature vector in shingle mas01cr@456: if(idx == 0) { mas01cr@456: while(vi!=ve) { mas01cr@456: *vi++ = (float)(*fvp++); mas01cr@456: } mas01cr@456: } else { mas01cr@456: // Not first feature vector in shingle mas01cr@456: vector::iterator ui=(*vv)[idx-1].begin() + dim; mas01cr@456: // Previous seqLen-1 dim-vectors mas01cr@456: while(vi!=ve-dim) { mas01cr@456: *vi++ = *ui++; mas01cr@456: } mas01cr@456: // Move data pointer to next feature vector mas01cr@456: fvp += ( seqLen + idx - 1 ) * dim ; mas01cr@456: // New d-vector mas01cr@456: while(vi!=ve) { mas01cr@456: *vi++ = (float)(*fvp++); mas01cr@456: } mas01cr@456: } mas01cr@456: } mas01cr@456: mas01cr@456: // norm shingles mas01cr@456: // in-place norming, no deletions mas01cr@456: // If using power, return number of shingles above power threshold mas01cr@459: int audiodb_index_norm_shingles(vector >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) { mas01cr@456: int z = 0; // number of above-threshold shingles mas01cr@456: float l2norm; mas01cr@456: double power; mas01cr@456: float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2 mas01cr@456: float oneOverSqrtl2NormDivRad = oneOverRadius; mas01cr@459: Uns32T shingleSize = seqLen * dim; mas01cr@459: mas01cr@459: if(!spp) { mas01cr@459: return -1; mas01cr@459: } mas01cr@456: for(Uns32T a=0; a<(*vv).size(); a++){ mas01cr@456: l2norm = (float)(*snp++); mas01cr@459: if(normed_vectors) mas01cr@456: oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius; mas01cr@456: mas01cr@456: for(Uns32T b=0; b < shingleSize ; b++) mas01cr@456: (*vv)[a][b]*=oneOverSqrtl2NormDivRad; mas01cr@456: mas01cr@456: power = *spp++; mas01cr@459: if(use_pthreshold){ mas01cr@459: if (power >= pthreshold) mas01cr@456: z++; mas01cr@456: } mas01cr@456: else mas01cr@456: z++; mas01cr@456: } mas01cr@456: return z; mas01cr@456: } mas01cr@456: mas01cr@456: mas01mc@292: /************************ LSH indexing ***********************************/ mas01mc@292: void audioDB::index_index_db(const char* dbName){ mas01mc@292: char* newIndexName; mas01mc@292: double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; mas01mc@292: Uns32T dbVectors = 0; mas01mc@292: mas01mc@324: mas01mc@292: printf("INDEX: initializing header\n"); mas01mc@292: // Check if audioDB exists, initialize header and open database for read mas01mc@292: forWrite = false; mas01mc@292: initDBHeader(dbName); mas01mc@292: mas01mc@324: if(dbH->flags & O2_FLAG_POWER) mas01mc@324: usingPower = true; mas01mc@324: mas01mc@324: if(dbH->flags & O2_FLAG_TIMES) mas01mc@324: usingTimes = true; mas01mc@324: mas01cr@460: newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength); mas01cr@460: if(!newIndexName) { mas01cr@460: error("failed to get index name", dbName); mas01cr@460: } mas01mc@292: mas01mc@292: // Set unit norming flag override mas01mc@292: audioDB::normalizedDistance = !audioDB::no_unit_norming; mas01mc@292: mas01mc@327: VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim); mas01mc@327: VERB_LOG(1, "INDEX: R %f\n", radius); mas01mc@327: VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength); mas01mc@327: VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w); mas01mc@327: VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k); mas01mc@327: VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m); mas01mc@327: VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N); mas01mc@327: VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols); mas01mc@327: VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b); mas01mc@327: VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false"); mas01mc@292: mas01mc@292: if((lshfid = open(newIndexName,O_RDONLY))<0){ mas01mc@292: printf("INDEX: constructing new LSH index\n"); mas01mc@292: printf("INDEX: making index file %s\n", newIndexName); mas01mc@292: fflush(stdout); mas01mc@292: // Construct new LSH index mas01mc@292: lsh = new LSH((float)lsh_param_w, lsh_param_k, mas01mc@292: lsh_param_m, mas01mc@292: (Uns32T)(sequenceLength*dbH->dim), mas01mc@292: lsh_param_N, mas01mc@292: lsh_param_ncols, mas01mc@292: (float)radius); mas01mc@292: assert(lsh); mas01mc@292: mas01mc@292: Uns32T endTrack = lsh_param_b; mas01mc@292: if( endTrack > dbH->numFiles) mas01mc@292: endTrack = dbH->numFiles; mas01mc@292: // Insert up to lsh_param_b tracks mas01mc@324: if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@324: index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); mas01mc@324: } mas01mc@324: index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); mas01mc@292: lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); mas01mc@292: mas01mc@292: // Clean up mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01cr@465: } else { mas01mc@292: close(lshfid); mas01mc@292: } mas01mc@292: mas01mc@292: // Attempt to open LSH file mas01mc@292: if((lshfid = open(newIndexName,O_RDONLY))>0){ mas01mc@292: printf("INDEX: merging with existing LSH index\n"); mas01mc@292: fflush(stdout); mas01mc@340: char* mergeIndexName = newIndexName; mas01mc@292: mas01mc@292: // Get the lsh header info and find how many tracks are inserted already mas01mc@340: lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here mas01mc@292: assert(lsh); mas01cr@458: Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1; mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@292: mas01mc@340: // Insert up to lsh_param_b tracks mas01mc@340: if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@340: index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); mas01mc@340: } mas01mc@292: // This allows for updating index after more tracks are inserted into audioDB mas01mc@292: for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ mas01mc@292: mas01mc@292: Uns32T endTrack = startTrack + lsh_param_b; mas01mc@292: if( endTrack > dbH->numFiles) mas01mc@292: endTrack = dbH->numFiles; mas01mc@292: printf("Indexing track range: %d - %d\n", startTrack, endTrack); mas01mc@292: fflush(stdout); mas01mc@340: lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables mas01mc@292: assert(lsh); mas01mc@292: mas01mc@292: // Insert up to lsh_param_b database tracks mas01mc@292: index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); mas01mc@292: mas01mc@340: // Serialize to file (merging is performed here) mas01mc@340: lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@340: } mas01mc@292: mas01mc@292: close(lshfid); mas01mc@292: printf("INDEX: done constructing LSH index.\n"); mas01mc@292: fflush(stdout); mas01mc@292: mas01mc@292: } mas01mc@292: else{ mas01mc@292: error("Something's wrong with LSH index file"); mas01mc@292: exit(1); mas01mc@292: } mas01mc@292: mas01mc@324: delete[] newIndexName; mas01mc@324: delete[] sNorm; mas01mc@324: delete[] sPower; mas01mc@324: } mas01mc@292: mas01mc@292: mas01cr@405: void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { mas01cr@405: if(usingPower){ mas01cr@405: int one; mas01cr@405: unsigned int count; mas01cr@405: mas01cr@405: count = read(powerfd, &one, sizeof(unsigned int)); mas01cr@405: if (count != sizeof(unsigned int)) { mas01cr@405: error("powerfd read failed", "int", "read"); mas01cr@405: } mas01cr@405: if (one != 1) { mas01cr@405: error("dimensionality of power file not 1", powerFileName); mas01cr@405: } mas01cr@405: mas01cr@405: // FIXME: should check that the powerfile is the right size for mas01cr@405: // this. -- CSR, 2007-10-30 mas01cr@405: count = read(powerfd, powerdata, numVectors * sizeof(double)); mas01cr@405: if (count != numVectors * sizeof(double)) { mas01cr@405: error("powerfd read failed", "double", "read"); mas01cr@405: } mas01cr@405: } mas01cr@405: } mas01cr@405: mas01mc@324: // initialize auxillary track data from filesystem mas01mc@324: // pre-conditions: mas01mc@324: // dbH->flags & O2_FLAG_LARGE_ADB mas01mc@324: // feature data allocated and copied (fvp) mas01mc@324: // mas01mc@324: // post-conditions: mas01mc@324: // allocated power data mas01mc@324: // allocated l2norm data mas01mc@324: // mas01mc@324: void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ mas01mc@324: if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) mas01mc@324: error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); mas01mc@292: mas01mc@324: // Allocate and read the power sequence mas01mc@324: if(trackTable[trackID]>=sequenceLength){ mas01mc@324: mas01mc@324: char* prefixedString = new char[O2_MAXFILESTR]; mas01mc@324: char* tmpStr = prefixedString; mas01mc@324: // Open and check dimensions of power file mas01mc@324: strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); mas01mc@324: prefix_name((char ** const)&prefixedString, adb_feature_root); mas01mc@324: if(prefixedString!=tmpStr) mas01mc@324: delete[] tmpStr; mas01mc@324: powerfd = open(prefixedString, O_RDONLY); mas01mc@324: if (powerfd < 0) { mas01mc@324: error("failed to open power file", prefixedString); mas01mc@324: } mas01mc@324: if (fstat(powerfd, &statbuf) < 0) { mas01mc@324: error("fstat error finding size of power file", prefixedString, "fstat"); mas01mc@324: } mas01mc@324: mas01mc@324: if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) mas01mc@324: error("Dimension mismatch: numPowers != numVectors", prefixedString); mas01mc@324: mas01mc@324: *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values mas01mc@324: assert(*sPowerp); mas01mc@324: *spPtrp = *sPowerp; mas01mc@324: insertPowerData(trackTable[trackID], powerfd, *sPowerp); mas01mc@324: if (0 < powerfd) { mas01mc@324: close(powerfd); mas01mc@324: } mas01mc@324: mas01cr@427: audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); mas01cr@427: audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength); mas01mc@324: powerTable = 0; mas01mc@324: mas01mc@324: // Allocate and calculate the l2norm sequence mas01mc@324: *sNormpp = new double[trackTable[trackID]]; mas01mc@324: assert(*sNormpp); mas01mc@324: *snPtrp = *sNormpp; mas01cr@426: audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp); mas01cr@427: audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); mas01cr@427: audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); mas01mc@324: } mas01mc@292: } mas01mc@292: mas01mc@292: void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, mas01mc@292: double** fvpp, double** sNormpp,double** snPtrp, mas01mc@292: double** sPowerp, double** spPtrp){ mas01mc@292: size_t nfv = 0; mas01mc@292: double* fvp = 0; // Keep pointer for memory allocation and free() for track data mas01mc@292: Uns32T trackID = 0; mas01mc@292: mas01mc@292: VERB_LOG(1, "indexing tracks..."); mas01mc@292: mas01mc@324: int trackfd = dbfid; mas01mc@292: for(trackID = start_track ; trackID < end_track ; trackID++ ){ mas01mc@324: if( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@324: char* prefixedString = new char[O2_MAXFILESTR]; mas01mc@324: char* tmpStr = prefixedString; mas01mc@324: // Open and check dimensions of feature file mas01mc@324: strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); mas01mc@324: prefix_name((char ** const) &prefixedString, adb_feature_root); mas01mc@324: if(prefixedString!=tmpStr) mas01mc@324: delete[] tmpStr; mas01cr@454: initInputFile(prefixedString); mas01mc@324: trackfd = infid; mas01mc@324: } mas01cr@433: if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv)) mas01cr@433: error("failed to read data"); mas01mc@292: *fvpp = fvp; // Protect memory allocation and free() for track data mas01mc@324: mas01mc@324: if( dbH->flags & O2_FLAG_LARGE_ADB ) mas01mc@324: // Load power and calculate power and l2norm sequence sums mas01mc@324: init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); mas01mc@324: mas01mc@292: if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) mas01mc@292: break; mas01mc@324: if ( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@324: close(infid); mas01mc@324: delete[] *sNormpp; mas01mc@324: delete[] *sPowerp; mas01mc@324: *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; mas01mc@324: } mas01mc@324: } // end for(trackID = start_track ; ... ) mas01mc@292: std::cout << "finished inserting." << endl; mas01mc@292: } mas01mc@292: mas01mc@292: int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ mas01mc@292: // Loop over the current input track's vectors mas01cr@305: Uns32T numVecs = 0; mas01cr@305: if (trackTable[trackID] > O2_MAXTRACKLEN) { mas01cr@305: if (O2_MAXTRACKLEN < sequenceLength - 1) { mas01cr@305: numVecs = 0; mas01cr@305: } else { mas01cr@305: numVecs = O2_MAXTRACKLEN - sequenceLength + 1; mas01cr@305: } mas01cr@305: } else { mas01cr@305: if (trackTable[trackID] < sequenceLength - 1) { mas01cr@305: numVecs = 0; mas01cr@305: } else { mas01cr@305: numVecs = trackTable[trackID] - sequenceLength + 1; mas01cr@305: } mas01cr@305: } mas01mc@292: mas01mc@324: Uns32T numVecsAboveThreshold = 0, collisionCount = 0; mas01mc@324: if(numVecs){ mas01cr@459: std::vector > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength); mas01mc@324: mas01mc@324: for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) mas01cr@456: audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); mas01cr@459: int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold); mas01cr@459: if(vcount == -1) { mas01cr@459: audiodb_index_delete_shingles(vv); mas01cr@459: error("failed to norm shingles"); mas01cr@459: } mas01cr@459: numVecsAboveThreshold = vcount; mas01mc@324: collisionCount = index_insert_shingles(vv, trackID, *sppp); mas01cr@459: audiodb_index_delete_shingles(vv); mas01mc@324: } mas01cr@459: mas01mc@292: float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; mas01mc@292: mas01cr@459: /* audiodb_index_norm_shingles() only goes as far as the end of the mas01mc@292: sequence, which is right, but the space allocated is for the mas01mc@292: whole track. */ mas01mc@292: mas01mc@292: /* But numVecs will be O2_MAXTRACKLEN mas01mc@292: * So let's be certain the pointers are in the correct place mas01mc@292: */ mas01mc@292: mas01mc@324: if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@324: *snpp += trackTable[trackID]; mas01mc@324: *sppp += trackTable[trackID]; mas01mc@324: *fvpp += trackTable[trackID] * dbH->dim; mas01mc@324: } mas01mc@292: mas01mc@292: std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; mas01mc@292: std::cout.flush(); mas01mc@292: return true; mas01mc@292: } mas01mc@292: mas01mc@292: Uns32T audioDB::index_insert_shingles(vector >* vv, Uns32T trackID, double* spp){ mas01mc@292: Uns32T collisionCount = 0; mas01mc@292: cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; mas01mc@324: for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ mas01mc@324: if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) mas01cr@458: collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb))); mas01mc@324: spp+=sequenceHop; mas01mc@311: } mas01mc@292: return collisionCount; mas01mc@292: } mas01mc@292: mas01mc@292: /*********************** LSH retrieval ****************************/ mas01mc@292: mas01mc@292: mas01mc@292: // return true if indexed query performed else return false mas01cr@473: int audiodb_index_init_query(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate, bool corep) { mas01mc@292: mas01cr@465: uint32_t sequence_length = spec->qid.sequence_length; mas01cr@465: double radius = spec->refine.radius; mas01cr@465: if(!(audiodb_index_exists(adb->path, radius, sequence_length))) mas01mc@292: return false; mas01mc@292: mas01cr@465: char *indexName = audiodb_index_get_name(adb->path, radius, sequence_length); mas01cr@460: if(!indexName) { mas01cr@465: return false; mas01cr@460: } mas01mc@292: mas01cr@465: qstate->lsh = audiodb_index_allocate(adb, indexName, corep); mas01cr@465: mas01cr@465: /* FIXME: it would be nice if the LSH library didn't make me do mas01cr@465: * this. */ mas01cr@465: if((!corep) && (qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2)) { mas01cr@465: delete qstate->lsh; mas01cr@465: qstate->lsh = audiodb_index_allocate(adb, indexName, true); mas01mc@292: } mas01mc@292: mas01mc@292: delete[] indexName; mas01mc@292: return true; mas01mc@292: } mas01mc@292: mas01cr@458: void audiodb_index_add_point_approximate(void *user_data, Uns32T pointID, Uns32T qpos, float dist) { mas01cr@458: adb_qcallback_t *data = (adb_qcallback_t *) user_data; mas01cr@458: adb_t *adb = data->adb; mas01cr@458: adb_qstate_internal_t *qstate = data->qstate; mas01cr@458: uint32_t nbits = audiodb_lsh_n_point_bits(adb); mas01cr@458: uint32_t trackID = audiodb_index_to_track_id(pointID, nbits); mas01cr@458: uint32_t spos = audiodb_index_to_track_pos(pointID, nbits); mas01cr@458: std::set::iterator keys_end = qstate->allowed_keys->end(); mas01cr@458: if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) { mas01cr@424: adb_result_t r; mas01cr@458: r.key = (*adb->keys)[trackID].c_str(); mas01cr@424: r.dist = dist; mas01cr@424: r.qpos = qpos; mas01cr@424: r.ipos = spos; mas01cr@458: qstate->accumulator->add_point(&r); mas01cr@424: } mas01mc@292: } mas01mc@292: mas01cr@463: // Maintain a queue of points to pass to audiodb_query_queue_loop() mas01cr@463: // for exact evaluation mas01cr@458: void audiodb_index_add_point_exact(void *user_data, Uns32T pointID, Uns32T qpos, float dist) { mas01cr@458: adb_qcallback_t *data = (adb_qcallback_t *) user_data; mas01cr@458: adb_t *adb = data->adb; mas01cr@458: adb_qstate_internal_t *qstate = data->qstate; mas01cr@458: uint32_t nbits = audiodb_lsh_n_point_bits(adb); mas01cr@458: uint32_t trackID = audiodb_index_to_track_id(pointID, nbits); mas01cr@458: uint32_t spos = audiodb_index_to_track_pos(pointID, nbits); mas01cr@458: std::set::iterator keys_end = qstate->allowed_keys->end(); mas01cr@458: if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) { mas01cr@458: PointPair p(trackID, qpos, spos); mas01cr@458: qstate->exact_evaluation_queue->push(p); mas01cr@458: } mas01mc@292: } mas01mc@292: mas01cr@466: // return -1 on error mas01mc@292: // return 0: if index does not exist mas01mc@292: // return nqv: if index exists mas01cr@473: int audiodb_index_query_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) { mas01mc@292: mas01mc@324: double *query = 0, *query_data = 0; mas01cr@437: adb_qpointers_internal_t qpointers = {0}; mas01cr@437: mas01cr@458: adb_qcallback_t callback_data; mas01cr@458: callback_data.adb = adb; mas01cr@458: callback_data.qstate = qstate; mas01cr@458: mas01cr@466: void (*add_point_func)(void *, uint32_t, uint32_t, float); mas01mc@292: mas01cr@466: uint32_t sequence_length = spec->qid.sequence_length; mas01cr@466: bool normalized = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED); mas01cr@466: double radius = spec->refine.radius; mas01cr@466: bool use_absolute_threshold = spec->refine.flags & ADB_REFINE_ABSOLUTE_THRESHOLD; mas01cr@466: double absolute_threshold = spec->refine.absolute_threshold; mas01cr@431: mas01cr@468: if(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES) { mas01cr@468: add_point_func = &audiodb_index_add_point_approximate; mas01cr@468: } else { mas01cr@458: qstate->exact_evaluation_queue = new std::priority_queue; mas01cr@458: add_point_func = &audiodb_index_add_point_exact; mas01mc@292: } mas01mc@292: mas01cr@468: /* FIXME: this hardwired lsh_in_core is here to allow for a mas01cr@468: * transition period while the need for the argument is worked mas01cr@468: * through. Hopefully it will disappear again eventually. */ mas01cr@468: bool lsh_in_core = true; mas01cr@468: mas01cr@465: if(!audiodb_index_init_query(adb, spec, qstate, lsh_in_core)) { mas01mc@292: return 0; mas01cr@465: } mas01mc@292: mas01cr@466: char *database = audiodb_index_get_name(adb->path, radius, sequence_length); mas01cr@460: if(!database) { mas01cr@466: return -1; mas01cr@460: } mas01mc@292: mas01cr@444: if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) { mas01cr@466: delete [] database; mas01cr@466: return -1; mas01cr@444: } mas01mc@292: mas01cr@466: uint32_t Nq = (qpointers.nvectors > O2_MAXTRACKLEN ? O2_MAXTRACKLEN : qpointers.nvectors) - sequence_length + 1; mas01cr@466: std::vector > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequence_length); mas01cr@447: mas01mc@292: // Construct shingles from query features mas01cr@468: for(uint32_t pointID = 0; pointID < Nq; pointID++) { mas01cr@466: audiodb_index_make_shingle(vv, pointID, query, adb->header->dim, sequence_length); mas01cr@468: } mas01mc@292: mas01mc@292: // Normalize query vectors mas01cr@466: int vcount = audiodb_index_norm_shingles(vv, qpointers.l2norm, qpointers.power, adb->header->dim, sequence_length, radius, normalized, use_absolute_threshold, absolute_threshold); mas01cr@459: if(vcount == -1) { mas01cr@459: audiodb_index_delete_shingles(vv); mas01cr@466: delete [] database; mas01cr@466: return -1; mas01cr@459: } mas01cr@466: uint32_t numVecsAboveThreshold = vcount; mas01mc@292: mas01mc@292: // Nq contains number of inspected points in query file, mas01mc@292: // numVecsAboveThreshold is number of points with power >= absolute_threshold mas01cr@466: double *qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation mas01cr@468: if(!(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) && numVecsAboveThreshold) { mas01cr@466: if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) { mas01cr@466: qstate->lsh->retrieve_point((*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data); mas01cr@466: } else { mas01cr@466: qstate->lsh->serial_retrieve_point(database, (*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data); mas01cr@466: } mas01cr@466: } else if(numVecsAboveThreshold) { mas01cr@466: for(uint32_t pointID = 0; pointID < Nq; pointID++) { mas01cr@370: if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) { mas01cr@466: if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) { mas01cr@465: qstate->lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, &callback_data); mas01cr@370: } else { mas01cr@465: qstate->lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, &callback_data); mas01cr@370: } mas01cr@370: } mas01cr@466: } mas01cr@466: } mas01cr@459: audiodb_index_delete_shingles(vv); mas01mc@292: mas01cr@468: if(!(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES)) { mas01cr@463: audiodb_query_queue_loop(adb, spec, qstate, query, &qpointers); mas01cr@466: } mas01mc@292: mas01mc@292: // Clean up mas01mc@292: if(query_data) mas01mc@292: delete[] query_data; mas01cr@437: if(qpointers.l2norm_data) mas01cr@437: delete[] qpointers.l2norm_data; mas01cr@437: if(qpointers.power_data) mas01cr@437: delete[] qpointers.power_data; mas01cr@437: if(qpointers.mean_duration) mas01cr@437: delete[] qpointers.mean_duration; mas01mc@292: if(database) mas01mc@292: delete[] database; mas01cr@465: if(qstate->lsh != adb->cached_lsh) mas01cr@465: delete qstate->lsh; mas01mc@292: mas01mc@292: return Nq; mas01mc@292: } mas01mc@292: