mas01mc@292: // LSH indexing mas01mc@292: // mas01mc@292: // Construct a persistent LSH table structure mas01mc@292: // Store at the same location as dbName mas01mc@292: // Naming convention: mas01mc@292: // dbName.lsh.${radius}.${sequenceLength} mas01mc@292: // mas01mc@292: // mas01mc@292: // Author: Michael Casey mas01mc@292: // Date: 23 June 2008 mas01mc@319: // mas01mc@319: // 19th August 2008 - added O2_FLAG_LARGE_ADB support mas01mc@292: mas01mc@292: #include "audioDB.h" mas01mc@292: #include "ReporterBase.h" mas01mc@292: mas01mc@292: mas01mc@292: /************************* LSH point index to audioDB conversion *****************/ mas01mc@319: Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){ mas01mc@319: assert(nPntBits); mas01mc@319: return lshID>>nPntBits; mas01mc@292: } mas01mc@292: mas01mc@319: Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){ mas01mc@319: assert(nPntBits); mas01mc@319: return lshID&((1< (MAXSTR - 32)) mas01mc@292: error("dbName is too long for LSH index filename appendages"); mas01mc@292: strncpy(indexName, dbName, MAXSTR); mas01mc@292: sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength); mas01mc@292: return indexName; mas01mc@292: } mas01mc@292: mas01mc@292: // return true if index exists else return false mas01mc@292: int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){ mas01mc@292: // Test to see if file exists mas01mc@292: char* indexName = index_get_name(dbName, radius, sequenceLength); mas01mc@292: lshfid = open (indexName, O_RDONLY); mas01mc@292: delete[] indexName; mas01mc@292: close(lshfid); mas01mc@292: mas01mc@292: if(lshfid<0) mas01mc@292: return false; mas01mc@292: else mas01mc@292: return true; mas01mc@292: } mas01mc@292: mas01mc@321: // If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName()) mas01mc@321: // If they match, i.e. path+dbName_resident == path+dbName_requested, use mas01mc@321: // the memory-resident index. mas01mc@321: // Else allocate a new LSH instance and load the index from disk mas01mc@308: LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){ mas01mc@308: LSH* gIndx=SERVER_LSH_INDEX_SINGLETON; mas01mc@308: if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) ) mas01mc@308: audioDB::lsh = gIndx; // Use the global SERVER resident index mas01mc@308: else{ mas01mc@308: if(audioDB::lsh) mas01mc@308: delete audioDB::lsh; mas01mc@308: audioDB::lsh = new LSH(indexName, load_hashTables); mas01mc@308: } mas01mc@308: assert(audioDB::lsh); mas01mc@308: return audioDB::lsh; mas01mc@308: } mas01mc@308: mas01mc@292: vector >* audioDB::index_initialize_shingles(Uns32T sz){ mas01mc@292: if(vv) mas01mc@292: delete vv; mas01mc@292: vv = new vector >(sz); mas01mc@292: for(Uns32T i=0 ; i < sz ; i++) mas01mc@292: (*vv)[i]=vector(dbH->dim*sequenceLength); // allocate shingle storage mas01mc@292: return vv; mas01mc@292: } mas01mc@292: mas01mc@292: /******************** LSH indexing audioDB database access forall s \in {S} ***********************/ mas01mc@292: mas01mc@292: // Prepare the AudioDB database for read access and allocate auxillary memory mas01mc@292: void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { mas01mc@319: if (!(dbH->flags & O2_FLAG_POWER)) { mas01mc@319: error("INDEXed database must be power-enabled", dbName); mas01mc@319: } mas01mc@319: mas01mc@319: double *snpp = *snp, *sppp = 0; mas01mc@319: mas01mc@292: *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors mas01mc@292: *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector mas01mc@292: *spp = new double[*dvp]; // song powertable pointer mas01mc@292: sppp = *spp; mas01mc@319: memcpy(*snp, l2normTable, *dvp * sizeof(double)); mas01mc@292: memcpy(*spp, powerTable, *dvp * sizeof(double)); mas01mc@319: mas01mc@319: mas01mc@292: for(Uns32T i = 0; i < dbH->numFiles; i++){ mas01mc@292: if(trackTable[i] >= sequenceLength) { mas01mc@292: sequence_sum(snpp, trackTable[i], sequenceLength); mas01mc@292: sequence_sqrt(snpp, trackTable[i], sequenceLength); mas01mc@292: mas01mc@292: sequence_sum(sppp, trackTable[i], sequenceLength); mas01mc@292: sequence_average(sppp, trackTable[i], sequenceLength); mas01mc@292: } mas01mc@292: snpp += trackTable[i]; mas01mc@292: sppp += trackTable[i]; mas01mc@292: } mas01mc@319: mas01mc@292: *vsnp = *snp; mas01mc@292: *vspp = *spp; mas01mc@319: mas01mc@292: // Move the feature vector read pointer to start of fetures in database mas01mc@292: lseek(dbfid, dbH->dataOffset, SEEK_SET); mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@292: /************************ LSH indexing ***********************************/ mas01mc@292: void audioDB::index_index_db(const char* dbName){ mas01mc@292: char* newIndexName; mas01mc@292: double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; mas01mc@292: Uns32T dbVectors = 0; mas01mc@292: mas01mc@320: mas01mc@292: printf("INDEX: initializing header\n"); mas01mc@292: // Check if audioDB exists, initialize header and open database for read mas01mc@292: forWrite = false; mas01mc@292: initDBHeader(dbName); mas01mc@292: mas01mc@320: if(dbH->flags & O2_FLAG_POWER) mas01mc@320: usingPower = true; mas01mc@320: mas01mc@320: if(dbH->flags & O2_FLAG_TIMES) mas01mc@320: usingTimes = true; mas01mc@320: mas01mc@292: newIndexName = index_get_name(dbName, radius, sequenceLength); mas01mc@292: mas01mc@292: // Set unit norming flag override mas01mc@292: audioDB::normalizedDistance = !audioDB::no_unit_norming; mas01mc@292: mas01mc@320: printf("INDEX: dim %d\n", (int)dbH->dim); mas01mc@292: printf("INDEX: R %f\n", radius); mas01mc@292: printf("INDEX: seqlen %d\n", sequenceLength); mas01mc@292: printf("INDEX: lsh_w %f\n", lsh_param_w); mas01mc@292: printf("INDEX: lsh_k %d\n", lsh_param_k); mas01mc@292: printf("INDEX: lsh_m %d\n", lsh_param_m); mas01mc@292: printf("INDEX: lsh_N %d\n", lsh_param_N); mas01mc@296: printf("INDEX: lsh_C %d\n", lsh_param_ncols); mas01mc@292: printf("INDEX: lsh_b %d\n", lsh_param_b); mas01mc@292: printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); mas01mc@292: fflush(stdout); mas01mc@292: mas01mc@292: mas01mc@292: if((lshfid = open(newIndexName,O_RDONLY))<0){ mas01mc@292: printf("INDEX: constructing new LSH index\n"); mas01mc@292: printf("INDEX: making index file %s\n", newIndexName); mas01mc@292: fflush(stdout); mas01mc@292: // Construct new LSH index mas01mc@292: lsh = new LSH((float)lsh_param_w, lsh_param_k, mas01mc@292: lsh_param_m, mas01mc@292: (Uns32T)(sequenceLength*dbH->dim), mas01mc@292: lsh_param_N, mas01mc@292: lsh_param_ncols, mas01mc@292: (float)radius); mas01mc@292: assert(lsh); mas01mc@292: mas01mc@292: Uns32T endTrack = lsh_param_b; mas01mc@292: if( endTrack > dbH->numFiles) mas01mc@292: endTrack = dbH->numFiles; mas01mc@292: // Insert up to lsh_param_b tracks mas01mc@320: if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@320: index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); mas01mc@319: } mas01mc@320: index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); mas01mc@292: lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); mas01mc@292: mas01mc@292: // Clean up mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@292: close(lshfid); mas01mc@292: } mas01mc@292: mas01mc@292: // Attempt to open LSH file mas01mc@292: if((lshfid = open(newIndexName,O_RDONLY))>0){ mas01mc@292: printf("INDEX: merging with existing LSH index\n"); mas01mc@292: fflush(stdout); mas01mc@292: mas01mc@292: // Get the lsh header info and find how many tracks are inserted already mas01mc@292: lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here mas01mc@292: assert(lsh); mas01mc@319: Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1; mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@292: mas01mc@292: // This allows for updating index after more tracks are inserted into audioDB mas01mc@292: for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ mas01mc@292: mas01mc@292: Uns32T endTrack = startTrack + lsh_param_b; mas01mc@292: if( endTrack > dbH->numFiles) mas01mc@292: endTrack = dbH->numFiles; mas01mc@292: printf("Indexing track range: %d - %d\n", startTrack, endTrack); mas01mc@292: fflush(stdout); mas01mc@292: lsh = new LSH(newIndexName, lsh_in_core); // Initialize core memory for LSH tables mas01mc@292: assert(lsh); mas01mc@292: mas01mc@292: // Insert up to lsh_param_b database tracks mas01mc@292: index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); mas01mc@292: mas01mc@292: // Serialize to file mas01mc@292: lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@292: } mas01mc@292: mas01mc@292: close(lshfid); mas01mc@292: printf("INDEX: done constructing LSH index.\n"); mas01mc@292: fflush(stdout); mas01mc@292: mas01mc@292: } mas01mc@292: else{ mas01mc@292: error("Something's wrong with LSH index file"); mas01mc@292: exit(1); mas01mc@292: } mas01mc@292: mas01mc@292: delete[] newIndexName; mas01mc@320: delete[] sNorm; mas01mc@320: delete[] sPower; mas01mc@292: } mas01mc@292: mas01mc@319: mas01mc@319: // initialize auxillary track data from filesystem mas01mc@319: // pre-conditions: mas01mc@319: // dbH->flags & O2_FLAG_LARGE_ADB mas01mc@319: // feature data allocated and copied (fvp) mas01mc@319: // mas01mc@319: // post-conditions: mas01mc@319: // allocated power data mas01mc@319: // allocated l2norm data mas01mc@319: // mas01mc@319: void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ mas01mc@319: if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) mas01mc@319: error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); mas01mc@319: mas01mc@319: // Allocate and read the power sequence mas01mc@319: if(trackTable[trackID]>=sequenceLength){ mas01mc@321: mas01mc@321: char* prefixedString = new char[O2_MAXFILESTR]; mas01mc@321: char* tmpStr = prefixedString; mas01mc@319: // Open and check dimensions of power file mas01mc@321: strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); mas01mc@321: prefix_name((char ** const)&prefixedString, adb_feature_root); mas01mc@321: if(prefixedString!=tmpStr) mas01mc@321: delete[] tmpStr; mas01mc@321: powerfd = open(prefixedString, O_RDONLY); mas01mc@319: if (powerfd < 0) { mas01mc@321: error("failed to open power file", prefixedString); mas01mc@319: } mas01mc@319: if (fstat(powerfd, &statbuf) < 0) { mas01mc@321: error("fstat error finding size of power file", prefixedString, "fstat"); mas01mc@319: } mas01mc@319: mas01mc@319: if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) mas01mc@321: error("Dimension mismatch: numPowers != numVectors", prefixedString); mas01mc@320: mas01mc@319: *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values mas01mc@319: assert(*sPowerp); mas01mc@319: *spPtrp = *sPowerp; mas01mc@319: insertPowerData(trackTable[trackID], powerfd, *sPowerp); mas01mc@319: if (0 < powerfd) { mas01mc@319: close(powerfd); mas01mc@319: } mas01mc@319: mas01mc@319: sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); mas01mc@319: sequence_average(*sPowerp, trackTable[trackID], sequenceLength); mas01mc@319: powerTable = 0; mas01mc@319: mas01mc@319: // Allocate and calculate the l2norm sequence mas01mc@319: *sNormpp = new double[trackTable[trackID]]; mas01mc@319: assert(*sNormpp); mas01mc@319: *snPtrp = *sNormpp; mas01mc@319: unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp); mas01mc@319: sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); mas01mc@319: sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); mas01mc@319: } mas01mc@319: } mas01mc@319: mas01mc@292: void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, mas01mc@292: double** fvpp, double** sNormpp,double** snPtrp, mas01mc@292: double** sPowerp, double** spPtrp){ mas01mc@292: size_t nfv = 0; mas01mc@292: double* fvp = 0; // Keep pointer for memory allocation and free() for track data mas01mc@292: Uns32T trackID = 0; mas01mc@292: mas01mc@292: VERB_LOG(1, "indexing tracks..."); mas01mc@292: mas01mc@319: int trackfd = dbfid; mas01mc@292: for(trackID = start_track ; trackID < end_track ; trackID++ ){ mas01mc@319: if( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@321: char* prefixedString = new char[O2_MAXFILESTR]; mas01mc@321: char* tmpStr = prefixedString; mas01mc@319: // Open and check dimensions of feature file mas01mc@321: strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); mas01mc@321: prefix_name((char ** const) &prefixedString, adb_feature_root); mas01mc@321: if(prefixedString!=tmpStr) mas01mc@321: delete[] tmpStr; mas01mc@321: initInputFile(prefixedString, false); // nommap, file pointer at correct position mas01mc@319: trackfd = infid; mas01mc@319: } mas01mc@319: read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv mas01mc@292: *fvpp = fvp; // Protect memory allocation and free() for track data mas01mc@319: mas01mc@319: if( dbH->flags & O2_FLAG_LARGE_ADB ) mas01mc@319: // Load power and calculate power and l2norm sequence sums mas01mc@319: init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); mas01mc@319: mas01mc@292: if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) mas01mc@292: break; mas01mc@319: if ( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@319: close(infid); mas01mc@320: delete[] *sNormpp; mas01mc@320: delete[] *sPowerp; mas01mc@319: *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; mas01mc@319: } mas01mc@319: } // end for(trackID = start_track ; ... ) mas01mc@292: std::cout << "finished inserting." << endl; mas01mc@292: } mas01mc@292: mas01mc@292: int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ mas01mc@292: // Loop over the current input track's vectors mas01cr@305: Uns32T numVecs = 0; mas01cr@305: if (trackTable[trackID] > O2_MAXTRACKLEN) { mas01cr@305: if (O2_MAXTRACKLEN < sequenceLength - 1) { mas01cr@305: numVecs = 0; mas01cr@305: } else { mas01cr@305: numVecs = O2_MAXTRACKLEN - sequenceLength + 1; mas01cr@305: } mas01cr@305: } else { mas01cr@305: if (trackTable[trackID] < sequenceLength - 1) { mas01cr@305: numVecs = 0; mas01cr@305: } else { mas01cr@305: numVecs = trackTable[trackID] - sequenceLength + 1; mas01cr@305: } mas01cr@305: } mas01mc@292: mas01mc@319: Uns32T numVecsAboveThreshold = 0, collisionCount = 0; mas01mc@319: if(numVecs){ mas01mc@319: vv = index_initialize_shingles(numVecs); mas01mc@319: mas01mc@319: for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) mas01mc@319: index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); mas01mc@319: mas01mc@319: numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); mas01mc@319: collisionCount = index_insert_shingles(vv, trackID, *sppp); mas01mc@319: } mas01mc@292: float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; mas01mc@292: mas01mc@292: /* index_norm_shingles() only goes as far as the end of the mas01mc@292: sequence, which is right, but the space allocated is for the mas01mc@292: whole track. */ mas01mc@292: mas01mc@292: /* But numVecs will be O2_MAXTRACKLEN mas01mc@292: * So let's be certain the pointers are in the correct place mas01mc@292: */ mas01mc@292: mas01mc@319: if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@319: *snpp += trackTable[trackID]; mas01mc@319: *sppp += trackTable[trackID]; mas01mc@319: *fvpp += trackTable[trackID] * dbH->dim; mas01mc@319: } mas01mc@292: mas01mc@292: std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; mas01mc@292: std::cout.flush(); mas01mc@292: return true; mas01mc@292: } mas01mc@292: mas01mc@292: Uns32T audioDB::index_insert_shingles(vector >* vv, Uns32T trackID, double* spp){ mas01mc@292: Uns32T collisionCount = 0; mas01mc@292: cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; mas01mc@323: for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ mas01mc@323: if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) mas01mc@319: collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits)); mas01mc@323: spp+=sequenceHop; mas01mc@311: } mas01mc@292: return collisionCount; mas01mc@292: } mas01mc@292: mas01mc@292: /********************* LSH shingle construction ***************************/ mas01mc@292: mas01mc@292: // Construct shingles out of a feature matrix mas01mc@292: // inputs: mas01mc@292: // idx is vector index in feature matrix mas01mc@292: // fvp is base feature matrix pointer double* [numVecs x dbH->dim] mas01mc@292: // mas01mc@292: // pre-conditions: mas01mc@292: // dbH->dim mas01mc@292: // sequenceLength mas01mc@292: // idx < numVectors - sequenceLength + 1 mas01mc@292: // mas01mc@292: // post-conditions: mas01mc@292: // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values mas01mc@292: mas01mc@292: void audioDB::index_make_shingle(vector >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){ mas01mc@292: assert(idx<(*vv).size()); mas01mc@292: vector::iterator ve = (*vv)[idx].end(); mas01mc@292: vi=(*vv)[idx].begin(); // shingle iterator mas01mc@292: // First feature vector in shingle mas01mc@292: if(idx==0){ mas01mc@292: while(vi!=ve) mas01mc@292: *vi++ = (float)(*fvp++); mas01mc@292: } mas01mc@292: // Not first feature vector in shingle mas01mc@292: else{ mas01mc@292: vector::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator mas01mc@292: // Previous seqLen-1 dim-vectors mas01mc@292: while(vi!=ve-dim) mas01mc@292: *vi++=*ui++; mas01mc@292: // Move data pointer to next feature vector mas01mc@292: fvp += ( seqLen + idx - 1 ) * dim ; mas01mc@292: // New d-vector mas01mc@292: while(vi!=ve) mas01mc@292: *vi++ = (float)(*fvp++); mas01mc@292: } mas01mc@292: } mas01mc@292: mas01mc@292: // norm shingles mas01mc@292: // in-place norming, no deletions mas01mc@292: // If using power, return number of shingles above power threshold mas01mc@292: int audioDB::index_norm_shingles(vector >* vv, double* snp, double* spp){ mas01mc@292: int z = 0; // number of above-threshold shingles mas01mc@292: float l2norm; mas01mc@292: double power; mas01mc@292: float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2 mas01mc@292: float oneOverSqrtl2NormDivRad = oneOverRadius; mas01mc@292: if(!spp) mas01mc@292: error("LSH indexing and query requires a power feature using -w or -W"); mas01mc@292: Uns32T shingleSize = sequenceLength*dbH->dim; mas01mc@292: for(Uns32T a=0; a<(*vv).size(); a++){ mas01mc@292: l2norm = (float)(*snp++); mas01mc@292: if(audioDB::normalizedDistance) mas01mc@292: oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius; mas01mc@292: mas01mc@292: for(Uns32T b=0; b < shingleSize ; b++) mas01mc@292: (*vv)[a][b]*=oneOverSqrtl2NormDivRad; mas01mc@292: mas01mc@292: power = *spp++; mas01mc@292: if(use_absolute_threshold){ mas01mc@292: if ( power >= absolute_threshold ) mas01mc@292: z++; mas01mc@292: } mas01mc@292: else mas01mc@292: z++; mas01mc@292: } mas01mc@292: return z; mas01mc@292: } mas01mc@292: mas01mc@292: mas01mc@292: /*********************** LSH retrieval ****************************/ mas01mc@292: mas01mc@292: mas01mc@292: // return true if indexed query performed else return false mas01mc@292: int audioDB::index_init_query(const char* dbName){ mas01mc@292: mas01mc@292: if(!(index_exists(dbName, radius, sequenceLength))) mas01mc@292: return false; mas01mc@292: mas01mc@292: char* indexName = index_get_name(dbName, radius, sequenceLength); mas01mc@292: mas01mc@292: // Test to see if file exists mas01mc@292: if((lshfid = open (indexName, O_RDONLY)) < 0){ mas01mc@292: delete[] indexName; mas01mc@292: return false; mas01mc@292: } mas01mc@292: mas01mc@308: lsh = index_allocate(indexName, false); // Get the header only here mas01mc@292: sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim mas01mc@292: mas01mc@311: if(lsh!=SERVER_LSH_INDEX_SINGLETON){ mas01mc@308: if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE)) mas01mc@308: printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius()); mas01mc@320: printf("INDEX: dim %d\n", (int)dbH->dim); mas01mc@308: printf("INDEX: R %f\n", lsh->get_radius()); mas01mc@308: printf("INDEX: seqlen %d\n", sequenceLength); mas01mc@308: printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth()); mas01mc@308: printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns()); mas01mc@308: printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables()); mas01mc@308: printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows()); mas01mc@319: printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)); mas01mc@308: printf("INDEX: Opened LSH index file %s\n", indexName); mas01mc@308: fflush(stdout); mas01mc@308: } mas01mc@292: mas01mc@292: // Check to see if we are loading hash tables into core, and do so if true mas01mc@292: if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){ mas01mc@308: if(SERVER_LSH_INDEX_SINGLETON) mas01mc@308: fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName()); mas01mc@308: else mas01mc@308: printf("INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1"); mas01mc@308: lsh = index_allocate(indexName, true); mas01mc@292: } mas01mc@292: mas01mc@292: delete[] indexName; mas01mc@292: return true; mas01mc@292: } mas01mc@292: mas01mc@292: // *Static* approximate NN point reporter callback method for lshlib mas01mc@292: void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ mas01mc@292: assert(instancePtr); // We need an instance for this callback mas01mc@292: audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance mas01mc@319: Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits); mas01mc@319: Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits); mas01mc@292: // Skip identity in query_from_key mas01mc@292: if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) mas01mc@292: myself->reporter->add_point(trackID, qpos, spos, dist); mas01mc@292: } mas01mc@292: mas01mc@292: // *Static* exact NN point reporter callback method for lshlib mas01mc@292: // Maintain a queue of points to pass to query_points() for exact evaluation mas01mc@292: void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ mas01mc@292: assert(instancePtr); // We need an instance for this callback mas01mc@292: audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance mas01mc@319: Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits); mas01mc@319: Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits); mas01mc@292: // Skip identity in query_from_key mas01mc@292: if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) mas01mc@292: myself->index_insert_exact_evaluation_queue(trackID, qpos, spos); mas01mc@292: } mas01mc@292: mas01mc@292: void audioDB::initialize_exact_evalutation_queue(){ mas01mc@292: if(exact_evaluation_queue) mas01mc@292: delete exact_evaluation_queue; mas01mc@292: exact_evaluation_queue = new priority_queue, std::less >; mas01mc@292: } mas01mc@292: mas01mc@292: void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){ mas01mc@292: PointPair p(trackID, qpos, spos); mas01mc@292: exact_evaluation_queue->push(p); mas01mc@292: } mas01mc@292: mas01mc@292: // return 0: if index does not exist mas01mc@292: // return nqv: if index exists mas01mc@292: int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) { mas01mc@292: mas01mc@320: unsigned int numVectors = 0; mas01mc@320: double *query = 0, *query_data = 0; mas01mc@320: double *qNorm = 0, *qnPtr = 0, *qPower = 0, *qpPtr = 0; mas01mc@320: double meanQdur = 0; mas01mc@292: void (*add_point_func)(void*,Uns32T,Uns32T,float); mas01mc@292: mas01mc@292: // Set the point-reporter callback based on the value of lsh_exact mas01mc@292: if(lsh_exact){ mas01mc@292: initialize_exact_evalutation_queue(); mas01mc@292: add_point_func = &index_add_point_exact; mas01mc@292: } mas01mc@292: else mas01mc@292: add_point_func = &index_add_point_approximate; mas01mc@292: mas01mc@292: if(!index_init_query(dbName)) // sets-up LSH index structures for querying mas01mc@292: return 0; mas01mc@292: mas01mc@292: char* database = index_get_name(dbName, radius, sequenceLength); mas01mc@292: mas01mc@292: if(query_from_key) mas01mc@292: set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex); mas01mc@292: else mas01mc@292: set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors mas01mc@292: mas01mc@292: VERB_LOG(1, "retrieving tracks..."); mas01mc@292: mas01mc@292: assert(pointNN>0 && pointNN<=O2_MAXNN); mas01mc@292: assert(trackNN>0 && trackNN<=O2_MAXNN); mas01mc@292: mas01mc@292: gettimeofday(&tv1, NULL); mas01mc@292: // query vector index mas01mc@292: Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1; mas01mc@292: vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles mas01mc@292: cout << "Nq=" << Nq; cout.flush(); mas01mc@292: // Construct shingles from query features mas01mc@292: for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ ) mas01mc@292: index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength); mas01mc@292: mas01mc@292: // Normalize query vectors mas01mc@292: Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr ); mas01mc@292: cout << " Nq'=" << numVecsAboveThreshold << endl; cout.flush(); mas01mc@292: mas01mc@292: // Nq contains number of inspected points in query file, mas01mc@292: // numVecsAboveThreshold is number of points with power >= absolute_threshold mas01mc@292: double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation mas01mc@292: if(usingQueryPoint && numVecsAboveThreshold){ mas01mc@292: if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) mas01mc@292: lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this); mas01mc@292: else mas01mc@292: lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this); mas01mc@292: } mas01mc@292: else if(numVecsAboveThreshold) mas01mc@292: for( Uns32T pointID = 0 ; pointID < Nq; pointID++ ) mas01mc@292: if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) mas01mc@292: if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) mas01mc@292: lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this); mas01mc@292: else mas01mc@292: lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this); mas01mc@292: mas01mc@292: if(lsh_exact) mas01mc@292: // Perform exact distance computation on point pairs in exact_evaluation_queue mas01mc@292: query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors); mas01mc@292: mas01mc@292: gettimeofday(&tv2,NULL); mas01mc@292: VERB_LOG(1,"elapsed time: %ld msec\n", mas01mc@292: (tv2.tv_sec*1000 + tv2.tv_usec/1000) - mas01mc@292: (tv1.tv_sec*1000 + tv1.tv_usec/1000)) mas01mc@292: mas01mc@292: // Close the index file mas01mc@292: close(lshfid); mas01mc@292: mas01mc@292: // Clean up mas01mc@292: if(query_data) mas01mc@292: delete[] query_data; mas01mc@292: if(qNorm) mas01mc@292: delete[] qNorm; mas01mc@292: if(qPower) mas01mc@292: delete[] qPower; mas01mc@292: if(database) mas01mc@292: delete[] database; mas01mc@292: mas01mc@292: return Nq; mas01mc@292: } mas01mc@292: