mas01mc@292: // LSH indexing mas01mc@292: // mas01mc@292: // Construct a persistent LSH table structure mas01mc@292: // Store at the same location as dbName mas01mc@292: // Naming convention: mas01mc@292: // dbName.lsh.${radius}.${sequenceLength} mas01mc@292: // mas01mc@292: // mas01mc@292: // Author: Michael Casey mas01mc@292: // Date: 23 June 2008 mas01mc@324: // mas01mc@324: // 19th August 2008 - added O2_FLAG_LARGE_ADB support mas01mc@292: mas01mc@292: #include "audioDB.h" mas01mc@292: mas01cr@509: /******* LSH indexing audioDB database access forall s \in {S} *******/ mas01mc@292: mas01mc@292: // Prepare the AudioDB database for read access and allocate auxillary memory mas01mc@292: void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { mas01mc@324: if (!(dbH->flags & O2_FLAG_POWER)) { mas01mc@324: error("INDEXed database must be power-enabled", dbName); mas01mc@324: } mas01mc@324: mas01mc@325: double *snpp = 0, *sppp = 0; mas01mc@324: mas01mc@292: *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors mas01mc@292: *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector mas01mc@325: snpp = *snp; mas01mc@292: *spp = new double[*dvp]; // song powertable pointer mas01mc@292: sppp = *spp; mas01mc@325: mas01mc@324: memcpy(*snp, l2normTable, *dvp * sizeof(double)); mas01mc@292: memcpy(*spp, powerTable, *dvp * sizeof(double)); mas01mc@324: mas01mc@324: mas01mc@292: for(Uns32T i = 0; i < dbH->numFiles; i++){ mas01mc@292: if(trackTable[i] >= sequenceLength) { mas01cr@498: audiodb_sequence_sum(snpp, trackTable[i], sequenceLength); mas01cr@498: audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength); mas01mc@292: mas01cr@498: audiodb_sequence_sum(sppp, trackTable[i], sequenceLength); mas01cr@498: audiodb_sequence_average(sppp, trackTable[i], sequenceLength); mas01mc@292: } mas01mc@292: snpp += trackTable[i]; mas01mc@292: sppp += trackTable[i]; mas01mc@292: } mas01mc@324: mas01mc@292: *vsnp = *snp; mas01mc@292: *vspp = *spp; mas01mc@324: mas01mc@292: // Move the feature vector read pointer to start of fetures in database mas01mc@292: lseek(dbfid, dbH->dataOffset, SEEK_SET); mas01mc@292: } mas01mc@292: mas01mc@292: /************************ LSH indexing ***********************************/ mas01mc@292: void audioDB::index_index_db(const char* dbName){ mas01mc@292: char* newIndexName; mas01mc@292: double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; mas01mc@292: Uns32T dbVectors = 0; mas01mc@292: mas01mc@324: mas01mc@292: printf("INDEX: initializing header\n"); mas01mc@292: // Check if audioDB exists, initialize header and open database for read mas01mc@292: forWrite = false; mas01mc@292: initDBHeader(dbName); mas01mc@292: mas01mc@324: if(dbH->flags & O2_FLAG_POWER) mas01mc@324: usingPower = true; mas01mc@324: mas01mc@324: if(dbH->flags & O2_FLAG_TIMES) mas01mc@324: usingTimes = true; mas01mc@324: mas01cr@498: newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength); mas01cr@498: if(!newIndexName) { mas01cr@498: error("failed to get index name", dbName); mas01cr@498: } mas01mc@292: mas01mc@292: // Set unit norming flag override mas01mc@292: audioDB::normalizedDistance = !audioDB::no_unit_norming; mas01mc@292: mas01mc@327: VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim); mas01mc@327: VERB_LOG(1, "INDEX: R %f\n", radius); mas01mc@327: VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength); mas01mc@327: VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w); mas01mc@327: VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k); mas01mc@327: VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m); mas01mc@327: VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N); mas01mc@327: VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols); mas01mc@327: VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b); mas01mc@327: VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false"); mas01mc@292: mas01mc@292: if((lshfid = open(newIndexName,O_RDONLY))<0){ mas01mc@292: printf("INDEX: constructing new LSH index\n"); mas01mc@292: printf("INDEX: making index file %s\n", newIndexName); mas01mc@292: fflush(stdout); mas01mc@292: // Construct new LSH index mas01mc@292: lsh = new LSH((float)lsh_param_w, lsh_param_k, mas01mc@292: lsh_param_m, mas01mc@292: (Uns32T)(sequenceLength*dbH->dim), mas01mc@292: lsh_param_N, mas01mc@292: lsh_param_ncols, mas01mc@292: (float)radius); mas01mc@292: assert(lsh); mas01mc@292: mas01mc@292: Uns32T endTrack = lsh_param_b; mas01mc@292: if( endTrack > dbH->numFiles) mas01mc@292: endTrack = dbH->numFiles; mas01mc@292: // Insert up to lsh_param_b tracks mas01mc@324: if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@324: index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); mas01mc@324: } mas01mc@324: index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); mas01mc@292: lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); mas01mc@292: mas01mc@292: // Clean up mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01cr@498: } else { mas01mc@292: close(lshfid); mas01mc@292: } mas01mc@292: mas01mc@292: // Attempt to open LSH file mas01mc@292: if((lshfid = open(newIndexName,O_RDONLY))>0){ mas01mc@292: printf("INDEX: merging with existing LSH index\n"); mas01mc@292: fflush(stdout); mas01mc@340: char* mergeIndexName = newIndexName; mas01mc@292: mas01mc@292: // Get the lsh header info and find how many tracks are inserted already mas01mc@340: lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here mas01mc@292: assert(lsh); mas01mc@534: Uns32T maxs = audiodb_index_to_track_id(adb, lsh->get_maxp())+1; mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@292: mas01mc@340: // Insert up to lsh_param_b tracks mas01mc@340: if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@340: index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); mas01mc@340: } mas01mc@292: // This allows for updating index after more tracks are inserted into audioDB mas01mc@292: for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ mas01mc@292: mas01mc@292: Uns32T endTrack = startTrack + lsh_param_b; mas01mc@292: if( endTrack > dbH->numFiles) mas01mc@292: endTrack = dbH->numFiles; mas01mc@292: printf("Indexing track range: %d - %d\n", startTrack, endTrack); mas01mc@292: fflush(stdout); mas01mc@340: lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables mas01mc@292: assert(lsh); mas01mc@292: mas01mc@292: // Insert up to lsh_param_b database tracks mas01mc@292: index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); mas01mc@292: mas01mc@340: // Serialize to file (merging is performed here) mas01mc@340: lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk mas01mc@292: delete lsh; mas01mc@308: lsh = 0; mas01mc@340: } mas01mc@292: mas01mc@292: close(lshfid); mas01mc@292: printf("INDEX: done constructing LSH index.\n"); mas01mc@292: fflush(stdout); mas01mc@292: mas01mc@292: } mas01mc@292: else{ mas01mc@292: error("Something's wrong with LSH index file"); mas01mc@292: exit(1); mas01mc@292: } mas01mc@292: mas01mc@324: delete[] newIndexName; mas01mc@324: delete[] sNorm; mas01mc@324: delete[] sPower; mas01mc@324: } mas01mc@292: mas01mc@292: mas01cr@498: void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { mas01cr@498: if(usingPower){ mas01cr@498: int one; mas01cr@498: unsigned int count; mas01cr@498: mas01cr@498: count = read(powerfd, &one, sizeof(unsigned int)); mas01cr@498: if (count != sizeof(unsigned int)) { mas01cr@498: error("powerfd read failed", "int", "read"); mas01cr@498: } mas01cr@498: if (one != 1) { mas01cr@498: error("dimensionality of power file not 1", powerFileName); mas01cr@498: } mas01cr@498: mas01cr@498: // FIXME: should check that the powerfile is the right size for mas01cr@498: // this. -- CSR, 2007-10-30 mas01cr@498: count = read(powerfd, powerdata, numVectors * sizeof(double)); mas01cr@498: if (count != numVectors * sizeof(double)) { mas01cr@498: error("powerfd read failed", "double", "read"); mas01cr@498: } mas01cr@498: } mas01cr@498: } mas01cr@498: mas01mc@324: // initialize auxillary track data from filesystem mas01mc@324: // pre-conditions: mas01mc@324: // dbH->flags & O2_FLAG_LARGE_ADB mas01mc@324: // feature data allocated and copied (fvp) mas01mc@324: // mas01mc@324: // post-conditions: mas01mc@324: // allocated power data mas01mc@324: // allocated l2norm data mas01mc@324: // mas01mc@324: void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ mas01mc@324: if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) mas01mc@324: error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); mas01mc@292: mas01mc@324: // Allocate and read the power sequence mas01mc@324: if(trackTable[trackID]>=sequenceLength){ mas01mc@324: mas01mc@324: char* prefixedString = new char[O2_MAXFILESTR]; mas01mc@324: char* tmpStr = prefixedString; mas01mc@324: // Open and check dimensions of power file mas01mc@324: strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); mas01mc@324: prefix_name((char ** const)&prefixedString, adb_feature_root); mas01mc@324: if(prefixedString!=tmpStr) mas01mc@324: delete[] tmpStr; mas01mc@324: powerfd = open(prefixedString, O_RDONLY); mas01mc@324: if (powerfd < 0) { mas01mc@324: error("failed to open power file", prefixedString); mas01mc@324: } mas01mc@324: if (fstat(powerfd, &statbuf) < 0) { mas01mc@324: error("fstat error finding size of power file", prefixedString, "fstat"); mas01mc@324: } mas01mc@324: mas01mc@324: if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) mas01mc@324: error("Dimension mismatch: numPowers != numVectors", prefixedString); mas01mc@324: mas01mc@324: *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values mas01mc@324: assert(*sPowerp); mas01mc@324: *spPtrp = *sPowerp; mas01mc@324: insertPowerData(trackTable[trackID], powerfd, *sPowerp); mas01mc@324: if (0 < powerfd) { mas01mc@324: close(powerfd); mas01mc@324: } mas01mc@324: mas01cr@498: audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); mas01cr@498: audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength); mas01mc@324: powerTable = 0; mas01mc@324: mas01mc@324: // Allocate and calculate the l2norm sequence mas01mc@324: *sNormpp = new double[trackTable[trackID]]; mas01mc@324: assert(*sNormpp); mas01mc@324: *snPtrp = *sNormpp; mas01cr@498: audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp); mas01cr@498: audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); mas01cr@498: audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); mas01mc@324: } mas01mc@292: } mas01mc@292: mas01mc@292: void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, mas01mc@292: double** fvpp, double** sNormpp,double** snPtrp, mas01mc@292: double** sPowerp, double** spPtrp){ mas01mc@292: size_t nfv = 0; mas01mc@292: double* fvp = 0; // Keep pointer for memory allocation and free() for track data mas01mc@292: Uns32T trackID = 0; mas01mc@292: mas01mc@292: VERB_LOG(1, "indexing tracks..."); mas01mc@292: mas01mc@324: int trackfd = dbfid; mas01mc@292: for(trackID = start_track ; trackID < end_track ; trackID++ ){ mas01mc@324: if( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@324: char* prefixedString = new char[O2_MAXFILESTR]; mas01mc@324: char* tmpStr = prefixedString; mas01mc@324: // Open and check dimensions of feature file mas01mc@324: strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); mas01mc@324: prefix_name((char ** const) &prefixedString, adb_feature_root); mas01mc@324: if(prefixedString!=tmpStr) mas01mc@324: delete[] tmpStr; mas01cr@498: initInputFile(prefixedString); mas01mc@324: trackfd = infid; mas01mc@324: } mas01cr@498: if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv)) mas01cr@498: error("failed to read data"); mas01mc@292: *fvpp = fvp; // Protect memory allocation and free() for track data mas01mc@324: mas01mc@324: if( dbH->flags & O2_FLAG_LARGE_ADB ) mas01mc@324: // Load power and calculate power and l2norm sequence sums mas01mc@324: init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); mas01mc@324: mas01mc@292: if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) mas01mc@292: break; mas01mc@324: if ( dbH->flags & O2_FLAG_LARGE_ADB ){ mas01mc@324: close(infid); mas01mc@324: delete[] *sNormpp; mas01mc@324: delete[] *sPowerp; mas01mc@324: *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; mas01mc@324: } mas01mc@324: } // end for(trackID = start_track ; ... ) mas01mc@292: std::cout << "finished inserting." << endl; mas01mc@292: } mas01mc@292: mas01mc@292: int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ mas01mc@292: // Loop over the current input track's vectors mas01cr@305: Uns32T numVecs = 0; mas01mc@534: mas01mc@534: if (trackTable[trackID] < sequenceLength - 1) { mas01mc@534: numVecs = 0; mas01cr@305: } else { mas01mc@534: numVecs = trackTable[trackID] - sequenceLength + 1; mas01cr@305: } mas01mc@534: mas01mc@292: mas01mc@324: Uns32T numVecsAboveThreshold = 0, collisionCount = 0; mas01mc@324: if(numVecs){ mas01cr@498: std::vector > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength); mas01mc@324: mas01mc@324: for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) mas01cr@498: audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); mas01cr@498: int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold); mas01cr@498: if(vcount == -1) { mas01cr@498: audiodb_index_delete_shingles(vv); mas01cr@498: error("failed to norm shingles"); mas01cr@498: } mas01cr@498: numVecsAboveThreshold = vcount; mas01mc@324: collisionCount = index_insert_shingles(vv, trackID, *sppp); mas01cr@498: audiodb_index_delete_shingles(vv); mas01mc@324: } mas01cr@498: mas01mc@292: float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; mas01mc@292: mas01cr@498: /* audiodb_index_norm_shingles() only goes as far as the end of the mas01mc@292: sequence, which is right, but the space allocated is for the mas01mc@292: whole track. */ mas01mc@292: mas01mc@292: /* But numVecs will be O2_MAXTRACKLEN mas01mc@292: * So let's be certain the pointers are in the correct place mas01mc@292: */ mas01mc@292: mas01mc@324: if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ mas01mc@324: *snpp += trackTable[trackID]; mas01mc@324: *sppp += trackTable[trackID]; mas01mc@324: *fvpp += trackTable[trackID] * dbH->dim; mas01mc@324: } mas01mc@292: mas01mc@292: std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; mas01mc@292: std::cout.flush(); mas01mc@292: return true; mas01mc@292: } mas01mc@292: mas01mc@292: Uns32T audioDB::index_insert_shingles(vector >* vv, Uns32T trackID, double* spp){ mas01mc@292: Uns32T collisionCount = 0; mas01mc@292: cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; mas01mc@324: for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ mas01mc@324: if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) mas01mc@534: collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(adb, trackID, pointID)); mas01mc@324: spp+=sequenceHop; mas01mc@311: } mas01mc@292: return collisionCount; mas01mc@292: }