mas01mc@292: // LSH indexing
mas01mc@292: //
mas01mc@292: // Construct a persistent LSH table structure
mas01mc@292: // Store at the same location as dbName
mas01mc@292: // Naming convention:
mas01mc@292: //         dbName.lsh.${radius}.${sequenceLength}
mas01mc@292: //
mas01mc@292: //
mas01mc@292: // Author: Michael Casey
mas01mc@292: //   Date: 23 June 2008
mas01mc@324: //
mas01mc@324: // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292: 
mas01mc@292: #include "audioDB.h"
mas01mc@292: 
mas01cr@509: /*******  LSH indexing audioDB database access forall s \in {S} *******/
mas01mc@292: 
mas01mc@292: // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292: void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324:   if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324:     error("INDEXed database must be power-enabled", dbName);
mas01mc@324:   }
mas01mc@324: 
mas01mc@325:   double *snpp = 0, *sppp = 0;
mas01mc@324: 
mas01mc@292:   *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292:   *snp = new double[*dvp];  // songs norm pointer: L2 norm table for each vector
mas01mc@325:   snpp = *snp;
mas01mc@292:   *spp = new double[*dvp]; // song powertable pointer
mas01mc@292:   sppp = *spp;
mas01mc@325: 
mas01mc@324:   memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292:   memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324:   
mas01mc@324:   
mas01mc@292:   for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292:     if(trackTable[i] >= sequenceLength) {
mas01cr@498:       audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@498:       audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292:       
mas01cr@498:       audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@498:       audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292:     }
mas01mc@292:     snpp += trackTable[i];
mas01mc@292:     sppp += trackTable[i];
mas01mc@292:   }
mas01mc@324:   
mas01mc@292:   *vsnp = *snp;
mas01mc@292:   *vspp = *spp;
mas01mc@324:   
mas01mc@292:   // Move the feature vector read pointer to start of fetures in database
mas01mc@292:   lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292: }
mas01mc@292: 
mas01mc@292: /************************ LSH indexing ***********************************/
mas01mc@292: void audioDB::index_index_db(const char* dbName){
mas01mc@292:   char* newIndexName;
mas01mc@292:   double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292:   Uns32T dbVectors = 0;
mas01mc@292: 
mas01mc@324: 
mas01mc@292:   printf("INDEX: initializing header\n");
mas01mc@292:   // Check if audioDB exists, initialize header and open database for read
mas01mc@292:   forWrite = false;
mas01mc@292:   initDBHeader(dbName);
mas01mc@292: 
mas01mc@324:   if(dbH->flags & O2_FLAG_POWER)
mas01mc@324:     usingPower = true;
mas01mc@324:   
mas01mc@324:   if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324:     usingTimes = true;
mas01mc@324: 
mas01cr@498:   newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@498:   if(!newIndexName) {
mas01cr@498:     error("failed to get index name", dbName);
mas01cr@498:   }
mas01mc@292: 
mas01mc@292:   // Set unit norming flag override
mas01mc@292:   audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292: 
mas01mc@327:   VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327:   VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327:   VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327:   VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327:   VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327:   VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327:   VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327:   VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327:   VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327:   VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false"); 
mas01mc@292: 
mas01mc@292:   if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292:     printf("INDEX: constructing new LSH index\n");  
mas01mc@292:     printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292:     fflush(stdout);
mas01mc@292:     // Construct new LSH index
mas01mc@292:     lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292: 		  lsh_param_m,
mas01mc@292: 		  (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292: 		  lsh_param_N,
mas01mc@292: 		  lsh_param_ncols,
mas01mc@292: 		  (float)radius);
mas01mc@292:     assert(lsh);  
mas01mc@292: 
mas01mc@292:     Uns32T endTrack = lsh_param_b;
mas01mc@292:     if( endTrack > dbH->numFiles)
mas01mc@292:       endTrack = dbH->numFiles;
mas01mc@292:     // Insert up to lsh_param_b tracks
mas01mc@324:     if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324:       index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);  
mas01mc@324:     }
mas01mc@324:     index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292:     lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292:     
mas01mc@292:     // Clean up
mas01mc@292:     delete lsh;
mas01mc@308:     lsh = 0;
mas01cr@498:   } else {
mas01mc@292:     close(lshfid);
mas01mc@292:   }
mas01mc@292:   
mas01mc@292:   // Attempt to open LSH file
mas01mc@292:   if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292:     printf("INDEX: merging with existing LSH index\n");
mas01mc@292:     fflush(stdout);
mas01mc@340:     char* mergeIndexName = newIndexName;
mas01mc@292: 
mas01mc@292:     // Get the lsh header info and find how many tracks are inserted already
mas01mc@340:     lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292:     assert(lsh);
mas01mc@534:     Uns32T maxs = audiodb_index_to_track_id(adb, lsh->get_maxp())+1;
mas01mc@292:     delete lsh;
mas01mc@308:     lsh = 0;
mas01mc@292: 
mas01mc@340:     // Insert up to lsh_param_b tracks
mas01mc@340:     if(  !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340:       index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);  
mas01mc@340:     }
mas01mc@292:     // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292:     for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292: 
mas01mc@292:       Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292:       if( endTrack > dbH->numFiles)
mas01mc@292: 	endTrack = dbH->numFiles;
mas01mc@292:       printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292:       fflush(stdout);
mas01mc@340:       lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292:       assert(lsh);
mas01mc@292:       
mas01mc@292:       // Insert up to lsh_param_b database tracks
mas01mc@292:       index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292: 
mas01mc@340:       // Serialize to file (merging is performed here)
mas01mc@340:       lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292:       delete lsh;
mas01mc@308:       lsh = 0;
mas01mc@340:     }
mas01mc@292:     
mas01mc@292:     close(lshfid);    
mas01mc@292:     printf("INDEX: done constructing LSH index.\n");  
mas01mc@292:     fflush(stdout);
mas01mc@292:     
mas01mc@292:   }
mas01mc@292:   else{
mas01mc@292:     error("Something's wrong with LSH index file");
mas01mc@292:     exit(1);
mas01mc@292:   }
mas01mc@292:     
mas01mc@324:   delete[] newIndexName;
mas01mc@324:   delete[] sNorm;
mas01mc@324:   delete[] sPower;
mas01mc@324: }
mas01mc@292: 
mas01mc@292: 
mas01cr@498: void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@498:   if(usingPower){
mas01cr@498:     int one;
mas01cr@498:     unsigned int count;
mas01cr@498:     
mas01cr@498:     count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@498:     if (count != sizeof(unsigned int)) {
mas01cr@498:       error("powerfd read failed", "int", "read");
mas01cr@498:     }
mas01cr@498:     if (one != 1) {
mas01cr@498:       error("dimensionality of power file not 1", powerFileName);
mas01cr@498:     }
mas01cr@498:     
mas01cr@498:     // FIXME: should check that the powerfile is the right size for
mas01cr@498:     // this.  -- CSR, 2007-10-30
mas01cr@498:     count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@498:     if (count != numVectors * sizeof(double)) {
mas01cr@498:       error("powerfd read failed", "double", "read");
mas01cr@498:     }
mas01cr@498:   }
mas01cr@498: }
mas01cr@498: 
mas01mc@324: // initialize auxillary track data from filesystem
mas01mc@324: // pre-conditions:
mas01mc@324: // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324: // feature data allocated and copied (fvp)
mas01mc@324: //
mas01mc@324: // post-conditions:
mas01mc@324: // allocated power data
mas01mc@324: // allocated l2norm data
mas01mc@324: //
mas01mc@324: void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){  
mas01mc@324:   if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324:     error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292: 
mas01mc@324:   // Allocate and read the power sequence
mas01mc@324:   if(trackTable[trackID]>=sequenceLength){
mas01mc@324:     
mas01mc@324:     char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324:     char* tmpStr = prefixedString;
mas01mc@324:     // Open and check dimensions of power file
mas01mc@324:     strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324:     prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324:     if(prefixedString!=tmpStr)
mas01mc@324:       delete[] tmpStr;
mas01mc@324:     powerfd = open(prefixedString, O_RDONLY);
mas01mc@324:     if (powerfd < 0) {
mas01mc@324:       error("failed to open power file", prefixedString);
mas01mc@324:     }
mas01mc@324:     if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324:       error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324:     }
mas01mc@324:     
mas01mc@324:     if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324:       error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324:    
mas01mc@324:     *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324:     assert(*sPowerp);
mas01mc@324:     *spPtrp = *sPowerp;
mas01mc@324:     insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324:     if (0 < powerfd) {
mas01mc@324:       close(powerfd);
mas01mc@324:     }
mas01mc@324:     
mas01cr@498:     audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@498:     audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324:     powerTable = 0;
mas01mc@324: 
mas01mc@324:     // Allocate and calculate the l2norm sequence
mas01mc@324:     *sNormpp = new double[trackTable[trackID]];
mas01mc@324:     assert(*sNormpp);
mas01mc@324:     *snPtrp = *sNormpp;
mas01cr@498:     audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@498:     audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@498:     audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324:   }
mas01mc@292: }
mas01mc@292: 
mas01mc@292: void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292: 				  double** fvpp, double** sNormpp,double** snPtrp, 
mas01mc@292: 				  double** sPowerp, double** spPtrp){  
mas01mc@292:   size_t nfv = 0;
mas01mc@292:   double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292:   Uns32T trackID = 0;
mas01mc@292: 
mas01mc@292:   VERB_LOG(1, "indexing tracks...");
mas01mc@292: 
mas01mc@324:   int trackfd = dbfid;
mas01mc@292:   for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324:     if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324:       char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324:       char* tmpStr = prefixedString;
mas01mc@324:       // Open and check dimensions of feature file
mas01mc@324:       strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324:       prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324:       if(prefixedString!=tmpStr)
mas01mc@324: 	delete[] tmpStr;
mas01cr@498:       initInputFile(prefixedString);
mas01mc@324:       trackfd = infid;
mas01mc@324:     }
mas01cr@498:     if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@498:       error("failed to read data");
mas01mc@292:     *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324:     
mas01mc@324:     if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324:       // Load power and calculate power and l2norm sequence sums
mas01mc@324:       init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324:     
mas01mc@292:     if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292:       break;    
mas01mc@324:     if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324:       close(infid);
mas01mc@324:       delete[] *sNormpp;
mas01mc@324:       delete[] *sPowerp;
mas01mc@324:       *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324:     }
mas01mc@324:   } // end for(trackID = start_track ; ... )
mas01mc@292:   std::cout << "finished inserting." << endl;
mas01mc@292: }
mas01mc@292: 
mas01mc@292: int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292:   // Loop over the current input track's vectors
mas01cr@305:   Uns32T numVecs = 0;
mas01mc@534:   
mas01mc@534:   if (trackTable[trackID] < sequenceLength - 1) {
mas01mc@534:     numVecs = 0;
mas01cr@305:   } else {
mas01mc@534:     numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305:   }
mas01mc@534: 
mas01mc@292:   
mas01mc@324:   Uns32T numVecsAboveThreshold = 0, collisionCount = 0; 
mas01mc@324:   if(numVecs){
mas01cr@498:     std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324:     
mas01mc@324:     for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@498:       audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@498:     int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@498:     if(vcount == -1) {
mas01cr@498:       audiodb_index_delete_shingles(vv);
mas01cr@498:       error("failed to norm shingles");
mas01cr@498:     }
mas01cr@498:     numVecsAboveThreshold = vcount;
mas01mc@324:     collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@498:     audiodb_index_delete_shingles(vv);
mas01mc@324:   }
mas01cr@498: 
mas01mc@292:   float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292: 
mas01cr@498:   /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292:      sequence, which is right, but the space allocated is for the
mas01mc@292:      whole track.  */
mas01mc@292: 
mas01mc@292:   /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292:    * So let's be certain the pointers are in the correct place
mas01mc@292:    */
mas01mc@292: 
mas01mc@324:   if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324:     *snpp += trackTable[trackID];
mas01mc@324:     *sppp += trackTable[trackID];
mas01mc@324:     *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324:   }
mas01mc@292: 
mas01mc@292:   std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292:   std::cout.flush();  
mas01mc@292:   return true;
mas01mc@292: }
mas01mc@292: 
mas01mc@292: Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292:   Uns32T collisionCount = 0;
mas01mc@292:   cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324:   for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324:     if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01mc@534:       collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(adb, trackID, pointID));
mas01mc@324:     spp+=sequenceHop;
mas01mc@311:     }
mas01mc@292:   return collisionCount;
mas01mc@292: }