annotate index.cpp @ 311:cac5b3465318

Added sequencehop on Indexing to optionally reduce size of large indexes. Added cleanup() call on WS query exit to fix MMAP memory leak
author mas01mc
date Sat, 09 Aug 2008 15:04:16 +0000
parents 896679d8cc39
children b9eff6896943 c93be2f3a674
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@292 11
mas01mc@292 12 #include "audioDB.h"
mas01mc@292 13 #include "ReporterBase.h"
mas01mc@292 14
mas01mc@292 15
mas01mc@292 16 /************************* LSH point index to audioDB conversion *****************/
mas01mc@292 17 Uns32T audioDB::index_to_trackID(Uns32T lshID){
mas01mc@292 18 return lshID>>LSH_N_POINT_BITS;
mas01mc@292 19 }
mas01mc@292 20
mas01mc@292 21 Uns32T audioDB::index_to_trackPos(Uns32T lshID){
mas01mc@292 22 return lshID&LSH_POINT_MASK;
mas01mc@292 23 }
mas01mc@292 24
mas01mc@292 25 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){
mas01mc@292 26 return (trackID << LSH_N_POINT_BITS) | spos;
mas01mc@292 27 }
mas01mc@292 28
mas01mc@292 29 /************************* LSH indexing and query initialization *****************/
mas01mc@292 30
mas01mc@292 31 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
mas01mc@292 32 char* indexName = new char[MAXSTR];
mas01mc@292 33 // Attempt to make new file
mas01mc@292 34 if(strlen(dbName) > (MAXSTR - 32))
mas01mc@292 35 error("dbName is too long for LSH index filename appendages");
mas01mc@292 36 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 37 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 38 return indexName;
mas01mc@292 39 }
mas01mc@292 40
mas01mc@292 41 // return true if index exists else return false
mas01mc@292 42 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
mas01mc@292 43 // Test to see if file exists
mas01mc@292 44 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 45 lshfid = open (indexName, O_RDONLY);
mas01mc@292 46 delete[] indexName;
mas01mc@292 47 close(lshfid);
mas01mc@292 48
mas01mc@292 49 if(lshfid<0)
mas01mc@292 50 return false;
mas01mc@292 51 else
mas01mc@292 52 return true;
mas01mc@292 53 }
mas01mc@292 54
mas01mc@308 55 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
mas01mc@308 56 LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
mas01mc@308 57 if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
mas01mc@308 58 audioDB::lsh = gIndx; // Use the global SERVER resident index
mas01mc@308 59 else{
mas01mc@308 60 if(audioDB::lsh)
mas01mc@308 61 delete audioDB::lsh;
mas01mc@308 62 audioDB::lsh = new LSH(indexName, load_hashTables);
mas01mc@308 63 }
mas01mc@308 64 assert(audioDB::lsh);
mas01mc@308 65 return audioDB::lsh;
mas01mc@308 66 }
mas01mc@308 67
mas01mc@292 68 vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){
mas01mc@292 69 if(vv)
mas01mc@292 70 delete vv;
mas01mc@292 71 vv = new vector<vector<float> >(sz);
mas01mc@292 72 for(Uns32T i=0 ; i < sz ; i++)
mas01mc@292 73 (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage
mas01mc@292 74 return vv;
mas01mc@292 75 }
mas01mc@292 76
mas01mc@292 77 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 78
mas01mc@292 79 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 80 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@292 81 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 82 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@292 83
mas01mc@292 84 double *snpp = *snp, *sppp = 0;
mas01mc@292 85 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 86
mas01mc@292 87 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@292 88 error("database not power-enabled", dbName);
mas01mc@292 89 }
mas01mc@292 90 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 91 sppp = *spp;
mas01mc@292 92 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@292 93
mas01mc@292 94 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 95 if(trackTable[i] >= sequenceLength) {
mas01mc@292 96 sequence_sum(snpp, trackTable[i], sequenceLength);
mas01mc@292 97 sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 98
mas01mc@292 99 sequence_sum(sppp, trackTable[i], sequenceLength);
mas01mc@292 100 sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 101 }
mas01mc@292 102 snpp += trackTable[i];
mas01mc@292 103 sppp += trackTable[i];
mas01mc@292 104 }
mas01mc@292 105
mas01mc@292 106 *vsnp = *snp;
mas01mc@292 107 *vspp = *spp;
mas01mc@292 108
mas01mc@292 109 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 110 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 111 }
mas01mc@292 112
mas01mc@292 113
mas01mc@292 114 /************************ LSH indexing ***********************************/
mas01mc@292 115 void audioDB::index_index_db(const char* dbName){
mas01mc@292 116
mas01mc@292 117 char* newIndexName;
mas01mc@292 118 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 119 Uns32T dbVectors = 0;
mas01mc@292 120
mas01mc@292 121 printf("INDEX: initializing header\n");
mas01mc@292 122 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 123 forWrite = false;
mas01mc@292 124 initDBHeader(dbName);
mas01mc@292 125
mas01mc@292 126 newIndexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 127
mas01mc@292 128 // Set unit norming flag override
mas01mc@292 129 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 130
mas01mc@292 131 printf("INDEX: dim %d\n", dbH->dim);
mas01mc@292 132 printf("INDEX: R %f\n", radius);
mas01mc@292 133 printf("INDEX: seqlen %d\n", sequenceLength);
mas01mc@292 134 printf("INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@292 135 printf("INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@292 136 printf("INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@292 137 printf("INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@296 138 printf("INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@292 139 printf("INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@292 140 printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 141 fflush(stdout);
mas01mc@292 142
mas01mc@292 143
mas01mc@292 144 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@292 145
mas01mc@292 146 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 147 printf("INDEX: constructing new LSH index\n");
mas01mc@292 148 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 149 fflush(stdout);
mas01mc@292 150 // Construct new LSH index
mas01mc@292 151 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 152 lsh_param_m,
mas01mc@292 153 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 154 lsh_param_N,
mas01mc@292 155 lsh_param_ncols,
mas01mc@292 156 (float)radius);
mas01mc@292 157 assert(lsh);
mas01mc@292 158
mas01mc@292 159 Uns32T endTrack = lsh_param_b;
mas01mc@292 160 if( endTrack > dbH->numFiles)
mas01mc@292 161 endTrack = dbH->numFiles;
mas01mc@292 162 // Insert up to lsh_param_b tracks
mas01mc@292 163 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 164 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 165
mas01mc@292 166 // Clean up
mas01mc@292 167 delete lsh;
mas01mc@308 168 lsh = 0;
mas01mc@292 169 close(lshfid);
mas01mc@292 170 }
mas01mc@292 171
mas01mc@292 172 // Attempt to open LSH file
mas01mc@292 173 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 174 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 175 fflush(stdout);
mas01mc@292 176
mas01mc@292 177 // Get the lsh header info and find how many tracks are inserted already
mas01mc@292 178 lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 179 assert(lsh);
mas01mc@292 180 Uns32T maxs = index_to_trackID(lsh->get_maxp())+1;
mas01mc@292 181 delete lsh;
mas01mc@308 182 lsh = 0;
mas01mc@292 183
mas01mc@292 184 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 185 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 186
mas01mc@292 187 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 188 if( endTrack > dbH->numFiles)
mas01mc@292 189 endTrack = dbH->numFiles;
mas01mc@292 190 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 191 fflush(stdout);
mas01mc@292 192 lsh = new LSH(newIndexName, lsh_in_core); // Initialize core memory for LSH tables
mas01mc@292 193 assert(lsh);
mas01mc@292 194
mas01mc@292 195 // Insert up to lsh_param_b database tracks
mas01mc@292 196 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 197
mas01mc@292 198 // Serialize to file
mas01mc@292 199 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 200 delete lsh;
mas01mc@308 201 lsh = 0;
mas01mc@292 202 }
mas01mc@292 203
mas01mc@292 204 close(lshfid);
mas01mc@292 205 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 206 fflush(stdout);
mas01mc@292 207
mas01mc@292 208 }
mas01mc@292 209 else{
mas01mc@292 210 error("Something's wrong with LSH index file");
mas01mc@292 211 exit(1);
mas01mc@292 212 }
mas01mc@292 213
mas01mc@292 214
mas01mc@292 215 delete[] newIndexName;
mas01mc@292 216 if(sNorm)
mas01mc@292 217 delete[] sNorm;
mas01mc@292 218 if(sPower)
mas01mc@292 219 delete[] sPower;
mas01mc@292 220
mas01mc@292 221
mas01mc@292 222 }
mas01mc@292 223
mas01mc@292 224 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 225 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 226 double** sPowerp, double** spPtrp){
mas01mc@292 227 size_t nfv = 0;
mas01mc@292 228 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 229 Uns32T trackID = 0;
mas01mc@292 230
mas01mc@292 231 VERB_LOG(1, "indexing tracks...");
mas01mc@292 232
mas01mc@292 233
mas01mc@292 234 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@292 235 read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv
mas01mc@292 236 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@292 237 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 238 break;
mas01mc@292 239 }
mas01mc@292 240 std::cout << "finished inserting." << endl;
mas01mc@292 241 }
mas01mc@292 242
mas01mc@292 243 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 244 // Loop over the current input track's vectors
mas01cr@305 245 Uns32T numVecs = 0;
mas01cr@305 246 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 247 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 248 numVecs = 0;
mas01cr@305 249 } else {
mas01cr@305 250 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 251 }
mas01cr@305 252 } else {
mas01cr@305 253 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 254 numVecs = 0;
mas01cr@305 255 } else {
mas01cr@305 256 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 257 }
mas01cr@305 258 }
mas01mc@292 259 vv = index_initialize_shingles(numVecs);
mas01mc@292 260
mas01mc@292 261 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01mc@292 262 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01mc@292 263
mas01mc@292 264 Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
mas01mc@292 265 Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01mc@292 266 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 267
mas01mc@292 268 /* index_norm_shingles() only goes as far as the end of the
mas01mc@292 269 sequence, which is right, but the space allocated is for the
mas01mc@292 270 whole track. */
mas01mc@292 271
mas01mc@292 272 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 273 * So let's be certain the pointers are in the correct place
mas01mc@292 274 */
mas01mc@292 275
mas01mc@292 276 *snpp += trackTable[trackID];
mas01mc@292 277 *sppp += trackTable[trackID];
mas01mc@292 278 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@292 279
mas01mc@292 280 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 281 std::cout.flush();
mas01mc@292 282 return true;
mas01mc@292 283 }
mas01mc@292 284
mas01mc@292 285 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 286 Uns32T collisionCount = 0;
mas01mc@292 287 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@311 288 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop)
mas01mc@311 289 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){
mas01mc@292 290 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID));
mas01mc@311 291 spp+=sequenceHop;
mas01mc@311 292 }
mas01mc@292 293 return collisionCount;
mas01mc@292 294 }
mas01mc@292 295
mas01mc@292 296 /********************* LSH shingle construction ***************************/
mas01mc@292 297
mas01mc@292 298 // Construct shingles out of a feature matrix
mas01mc@292 299 // inputs:
mas01mc@292 300 // idx is vector index in feature matrix
mas01mc@292 301 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01mc@292 302 //
mas01mc@292 303 // pre-conditions:
mas01mc@292 304 // dbH->dim
mas01mc@292 305 // sequenceLength
mas01mc@292 306 // idx < numVectors - sequenceLength + 1
mas01mc@292 307 //
mas01mc@292 308 // post-conditions:
mas01mc@292 309 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01mc@292 310
mas01mc@292 311 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01mc@292 312 assert(idx<(*vv).size());
mas01mc@292 313 vector<float>::iterator ve = (*vv)[idx].end();
mas01mc@292 314 vi=(*vv)[idx].begin(); // shingle iterator
mas01mc@292 315 // First feature vector in shingle
mas01mc@292 316 if(idx==0){
mas01mc@292 317 while(vi!=ve)
mas01mc@292 318 *vi++ = (float)(*fvp++);
mas01mc@292 319 }
mas01mc@292 320 // Not first feature vector in shingle
mas01mc@292 321 else{
mas01mc@292 322 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator
mas01mc@292 323 // Previous seqLen-1 dim-vectors
mas01mc@292 324 while(vi!=ve-dim)
mas01mc@292 325 *vi++=*ui++;
mas01mc@292 326 // Move data pointer to next feature vector
mas01mc@292 327 fvp += ( seqLen + idx - 1 ) * dim ;
mas01mc@292 328 // New d-vector
mas01mc@292 329 while(vi!=ve)
mas01mc@292 330 *vi++ = (float)(*fvp++);
mas01mc@292 331 }
mas01mc@292 332 }
mas01mc@292 333
mas01mc@292 334 // norm shingles
mas01mc@292 335 // in-place norming, no deletions
mas01mc@292 336 // If using power, return number of shingles above power threshold
mas01mc@292 337 int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){
mas01mc@292 338 int z = 0; // number of above-threshold shingles
mas01mc@292 339 float l2norm;
mas01mc@292 340 double power;
mas01mc@292 341 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01mc@292 342 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01mc@292 343 if(!spp)
mas01mc@292 344 error("LSH indexing and query requires a power feature using -w or -W");
mas01mc@292 345 Uns32T shingleSize = sequenceLength*dbH->dim;
mas01mc@292 346 for(Uns32T a=0; a<(*vv).size(); a++){
mas01mc@292 347 l2norm = (float)(*snp++);
mas01mc@292 348 if(audioDB::normalizedDistance)
mas01mc@292 349 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01mc@292 350
mas01mc@292 351 for(Uns32T b=0; b < shingleSize ; b++)
mas01mc@292 352 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01mc@292 353
mas01mc@292 354 power = *spp++;
mas01mc@292 355 if(use_absolute_threshold){
mas01mc@292 356 if ( power >= absolute_threshold )
mas01mc@292 357 z++;
mas01mc@292 358 }
mas01mc@292 359 else
mas01mc@292 360 z++;
mas01mc@292 361 }
mas01mc@292 362 return z;
mas01mc@292 363 }
mas01mc@292 364
mas01mc@292 365
mas01mc@292 366 /*********************** LSH retrieval ****************************/
mas01mc@292 367
mas01mc@292 368
mas01mc@292 369 // return true if indexed query performed else return false
mas01mc@292 370 int audioDB::index_init_query(const char* dbName){
mas01mc@292 371
mas01mc@292 372 if(!(index_exists(dbName, radius, sequenceLength)))
mas01mc@292 373 return false;
mas01mc@292 374
mas01mc@292 375 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 376
mas01mc@292 377 // Test to see if file exists
mas01mc@292 378 if((lshfid = open (indexName, O_RDONLY)) < 0){
mas01mc@292 379 delete[] indexName;
mas01mc@292 380 return false;
mas01mc@292 381 }
mas01mc@292 382
mas01mc@308 383 lsh = index_allocate(indexName, false); // Get the header only here
mas01mc@292 384 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
mas01mc@292 385
mas01mc@311 386 if(lsh!=SERVER_LSH_INDEX_SINGLETON){
mas01mc@308 387 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
mas01mc@308 388 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
mas01mc@308 389 printf("INDEX: dim %d\n", dbH->dim);
mas01mc@308 390 printf("INDEX: R %f\n", lsh->get_radius());
mas01mc@308 391 printf("INDEX: seqlen %d\n", sequenceLength);
mas01mc@308 392 printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
mas01mc@308 393 printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
mas01mc@308 394 printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
mas01mc@308 395 printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
mas01mc@308 396 printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp()));
mas01mc@308 397 printf("INDEX: Opened LSH index file %s\n", indexName);
mas01mc@308 398 fflush(stdout);
mas01mc@308 399 }
mas01mc@292 400
mas01mc@292 401 // Check to see if we are loading hash tables into core, and do so if true
mas01mc@292 402 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
mas01mc@308 403 if(SERVER_LSH_INDEX_SINGLETON)
mas01mc@308 404 fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName());
mas01mc@308 405 else
mas01mc@308 406 printf("INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
mas01mc@308 407 lsh = index_allocate(indexName, true);
mas01mc@292 408 }
mas01mc@292 409
mas01mc@292 410 delete[] indexName;
mas01mc@292 411 return true;
mas01mc@292 412 }
mas01mc@292 413
mas01mc@292 414 // *Static* approximate NN point reporter callback method for lshlib
mas01mc@292 415 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 416 assert(instancePtr); // We need an instance for this callback
mas01mc@292 417 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@292 418 Uns32T trackID = index_to_trackID(pointID);
mas01mc@292 419 Uns32T spos = index_to_trackPos(pointID);
mas01mc@292 420 // Skip identity in query_from_key
mas01mc@292 421 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 422 myself->reporter->add_point(trackID, qpos, spos, dist);
mas01mc@292 423 }
mas01mc@292 424
mas01mc@292 425 // *Static* exact NN point reporter callback method for lshlib
mas01mc@292 426 // Maintain a queue of points to pass to query_points() for exact evaluation
mas01mc@292 427 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 428 assert(instancePtr); // We need an instance for this callback
mas01mc@292 429 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@292 430 Uns32T trackID = index_to_trackID(pointID);
mas01mc@292 431 Uns32T spos = index_to_trackPos(pointID);
mas01mc@292 432 // Skip identity in query_from_key
mas01mc@292 433 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 434 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
mas01mc@292 435 }
mas01mc@292 436
mas01mc@292 437 void audioDB::initialize_exact_evalutation_queue(){
mas01mc@292 438 if(exact_evaluation_queue)
mas01mc@292 439 delete exact_evaluation_queue;
mas01mc@292 440 exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >;
mas01mc@292 441 }
mas01mc@292 442
mas01mc@292 443 void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){
mas01mc@292 444 PointPair p(trackID, qpos, spos);
mas01mc@292 445 exact_evaluation_queue->push(p);
mas01mc@292 446 }
mas01mc@292 447
mas01mc@292 448 // return 0: if index does not exist
mas01mc@292 449 // return nqv: if index exists
mas01mc@292 450 int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) {
mas01mc@292 451
mas01mc@292 452 unsigned int numVectors;
mas01mc@292 453 double *query, *query_data;
mas01mc@292 454 double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0;
mas01mc@292 455 double meanQdur;
mas01mc@292 456 void (*add_point_func)(void*,Uns32T,Uns32T,float);
mas01mc@292 457
mas01mc@292 458 // Set the point-reporter callback based on the value of lsh_exact
mas01mc@292 459 if(lsh_exact){
mas01mc@292 460 initialize_exact_evalutation_queue();
mas01mc@292 461 add_point_func = &index_add_point_exact;
mas01mc@292 462 }
mas01mc@292 463 else
mas01mc@292 464 add_point_func = &index_add_point_approximate;
mas01mc@292 465
mas01mc@292 466 if(!index_init_query(dbName)) // sets-up LSH index structures for querying
mas01mc@292 467 return 0;
mas01mc@292 468
mas01mc@292 469 char* database = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 470
mas01mc@292 471 if(query_from_key)
mas01mc@292 472 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
mas01mc@292 473 else
mas01mc@292 474 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors
mas01mc@292 475
mas01mc@292 476 VERB_LOG(1, "retrieving tracks...");
mas01mc@292 477
mas01mc@292 478 assert(pointNN>0 && pointNN<=O2_MAXNN);
mas01mc@292 479 assert(trackNN>0 && trackNN<=O2_MAXNN);
mas01mc@292 480
mas01mc@292 481 gettimeofday(&tv1, NULL);
mas01mc@292 482 // query vector index
mas01mc@292 483 Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1;
mas01mc@292 484 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
mas01mc@292 485 cout << "Nq=" << Nq; cout.flush();
mas01mc@292 486 // Construct shingles from query features
mas01mc@292 487 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
mas01mc@292 488 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
mas01mc@292 489
mas01mc@292 490 // Normalize query vectors
mas01mc@292 491 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr );
mas01mc@292 492 cout << " Nq'=" << numVecsAboveThreshold << endl; cout.flush();
mas01mc@292 493
mas01mc@292 494 // Nq contains number of inspected points in query file,
mas01mc@292 495 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01mc@292 496 double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation
mas01mc@292 497 if(usingQueryPoint && numVecsAboveThreshold){
mas01mc@292 498 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
mas01mc@292 499 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 500 else
mas01mc@292 501 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 502 }
mas01mc@292 503 else if(numVecsAboveThreshold)
mas01mc@292 504 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
mas01mc@292 505 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold)))
mas01mc@292 506 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
mas01mc@292 507 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this);
mas01mc@292 508 else
mas01mc@292 509 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this);
mas01mc@292 510
mas01mc@292 511 if(lsh_exact)
mas01mc@292 512 // Perform exact distance computation on point pairs in exact_evaluation_queue
mas01mc@292 513 query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors);
mas01mc@292 514
mas01mc@292 515 gettimeofday(&tv2,NULL);
mas01mc@292 516 VERB_LOG(1,"elapsed time: %ld msec\n",
mas01mc@292 517 (tv2.tv_sec*1000 + tv2.tv_usec/1000) -
mas01mc@292 518 (tv1.tv_sec*1000 + tv1.tv_usec/1000))
mas01mc@292 519
mas01mc@292 520 // Close the index file
mas01mc@292 521 close(lshfid);
mas01mc@292 522
mas01mc@292 523 // Clean up
mas01mc@292 524 if(query_data)
mas01mc@292 525 delete[] query_data;
mas01mc@292 526 if(qNorm)
mas01mc@292 527 delete[] qNorm;
mas01mc@292 528 if(qPower)
mas01mc@292 529 delete[] qPower;
mas01mc@292 530 if(database)
mas01mc@292 531 delete[] database;
mas01mc@292 532
mas01mc@292 533 return Nq;
mas01mc@292 534 }
mas01mc@292 535