annotate index.cpp @ 427:adaa6a688a04 api-inversion

Move sequence_foo() functions out of audioDB:: namespace... ... and into the internals header, mostly to get them out of the way. That means they have to be inline, which is probably suboptimal but will do for now.
author mas01cr
date Wed, 24 Dec 2008 10:55:24 +0000
parents 4a22a0bdf9a9
children 8632cd387e24
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01cr@426 15 #include "audioDB-internals.h"
mas01mc@292 16
mas01mc@292 17 /************************* LSH point index to audioDB conversion *****************/
mas01mc@324 18 Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){
mas01mc@324 19 assert(nPntBits);
mas01mc@324 20 return lshID>>nPntBits;
mas01mc@292 21 }
mas01mc@292 22
mas01mc@324 23 Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){
mas01mc@324 24 assert(nPntBits);
mas01mc@324 25 return lshID&((1<<nPntBits)-1);
mas01mc@292 26 }
mas01mc@292 27
mas01mc@324 28 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){
mas01mc@324 29 assert(nPntBits);
mas01mc@324 30 return (trackID << nPntBits) | spos;
mas01mc@292 31 }
mas01mc@292 32
mas01mc@292 33 /************************* LSH indexing and query initialization *****************/
mas01mc@292 34
mas01mc@292 35 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
mas01mc@292 36 char* indexName = new char[MAXSTR];
mas01mc@292 37 // Attempt to make new file
mas01mc@292 38 if(strlen(dbName) > (MAXSTR - 32))
mas01mc@292 39 error("dbName is too long for LSH index filename appendages");
mas01mc@292 40 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 41 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 42 return indexName;
mas01mc@292 43 }
mas01mc@292 44
mas01mc@292 45 // return true if index exists else return false
mas01mc@292 46 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
mas01mc@292 47 // Test to see if file exists
mas01mc@292 48 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 49 lshfid = open (indexName, O_RDONLY);
mas01mc@292 50 delete[] indexName;
mas01mc@292 51 close(lshfid);
mas01mc@292 52
mas01mc@292 53 if(lshfid<0)
mas01mc@292 54 return false;
mas01mc@292 55 else
mas01mc@292 56 return true;
mas01mc@292 57 }
mas01mc@292 58
mas01mc@324 59 // If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName())
mas01mc@324 60 // If they match, i.e. path+dbName_resident == path+dbName_requested, use
mas01mc@324 61 // the memory-resident index.
mas01mc@324 62 // Else allocate a new LSH instance and load the index from disk
mas01mc@308 63 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
mas01mc@308 64 LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
mas01mc@308 65 if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
mas01mc@308 66 audioDB::lsh = gIndx; // Use the global SERVER resident index
mas01mc@308 67 else{
mas01mc@308 68 if(audioDB::lsh)
mas01mc@308 69 delete audioDB::lsh;
mas01mc@308 70 audioDB::lsh = new LSH(indexName, load_hashTables);
mas01mc@308 71 }
mas01mc@308 72 assert(audioDB::lsh);
mas01mc@308 73 return audioDB::lsh;
mas01mc@308 74 }
mas01mc@308 75
mas01mc@292 76 vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){
mas01mc@292 77 if(vv)
mas01mc@292 78 delete vv;
mas01mc@292 79 vv = new vector<vector<float> >(sz);
mas01mc@292 80 for(Uns32T i=0 ; i < sz ; i++)
mas01mc@292 81 (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage
mas01mc@292 82 return vv;
mas01mc@292 83 }
mas01mc@292 84
mas01mc@292 85 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 86
mas01mc@292 87 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 88 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 89 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 90 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 91 }
mas01mc@324 92
mas01mc@325 93 double *snpp = 0, *sppp = 0;
mas01mc@324 94
mas01mc@292 95 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 96 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 97 snpp = *snp;
mas01mc@292 98 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 99 sppp = *spp;
mas01mc@325 100
mas01mc@324 101 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 102 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 103
mas01mc@324 104
mas01mc@292 105 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 106 if(trackTable[i] >= sequenceLength) {
mas01cr@427 107 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@427 108 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 109
mas01cr@427 110 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@427 111 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 112 }
mas01mc@292 113 snpp += trackTable[i];
mas01mc@292 114 sppp += trackTable[i];
mas01mc@292 115 }
mas01mc@324 116
mas01mc@292 117 *vsnp = *snp;
mas01mc@292 118 *vspp = *spp;
mas01mc@324 119
mas01mc@292 120 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 121 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 122 }
mas01mc@292 123
mas01mc@292 124
mas01mc@292 125 /************************ LSH indexing ***********************************/
mas01mc@292 126 void audioDB::index_index_db(const char* dbName){
mas01mc@292 127 char* newIndexName;
mas01mc@292 128 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 129 Uns32T dbVectors = 0;
mas01mc@292 130
mas01mc@324 131
mas01mc@292 132 printf("INDEX: initializing header\n");
mas01mc@292 133 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 134 forWrite = false;
mas01mc@292 135 initDBHeader(dbName);
mas01mc@292 136
mas01mc@324 137 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 138 usingPower = true;
mas01mc@324 139
mas01mc@324 140 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 141 usingTimes = true;
mas01mc@324 142
mas01mc@292 143 newIndexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 144
mas01mc@292 145 // Set unit norming flag override
mas01mc@292 146 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 147
mas01mc@327 148 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 149 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 150 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 151 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 152 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 153 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 154 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 155 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 156 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 157 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 158
mas01mc@292 159 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 160 printf("INDEX: constructing new LSH index\n");
mas01mc@292 161 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 162 fflush(stdout);
mas01mc@292 163 // Construct new LSH index
mas01mc@292 164 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 165 lsh_param_m,
mas01mc@292 166 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 167 lsh_param_N,
mas01mc@292 168 lsh_param_ncols,
mas01mc@292 169 (float)radius);
mas01mc@292 170 assert(lsh);
mas01mc@292 171
mas01mc@292 172 Uns32T endTrack = lsh_param_b;
mas01mc@292 173 if( endTrack > dbH->numFiles)
mas01mc@292 174 endTrack = dbH->numFiles;
mas01mc@292 175 // Insert up to lsh_param_b tracks
mas01mc@324 176 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 177 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 178 }
mas01mc@324 179 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 180 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 181
mas01mc@292 182 // Clean up
mas01mc@292 183 delete lsh;
mas01mc@308 184 lsh = 0;
mas01mc@292 185 close(lshfid);
mas01mc@292 186 }
mas01mc@292 187
mas01mc@292 188 // Attempt to open LSH file
mas01mc@292 189 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 190 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 191 fflush(stdout);
mas01mc@340 192 char* mergeIndexName = newIndexName;
mas01mc@292 193
mas01mc@292 194 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 195 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 196 assert(lsh);
mas01mc@324 197 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
mas01mc@292 198 delete lsh;
mas01mc@308 199 lsh = 0;
mas01mc@292 200
mas01mc@340 201 // Insert up to lsh_param_b tracks
mas01mc@340 202 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 203 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 204 }
mas01mc@292 205 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 206 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 207
mas01mc@292 208 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 209 if( endTrack > dbH->numFiles)
mas01mc@292 210 endTrack = dbH->numFiles;
mas01mc@292 211 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 212 fflush(stdout);
mas01mc@340 213 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 214 assert(lsh);
mas01mc@292 215
mas01mc@292 216 // Insert up to lsh_param_b database tracks
mas01mc@292 217 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 218
mas01mc@340 219 // Serialize to file (merging is performed here)
mas01mc@340 220 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 221 delete lsh;
mas01mc@308 222 lsh = 0;
mas01mc@340 223 }
mas01mc@292 224
mas01mc@292 225 close(lshfid);
mas01mc@292 226 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 227 fflush(stdout);
mas01mc@292 228
mas01mc@292 229 }
mas01mc@292 230 else{
mas01mc@292 231 error("Something's wrong with LSH index file");
mas01mc@292 232 exit(1);
mas01mc@292 233 }
mas01mc@292 234
mas01mc@324 235 delete[] newIndexName;
mas01mc@324 236 delete[] sNorm;
mas01mc@324 237 delete[] sPower;
mas01mc@324 238 }
mas01mc@292 239
mas01mc@292 240
mas01cr@405 241 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@405 242 if(usingPower){
mas01cr@405 243 int one;
mas01cr@405 244 unsigned int count;
mas01cr@405 245
mas01cr@405 246 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@405 247 if (count != sizeof(unsigned int)) {
mas01cr@405 248 error("powerfd read failed", "int", "read");
mas01cr@405 249 }
mas01cr@405 250 if (one != 1) {
mas01cr@405 251 error("dimensionality of power file not 1", powerFileName);
mas01cr@405 252 }
mas01cr@405 253
mas01cr@405 254 // FIXME: should check that the powerfile is the right size for
mas01cr@405 255 // this. -- CSR, 2007-10-30
mas01cr@405 256 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@405 257 if (count != numVectors * sizeof(double)) {
mas01cr@405 258 error("powerfd read failed", "double", "read");
mas01cr@405 259 }
mas01cr@405 260 }
mas01cr@405 261 }
mas01cr@405 262
mas01mc@324 263 // initialize auxillary track data from filesystem
mas01mc@324 264 // pre-conditions:
mas01mc@324 265 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 266 // feature data allocated and copied (fvp)
mas01mc@324 267 //
mas01mc@324 268 // post-conditions:
mas01mc@324 269 // allocated power data
mas01mc@324 270 // allocated l2norm data
mas01mc@324 271 //
mas01mc@324 272 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 273 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 274 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 275
mas01mc@324 276 // Allocate and read the power sequence
mas01mc@324 277 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 278
mas01mc@324 279 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 280 char* tmpStr = prefixedString;
mas01mc@324 281 // Open and check dimensions of power file
mas01mc@324 282 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 283 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 284 if(prefixedString!=tmpStr)
mas01mc@324 285 delete[] tmpStr;
mas01mc@324 286 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 287 if (powerfd < 0) {
mas01mc@324 288 error("failed to open power file", prefixedString);
mas01mc@324 289 }
mas01mc@324 290 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 291 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 292 }
mas01mc@324 293
mas01mc@324 294 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 295 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 296
mas01mc@324 297 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 298 assert(*sPowerp);
mas01mc@324 299 *spPtrp = *sPowerp;
mas01mc@324 300 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 301 if (0 < powerfd) {
mas01mc@324 302 close(powerfd);
mas01mc@324 303 }
mas01mc@324 304
mas01cr@427 305 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@427 306 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 307 powerTable = 0;
mas01mc@324 308
mas01mc@324 309 // Allocate and calculate the l2norm sequence
mas01mc@324 310 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 311 assert(*sNormpp);
mas01mc@324 312 *snPtrp = *sNormpp;
mas01cr@426 313 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@427 314 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@427 315 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 316 }
mas01mc@292 317 }
mas01mc@292 318
mas01mc@292 319 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 320 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 321 double** sPowerp, double** spPtrp){
mas01mc@292 322 size_t nfv = 0;
mas01mc@292 323 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 324 Uns32T trackID = 0;
mas01mc@292 325
mas01mc@292 326 VERB_LOG(1, "indexing tracks...");
mas01mc@292 327
mas01mc@324 328 int trackfd = dbfid;
mas01mc@292 329 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 330 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 331 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 332 char* tmpStr = prefixedString;
mas01mc@324 333 // Open and check dimensions of feature file
mas01mc@324 334 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 335 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 336 if(prefixedString!=tmpStr)
mas01mc@324 337 delete[] tmpStr;
mas01mc@324 338 initInputFile(prefixedString, false); // nommap, file pointer at correct position
mas01mc@324 339 trackfd = infid;
mas01mc@324 340 }
mas01mc@324 341 read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv
mas01mc@292 342 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 343
mas01mc@324 344 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 345 // Load power and calculate power and l2norm sequence sums
mas01mc@324 346 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 347
mas01mc@292 348 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 349 break;
mas01mc@324 350 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 351 close(infid);
mas01mc@324 352 delete[] *sNormpp;
mas01mc@324 353 delete[] *sPowerp;
mas01mc@324 354 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 355 }
mas01mc@324 356 } // end for(trackID = start_track ; ... )
mas01mc@292 357 std::cout << "finished inserting." << endl;
mas01mc@292 358 }
mas01mc@292 359
mas01mc@292 360 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 361 // Loop over the current input track's vectors
mas01cr@305 362 Uns32T numVecs = 0;
mas01cr@305 363 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 364 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 365 numVecs = 0;
mas01cr@305 366 } else {
mas01cr@305 367 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 368 }
mas01cr@305 369 } else {
mas01cr@305 370 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 371 numVecs = 0;
mas01cr@305 372 } else {
mas01cr@305 373 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 374 }
mas01cr@305 375 }
mas01mc@292 376
mas01mc@324 377 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 378 if(numVecs){
mas01mc@324 379 vv = index_initialize_shingles(numVecs);
mas01mc@324 380
mas01mc@324 381 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01mc@324 382 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01mc@324 383
mas01mc@324 384 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
mas01mc@324 385 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01mc@324 386 }
mas01mc@292 387 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 388
mas01mc@292 389 /* index_norm_shingles() only goes as far as the end of the
mas01mc@292 390 sequence, which is right, but the space allocated is for the
mas01mc@292 391 whole track. */
mas01mc@292 392
mas01mc@292 393 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 394 * So let's be certain the pointers are in the correct place
mas01mc@292 395 */
mas01mc@292 396
mas01mc@324 397 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 398 *snpp += trackTable[trackID];
mas01mc@324 399 *sppp += trackTable[trackID];
mas01mc@324 400 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 401 }
mas01mc@292 402
mas01mc@292 403 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 404 std::cout.flush();
mas01mc@292 405 return true;
mas01mc@292 406 }
mas01mc@292 407
mas01mc@292 408 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 409 Uns32T collisionCount = 0;
mas01mc@292 410 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 411 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 412 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01mc@324 413 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
mas01mc@324 414 spp+=sequenceHop;
mas01mc@311 415 }
mas01mc@292 416 return collisionCount;
mas01mc@292 417 }
mas01mc@292 418
mas01mc@292 419 /********************* LSH shingle construction ***************************/
mas01mc@292 420
mas01mc@292 421 // Construct shingles out of a feature matrix
mas01mc@292 422 // inputs:
mas01mc@292 423 // idx is vector index in feature matrix
mas01mc@292 424 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01mc@292 425 //
mas01mc@292 426 // pre-conditions:
mas01mc@292 427 // dbH->dim
mas01mc@292 428 // sequenceLength
mas01mc@292 429 // idx < numVectors - sequenceLength + 1
mas01mc@292 430 //
mas01mc@292 431 // post-conditions:
mas01mc@292 432 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01mc@292 433
mas01mc@292 434 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01mc@292 435 assert(idx<(*vv).size());
mas01mc@292 436 vector<float>::iterator ve = (*vv)[idx].end();
mas01mc@292 437 vi=(*vv)[idx].begin(); // shingle iterator
mas01mc@292 438 // First feature vector in shingle
mas01mc@292 439 if(idx==0){
mas01mc@292 440 while(vi!=ve)
mas01mc@292 441 *vi++ = (float)(*fvp++);
mas01mc@292 442 }
mas01mc@292 443 // Not first feature vector in shingle
mas01mc@292 444 else{
mas01mc@292 445 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator
mas01mc@292 446 // Previous seqLen-1 dim-vectors
mas01mc@292 447 while(vi!=ve-dim)
mas01mc@292 448 *vi++=*ui++;
mas01mc@292 449 // Move data pointer to next feature vector
mas01mc@292 450 fvp += ( seqLen + idx - 1 ) * dim ;
mas01mc@292 451 // New d-vector
mas01mc@292 452 while(vi!=ve)
mas01mc@292 453 *vi++ = (float)(*fvp++);
mas01mc@292 454 }
mas01mc@292 455 }
mas01mc@292 456
mas01mc@292 457 // norm shingles
mas01mc@292 458 // in-place norming, no deletions
mas01mc@292 459 // If using power, return number of shingles above power threshold
mas01mc@292 460 int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){
mas01mc@292 461 int z = 0; // number of above-threshold shingles
mas01mc@292 462 float l2norm;
mas01mc@292 463 double power;
mas01mc@292 464 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01mc@292 465 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01mc@292 466 if(!spp)
mas01mc@292 467 error("LSH indexing and query requires a power feature using -w or -W");
mas01mc@292 468 Uns32T shingleSize = sequenceLength*dbH->dim;
mas01mc@292 469 for(Uns32T a=0; a<(*vv).size(); a++){
mas01mc@292 470 l2norm = (float)(*snp++);
mas01mc@292 471 if(audioDB::normalizedDistance)
mas01mc@292 472 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01mc@292 473
mas01mc@292 474 for(Uns32T b=0; b < shingleSize ; b++)
mas01mc@292 475 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01mc@292 476
mas01mc@292 477 power = *spp++;
mas01mc@292 478 if(use_absolute_threshold){
mas01mc@292 479 if ( power >= absolute_threshold )
mas01mc@292 480 z++;
mas01mc@292 481 }
mas01mc@292 482 else
mas01mc@292 483 z++;
mas01mc@292 484 }
mas01mc@292 485 return z;
mas01mc@292 486 }
mas01mc@292 487
mas01mc@292 488
mas01mc@292 489 /*********************** LSH retrieval ****************************/
mas01mc@292 490
mas01mc@292 491
mas01mc@292 492 // return true if indexed query performed else return false
mas01mc@292 493 int audioDB::index_init_query(const char* dbName){
mas01mc@292 494
mas01mc@292 495 if(!(index_exists(dbName, radius, sequenceLength)))
mas01mc@292 496 return false;
mas01mc@292 497
mas01mc@292 498 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 499
mas01mc@292 500 // Test to see if file exists
mas01mc@292 501 if((lshfid = open (indexName, O_RDONLY)) < 0){
mas01mc@292 502 delete[] indexName;
mas01mc@292 503 return false;
mas01mc@292 504 }
mas01mc@292 505
mas01mc@308 506 lsh = index_allocate(indexName, false); // Get the header only here
mas01mc@292 507 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
mas01mc@292 508
mas01mc@311 509 if(lsh!=SERVER_LSH_INDEX_SINGLETON){
mas01mc@308 510 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
mas01mc@308 511 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
mas01mc@327 512 VERB_LOG(1,"INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 513 VERB_LOG(1,"INDEX: R %f\n", lsh->get_radius());
mas01mc@327 514 VERB_LOG(1,"INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 515 VERB_LOG(1,"INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
mas01mc@327 516 VERB_LOG(1,"INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
mas01mc@327 517 VERB_LOG(1,"INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
mas01mc@327 518 VERB_LOG(1,"INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
mas01mc@327 519 VERB_LOG(1,"INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
mas01mc@327 520 VERB_LOG(1,"INDEX: Opened LSH index file %s\n", indexName);
mas01mc@308 521 }
mas01mc@292 522
mas01mc@292 523 // Check to see if we are loading hash tables into core, and do so if true
mas01mc@292 524 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
mas01mc@308 525 if(SERVER_LSH_INDEX_SINGLETON)
mas01mc@308 526 fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName());
mas01mc@308 527 else
mas01mc@327 528 VERB_LOG(1,"INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
mas01mc@308 529 lsh = index_allocate(indexName, true);
mas01mc@292 530 }
mas01mc@292 531
mas01mc@292 532 delete[] indexName;
mas01mc@292 533 return true;
mas01mc@292 534 }
mas01mc@292 535
mas01mc@292 536 // *Static* approximate NN point reporter callback method for lshlib
mas01mc@292 537 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 538 assert(instancePtr); // We need an instance for this callback
mas01mc@292 539 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@324 540 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
mas01mc@324 541 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
mas01mc@292 542 // Skip identity in query_from_key
mas01cr@424 543 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) {
mas01cr@424 544 adb_result_t r;
mas01cr@424 545 r.key = myself->fileTable + trackID * O2_FILETABLE_ENTRY_SIZE;
mas01cr@424 546 r.dist = dist;
mas01cr@424 547 r.qpos = qpos;
mas01cr@424 548 r.ipos = spos;
mas01cr@424 549 myself->accumulator->add_point(&r);
mas01cr@424 550 }
mas01mc@292 551 }
mas01mc@292 552
mas01mc@292 553 // *Static* exact NN point reporter callback method for lshlib
mas01mc@292 554 // Maintain a queue of points to pass to query_points() for exact evaluation
mas01mc@292 555 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 556 assert(instancePtr); // We need an instance for this callback
mas01mc@292 557 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@324 558 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
mas01mc@324 559 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
mas01mc@292 560 // Skip identity in query_from_key
mas01mc@292 561 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 562 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
mas01mc@292 563 }
mas01mc@292 564
mas01mc@292 565 void audioDB::initialize_exact_evalutation_queue(){
mas01mc@292 566 if(exact_evaluation_queue)
mas01mc@292 567 delete exact_evaluation_queue;
mas01mc@292 568 exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >;
mas01mc@292 569 }
mas01mc@292 570
mas01mc@292 571 void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){
mas01mc@292 572 PointPair p(trackID, qpos, spos);
mas01mc@292 573 exact_evaluation_queue->push(p);
mas01mc@292 574 }
mas01mc@292 575
mas01mc@292 576 // return 0: if index does not exist
mas01mc@292 577 // return nqv: if index exists
mas01cr@425 578 int audioDB::index_query_loop(adb_query_refine_t *refine, const char* dbName, Uns32T queryIndex) {
mas01mc@292 579
mas01mc@324 580 unsigned int numVectors = 0;
mas01mc@324 581 double *query = 0, *query_data = 0;
mas01mc@324 582 double *qNorm = 0, *qnPtr = 0, *qPower = 0, *qpPtr = 0;
mas01mc@324 583 double meanQdur = 0;
mas01mc@292 584 void (*add_point_func)(void*,Uns32T,Uns32T,float);
mas01mc@292 585
mas01mc@292 586 // Set the point-reporter callback based on the value of lsh_exact
mas01mc@292 587 if(lsh_exact){
mas01mc@292 588 initialize_exact_evalutation_queue();
mas01mc@292 589 add_point_func = &index_add_point_exact;
mas01mc@292 590 }
mas01mc@292 591 else
mas01mc@292 592 add_point_func = &index_add_point_approximate;
mas01mc@292 593
mas01mc@292 594 if(!index_init_query(dbName)) // sets-up LSH index structures for querying
mas01mc@292 595 return 0;
mas01mc@292 596
mas01mc@292 597 char* database = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 598
mas01mc@292 599 if(query_from_key)
mas01mc@292 600 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
mas01mc@292 601 else
mas01mc@292 602 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors
mas01mc@292 603
mas01mc@292 604 VERB_LOG(1, "retrieving tracks...");
mas01mc@292 605
mas01mc@292 606 assert(pointNN>0 && pointNN<=O2_MAXNN);
mas01mc@292 607 assert(trackNN>0 && trackNN<=O2_MAXNN);
mas01mc@292 608
mas01mc@292 609 gettimeofday(&tv1, NULL);
mas01mc@292 610 // query vector index
mas01mc@292 611 Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1;
mas01mc@292 612 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
mas01mc@327 613 VERB_LOG(1, "Nq=%d", Nq);
mas01mc@292 614 // Construct shingles from query features
mas01mc@292 615 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
mas01mc@292 616 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
mas01mc@292 617
mas01mc@292 618 // Normalize query vectors
mas01mc@292 619 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr );
mas01mc@327 620 VERB_LOG(1, "Nq'=%d\n", numVecsAboveThreshold);
mas01mc@292 621
mas01mc@292 622 // Nq contains number of inspected points in query file,
mas01mc@292 623 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01mc@292 624 double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation
mas01mc@292 625 if(usingQueryPoint && numVecsAboveThreshold){
mas01mc@292 626 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
mas01mc@292 627 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 628 else
mas01mc@292 629 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 630 }
mas01mc@292 631 else if(numVecsAboveThreshold)
mas01mc@292 632 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
mas01cr@370 633 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@370 634 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01mc@292 635 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this);
mas01cr@370 636 } else {
mas01mc@292 637 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this);
mas01cr@370 638 }
mas01cr@370 639 }
mas01mc@292 640
mas01mc@292 641 if(lsh_exact)
mas01mc@292 642 // Perform exact distance computation on point pairs in exact_evaluation_queue
mas01cr@425 643 query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors, refine);
mas01mc@292 644
mas01mc@292 645 gettimeofday(&tv2,NULL);
mas01mc@292 646 VERB_LOG(1,"elapsed time: %ld msec\n",
mas01mc@292 647 (tv2.tv_sec*1000 + tv2.tv_usec/1000) -
mas01mc@292 648 (tv1.tv_sec*1000 + tv1.tv_usec/1000))
mas01mc@292 649
mas01mc@292 650 // Close the index file
mas01mc@292 651 close(lshfid);
mas01mc@292 652
mas01mc@292 653 // Clean up
mas01mc@292 654 if(query_data)
mas01mc@292 655 delete[] query_data;
mas01mc@292 656 if(qNorm)
mas01mc@292 657 delete[] qNorm;
mas01mc@292 658 if(qPower)
mas01mc@292 659 delete[] qPower;
mas01mc@292 660 if(database)
mas01mc@292 661 delete[] database;
mas01mc@292 662
mas01mc@292 663 return Nq;
mas01mc@292 664 }
mas01mc@292 665