annotate index.cpp @ 411:ad2206c24986 api-inversion

Fix a memory corruption bug. When allocating the adb_t in audiodb_open(), zero the memory; then we're not going to try to free() or delete some arbitrary uninitialized thing if the thing that we're opening turns out not to be an audiodb database.
author mas01cr
date Thu, 11 Dec 2008 08:54:06 +0000
parents ef4792df8f93
children 9b13ea44e302
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01mc@292 15 #include "ReporterBase.h"
mas01mc@292 16
mas01mc@292 17
mas01mc@292 18 /************************* LSH point index to audioDB conversion *****************/
mas01mc@324 19 Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){
mas01mc@324 20 assert(nPntBits);
mas01mc@324 21 return lshID>>nPntBits;
mas01mc@292 22 }
mas01mc@292 23
mas01mc@324 24 Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){
mas01mc@324 25 assert(nPntBits);
mas01mc@324 26 return lshID&((1<<nPntBits)-1);
mas01mc@292 27 }
mas01mc@292 28
mas01mc@324 29 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){
mas01mc@324 30 assert(nPntBits);
mas01mc@324 31 return (trackID << nPntBits) | spos;
mas01mc@292 32 }
mas01mc@292 33
mas01mc@292 34 /************************* LSH indexing and query initialization *****************/
mas01mc@292 35
mas01mc@292 36 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
mas01mc@292 37 char* indexName = new char[MAXSTR];
mas01mc@292 38 // Attempt to make new file
mas01mc@292 39 if(strlen(dbName) > (MAXSTR - 32))
mas01mc@292 40 error("dbName is too long for LSH index filename appendages");
mas01mc@292 41 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 42 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 43 return indexName;
mas01mc@292 44 }
mas01mc@292 45
mas01mc@292 46 // return true if index exists else return false
mas01mc@292 47 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
mas01mc@292 48 // Test to see if file exists
mas01mc@292 49 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 50 lshfid = open (indexName, O_RDONLY);
mas01mc@292 51 delete[] indexName;
mas01mc@292 52 close(lshfid);
mas01mc@292 53
mas01mc@292 54 if(lshfid<0)
mas01mc@292 55 return false;
mas01mc@292 56 else
mas01mc@292 57 return true;
mas01mc@292 58 }
mas01mc@292 59
mas01mc@324 60 // If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName())
mas01mc@324 61 // If they match, i.e. path+dbName_resident == path+dbName_requested, use
mas01mc@324 62 // the memory-resident index.
mas01mc@324 63 // Else allocate a new LSH instance and load the index from disk
mas01mc@308 64 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
mas01mc@308 65 LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
mas01mc@308 66 if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
mas01mc@308 67 audioDB::lsh = gIndx; // Use the global SERVER resident index
mas01mc@308 68 else{
mas01mc@308 69 if(audioDB::lsh)
mas01mc@308 70 delete audioDB::lsh;
mas01mc@308 71 audioDB::lsh = new LSH(indexName, load_hashTables);
mas01mc@308 72 }
mas01mc@308 73 assert(audioDB::lsh);
mas01mc@308 74 return audioDB::lsh;
mas01mc@308 75 }
mas01mc@308 76
mas01mc@292 77 vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){
mas01mc@292 78 if(vv)
mas01mc@292 79 delete vv;
mas01mc@292 80 vv = new vector<vector<float> >(sz);
mas01mc@292 81 for(Uns32T i=0 ; i < sz ; i++)
mas01mc@292 82 (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage
mas01mc@292 83 return vv;
mas01mc@292 84 }
mas01mc@292 85
mas01mc@292 86 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 87
mas01mc@292 88 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 89 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 90 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 91 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 92 }
mas01mc@324 93
mas01mc@325 94 double *snpp = 0, *sppp = 0;
mas01mc@324 95
mas01mc@292 96 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 97 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 98 snpp = *snp;
mas01mc@292 99 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 100 sppp = *spp;
mas01mc@325 101
mas01mc@324 102 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 103 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 104
mas01mc@324 105
mas01mc@292 106 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 107 if(trackTable[i] >= sequenceLength) {
mas01mc@292 108 sequence_sum(snpp, trackTable[i], sequenceLength);
mas01mc@292 109 sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 110
mas01mc@292 111 sequence_sum(sppp, trackTable[i], sequenceLength);
mas01mc@292 112 sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 113 }
mas01mc@292 114 snpp += trackTable[i];
mas01mc@292 115 sppp += trackTable[i];
mas01mc@292 116 }
mas01mc@324 117
mas01mc@292 118 *vsnp = *snp;
mas01mc@292 119 *vspp = *spp;
mas01mc@324 120
mas01mc@292 121 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 122 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 123 }
mas01mc@292 124
mas01mc@292 125
mas01mc@292 126 /************************ LSH indexing ***********************************/
mas01mc@292 127 void audioDB::index_index_db(const char* dbName){
mas01mc@292 128 char* newIndexName;
mas01mc@292 129 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 130 Uns32T dbVectors = 0;
mas01mc@292 131
mas01mc@324 132
mas01mc@292 133 printf("INDEX: initializing header\n");
mas01mc@292 134 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 135 forWrite = false;
mas01mc@292 136 initDBHeader(dbName);
mas01mc@292 137
mas01mc@324 138 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 139 usingPower = true;
mas01mc@324 140
mas01mc@324 141 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 142 usingTimes = true;
mas01mc@324 143
mas01mc@292 144 newIndexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 145
mas01mc@292 146 // Set unit norming flag override
mas01mc@292 147 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 148
mas01mc@327 149 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 150 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 151 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 152 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 153 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 154 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 155 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 156 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 157 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 158 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 159
mas01mc@292 160 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 161 printf("INDEX: constructing new LSH index\n");
mas01mc@292 162 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 163 fflush(stdout);
mas01mc@292 164 // Construct new LSH index
mas01mc@292 165 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 166 lsh_param_m,
mas01mc@292 167 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 168 lsh_param_N,
mas01mc@292 169 lsh_param_ncols,
mas01mc@292 170 (float)radius);
mas01mc@292 171 assert(lsh);
mas01mc@292 172
mas01mc@292 173 Uns32T endTrack = lsh_param_b;
mas01mc@292 174 if( endTrack > dbH->numFiles)
mas01mc@292 175 endTrack = dbH->numFiles;
mas01mc@292 176 // Insert up to lsh_param_b tracks
mas01mc@324 177 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 178 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 179 }
mas01mc@324 180 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 181 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 182
mas01mc@292 183 // Clean up
mas01mc@292 184 delete lsh;
mas01mc@308 185 lsh = 0;
mas01mc@292 186 close(lshfid);
mas01mc@292 187 }
mas01mc@292 188
mas01mc@292 189 // Attempt to open LSH file
mas01mc@292 190 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 191 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 192 fflush(stdout);
mas01mc@340 193 char* mergeIndexName = newIndexName;
mas01mc@292 194
mas01mc@292 195 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 196 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 197 assert(lsh);
mas01mc@324 198 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
mas01mc@292 199 delete lsh;
mas01mc@308 200 lsh = 0;
mas01mc@292 201
mas01mc@340 202 // Insert up to lsh_param_b tracks
mas01mc@340 203 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 204 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 205 }
mas01mc@292 206 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 207 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 208
mas01mc@292 209 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 210 if( endTrack > dbH->numFiles)
mas01mc@292 211 endTrack = dbH->numFiles;
mas01mc@292 212 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 213 fflush(stdout);
mas01mc@340 214 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 215 assert(lsh);
mas01mc@292 216
mas01mc@292 217 // Insert up to lsh_param_b database tracks
mas01mc@292 218 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 219
mas01mc@340 220 // Serialize to file (merging is performed here)
mas01mc@340 221 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 222 delete lsh;
mas01mc@308 223 lsh = 0;
mas01mc@340 224 }
mas01mc@292 225
mas01mc@292 226 close(lshfid);
mas01mc@292 227 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 228 fflush(stdout);
mas01mc@292 229
mas01mc@292 230 }
mas01mc@292 231 else{
mas01mc@292 232 error("Something's wrong with LSH index file");
mas01mc@292 233 exit(1);
mas01mc@292 234 }
mas01mc@292 235
mas01mc@324 236 delete[] newIndexName;
mas01mc@324 237 delete[] sNorm;
mas01mc@324 238 delete[] sPower;
mas01mc@324 239 }
mas01mc@292 240
mas01mc@292 241
mas01cr@405 242 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@405 243 if(usingPower){
mas01cr@405 244 int one;
mas01cr@405 245 unsigned int count;
mas01cr@405 246
mas01cr@405 247 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@405 248 if (count != sizeof(unsigned int)) {
mas01cr@405 249 error("powerfd read failed", "int", "read");
mas01cr@405 250 }
mas01cr@405 251 if (one != 1) {
mas01cr@405 252 error("dimensionality of power file not 1", powerFileName);
mas01cr@405 253 }
mas01cr@405 254
mas01cr@405 255 // FIXME: should check that the powerfile is the right size for
mas01cr@405 256 // this. -- CSR, 2007-10-30
mas01cr@405 257 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@405 258 if (count != numVectors * sizeof(double)) {
mas01cr@405 259 error("powerfd read failed", "double", "read");
mas01cr@405 260 }
mas01cr@405 261 }
mas01cr@405 262 }
mas01cr@405 263
mas01mc@324 264 // initialize auxillary track data from filesystem
mas01mc@324 265 // pre-conditions:
mas01mc@324 266 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 267 // feature data allocated and copied (fvp)
mas01mc@324 268 //
mas01mc@324 269 // post-conditions:
mas01mc@324 270 // allocated power data
mas01mc@324 271 // allocated l2norm data
mas01mc@324 272 //
mas01mc@324 273 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 274 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 275 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 276
mas01mc@324 277 // Allocate and read the power sequence
mas01mc@324 278 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 279
mas01mc@324 280 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 281 char* tmpStr = prefixedString;
mas01mc@324 282 // Open and check dimensions of power file
mas01mc@324 283 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 284 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 285 if(prefixedString!=tmpStr)
mas01mc@324 286 delete[] tmpStr;
mas01mc@324 287 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 288 if (powerfd < 0) {
mas01mc@324 289 error("failed to open power file", prefixedString);
mas01mc@324 290 }
mas01mc@324 291 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 292 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 293 }
mas01mc@324 294
mas01mc@324 295 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 296 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 297
mas01mc@324 298 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 299 assert(*sPowerp);
mas01mc@324 300 *spPtrp = *sPowerp;
mas01mc@324 301 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 302 if (0 < powerfd) {
mas01mc@324 303 close(powerfd);
mas01mc@324 304 }
mas01mc@324 305
mas01mc@324 306 sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 307 sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 308 powerTable = 0;
mas01mc@324 309
mas01mc@324 310 // Allocate and calculate the l2norm sequence
mas01mc@324 311 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 312 assert(*sNormpp);
mas01mc@324 313 *snPtrp = *sNormpp;
mas01mc@324 314 unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01mc@324 315 sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 316 sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 317 }
mas01mc@292 318 }
mas01mc@292 319
mas01mc@292 320 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 321 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 322 double** sPowerp, double** spPtrp){
mas01mc@292 323 size_t nfv = 0;
mas01mc@292 324 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 325 Uns32T trackID = 0;
mas01mc@292 326
mas01mc@292 327 VERB_LOG(1, "indexing tracks...");
mas01mc@292 328
mas01mc@324 329 int trackfd = dbfid;
mas01mc@292 330 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 331 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 332 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 333 char* tmpStr = prefixedString;
mas01mc@324 334 // Open and check dimensions of feature file
mas01mc@324 335 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 336 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 337 if(prefixedString!=tmpStr)
mas01mc@324 338 delete[] tmpStr;
mas01mc@324 339 initInputFile(prefixedString, false); // nommap, file pointer at correct position
mas01mc@324 340 trackfd = infid;
mas01mc@324 341 }
mas01mc@324 342 read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv
mas01mc@292 343 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 344
mas01mc@324 345 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 346 // Load power and calculate power and l2norm sequence sums
mas01mc@324 347 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 348
mas01mc@292 349 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 350 break;
mas01mc@324 351 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 352 close(infid);
mas01mc@324 353 delete[] *sNormpp;
mas01mc@324 354 delete[] *sPowerp;
mas01mc@324 355 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 356 }
mas01mc@324 357 } // end for(trackID = start_track ; ... )
mas01mc@292 358 std::cout << "finished inserting." << endl;
mas01mc@292 359 }
mas01mc@292 360
mas01mc@292 361 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 362 // Loop over the current input track's vectors
mas01cr@305 363 Uns32T numVecs = 0;
mas01cr@305 364 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 365 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 366 numVecs = 0;
mas01cr@305 367 } else {
mas01cr@305 368 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 369 }
mas01cr@305 370 } else {
mas01cr@305 371 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 372 numVecs = 0;
mas01cr@305 373 } else {
mas01cr@305 374 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 375 }
mas01cr@305 376 }
mas01mc@292 377
mas01mc@324 378 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 379 if(numVecs){
mas01mc@324 380 vv = index_initialize_shingles(numVecs);
mas01mc@324 381
mas01mc@324 382 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01mc@324 383 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01mc@324 384
mas01mc@324 385 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
mas01mc@324 386 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01mc@324 387 }
mas01mc@292 388 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 389
mas01mc@292 390 /* index_norm_shingles() only goes as far as the end of the
mas01mc@292 391 sequence, which is right, but the space allocated is for the
mas01mc@292 392 whole track. */
mas01mc@292 393
mas01mc@292 394 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 395 * So let's be certain the pointers are in the correct place
mas01mc@292 396 */
mas01mc@292 397
mas01mc@324 398 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 399 *snpp += trackTable[trackID];
mas01mc@324 400 *sppp += trackTable[trackID];
mas01mc@324 401 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 402 }
mas01mc@292 403
mas01mc@292 404 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 405 std::cout.flush();
mas01mc@292 406 return true;
mas01mc@292 407 }
mas01mc@292 408
mas01mc@292 409 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 410 Uns32T collisionCount = 0;
mas01mc@292 411 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 412 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 413 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01mc@324 414 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
mas01mc@324 415 spp+=sequenceHop;
mas01mc@311 416 }
mas01mc@292 417 return collisionCount;
mas01mc@292 418 }
mas01mc@292 419
mas01mc@292 420 /********************* LSH shingle construction ***************************/
mas01mc@292 421
mas01mc@292 422 // Construct shingles out of a feature matrix
mas01mc@292 423 // inputs:
mas01mc@292 424 // idx is vector index in feature matrix
mas01mc@292 425 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01mc@292 426 //
mas01mc@292 427 // pre-conditions:
mas01mc@292 428 // dbH->dim
mas01mc@292 429 // sequenceLength
mas01mc@292 430 // idx < numVectors - sequenceLength + 1
mas01mc@292 431 //
mas01mc@292 432 // post-conditions:
mas01mc@292 433 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01mc@292 434
mas01mc@292 435 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01mc@292 436 assert(idx<(*vv).size());
mas01mc@292 437 vector<float>::iterator ve = (*vv)[idx].end();
mas01mc@292 438 vi=(*vv)[idx].begin(); // shingle iterator
mas01mc@292 439 // First feature vector in shingle
mas01mc@292 440 if(idx==0){
mas01mc@292 441 while(vi!=ve)
mas01mc@292 442 *vi++ = (float)(*fvp++);
mas01mc@292 443 }
mas01mc@292 444 // Not first feature vector in shingle
mas01mc@292 445 else{
mas01mc@292 446 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator
mas01mc@292 447 // Previous seqLen-1 dim-vectors
mas01mc@292 448 while(vi!=ve-dim)
mas01mc@292 449 *vi++=*ui++;
mas01mc@292 450 // Move data pointer to next feature vector
mas01mc@292 451 fvp += ( seqLen + idx - 1 ) * dim ;
mas01mc@292 452 // New d-vector
mas01mc@292 453 while(vi!=ve)
mas01mc@292 454 *vi++ = (float)(*fvp++);
mas01mc@292 455 }
mas01mc@292 456 }
mas01mc@292 457
mas01mc@292 458 // norm shingles
mas01mc@292 459 // in-place norming, no deletions
mas01mc@292 460 // If using power, return number of shingles above power threshold
mas01mc@292 461 int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){
mas01mc@292 462 int z = 0; // number of above-threshold shingles
mas01mc@292 463 float l2norm;
mas01mc@292 464 double power;
mas01mc@292 465 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01mc@292 466 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01mc@292 467 if(!spp)
mas01mc@292 468 error("LSH indexing and query requires a power feature using -w or -W");
mas01mc@292 469 Uns32T shingleSize = sequenceLength*dbH->dim;
mas01mc@292 470 for(Uns32T a=0; a<(*vv).size(); a++){
mas01mc@292 471 l2norm = (float)(*snp++);
mas01mc@292 472 if(audioDB::normalizedDistance)
mas01mc@292 473 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01mc@292 474
mas01mc@292 475 for(Uns32T b=0; b < shingleSize ; b++)
mas01mc@292 476 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01mc@292 477
mas01mc@292 478 power = *spp++;
mas01mc@292 479 if(use_absolute_threshold){
mas01mc@292 480 if ( power >= absolute_threshold )
mas01mc@292 481 z++;
mas01mc@292 482 }
mas01mc@292 483 else
mas01mc@292 484 z++;
mas01mc@292 485 }
mas01mc@292 486 return z;
mas01mc@292 487 }
mas01mc@292 488
mas01mc@292 489
mas01mc@292 490 /*********************** LSH retrieval ****************************/
mas01mc@292 491
mas01mc@292 492
mas01mc@292 493 // return true if indexed query performed else return false
mas01mc@292 494 int audioDB::index_init_query(const char* dbName){
mas01mc@292 495
mas01mc@292 496 if(!(index_exists(dbName, radius, sequenceLength)))
mas01mc@292 497 return false;
mas01mc@292 498
mas01mc@292 499 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 500
mas01mc@292 501 // Test to see if file exists
mas01mc@292 502 if((lshfid = open (indexName, O_RDONLY)) < 0){
mas01mc@292 503 delete[] indexName;
mas01mc@292 504 return false;
mas01mc@292 505 }
mas01mc@292 506
mas01mc@308 507 lsh = index_allocate(indexName, false); // Get the header only here
mas01mc@292 508 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
mas01mc@292 509
mas01mc@311 510 if(lsh!=SERVER_LSH_INDEX_SINGLETON){
mas01mc@308 511 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
mas01mc@308 512 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
mas01mc@327 513 VERB_LOG(1,"INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 514 VERB_LOG(1,"INDEX: R %f\n", lsh->get_radius());
mas01mc@327 515 VERB_LOG(1,"INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 516 VERB_LOG(1,"INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
mas01mc@327 517 VERB_LOG(1,"INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
mas01mc@327 518 VERB_LOG(1,"INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
mas01mc@327 519 VERB_LOG(1,"INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
mas01mc@327 520 VERB_LOG(1,"INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
mas01mc@327 521 VERB_LOG(1,"INDEX: Opened LSH index file %s\n", indexName);
mas01mc@308 522 }
mas01mc@292 523
mas01mc@292 524 // Check to see if we are loading hash tables into core, and do so if true
mas01mc@292 525 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
mas01mc@308 526 if(SERVER_LSH_INDEX_SINGLETON)
mas01mc@308 527 fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName());
mas01mc@308 528 else
mas01mc@327 529 VERB_LOG(1,"INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
mas01mc@308 530 lsh = index_allocate(indexName, true);
mas01mc@292 531 }
mas01mc@292 532
mas01mc@292 533 delete[] indexName;
mas01mc@292 534 return true;
mas01mc@292 535 }
mas01mc@292 536
mas01mc@292 537 // *Static* approximate NN point reporter callback method for lshlib
mas01mc@292 538 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 539 assert(instancePtr); // We need an instance for this callback
mas01mc@292 540 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@324 541 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
mas01mc@324 542 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
mas01mc@292 543 // Skip identity in query_from_key
mas01mc@292 544 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 545 myself->reporter->add_point(trackID, qpos, spos, dist);
mas01mc@292 546 }
mas01mc@292 547
mas01mc@292 548 // *Static* exact NN point reporter callback method for lshlib
mas01mc@292 549 // Maintain a queue of points to pass to query_points() for exact evaluation
mas01mc@292 550 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 551 assert(instancePtr); // We need an instance for this callback
mas01mc@292 552 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@324 553 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
mas01mc@324 554 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
mas01mc@292 555 // Skip identity in query_from_key
mas01mc@292 556 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 557 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
mas01mc@292 558 }
mas01mc@292 559
mas01mc@292 560 void audioDB::initialize_exact_evalutation_queue(){
mas01mc@292 561 if(exact_evaluation_queue)
mas01mc@292 562 delete exact_evaluation_queue;
mas01mc@292 563 exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >;
mas01mc@292 564 }
mas01mc@292 565
mas01mc@292 566 void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){
mas01mc@292 567 PointPair p(trackID, qpos, spos);
mas01mc@292 568 exact_evaluation_queue->push(p);
mas01mc@292 569 }
mas01mc@292 570
mas01mc@292 571 // return 0: if index does not exist
mas01mc@292 572 // return nqv: if index exists
mas01mc@292 573 int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) {
mas01mc@292 574
mas01mc@324 575 unsigned int numVectors = 0;
mas01mc@324 576 double *query = 0, *query_data = 0;
mas01mc@324 577 double *qNorm = 0, *qnPtr = 0, *qPower = 0, *qpPtr = 0;
mas01mc@324 578 double meanQdur = 0;
mas01mc@292 579 void (*add_point_func)(void*,Uns32T,Uns32T,float);
mas01mc@292 580
mas01mc@292 581 // Set the point-reporter callback based on the value of lsh_exact
mas01mc@292 582 if(lsh_exact){
mas01mc@292 583 initialize_exact_evalutation_queue();
mas01mc@292 584 add_point_func = &index_add_point_exact;
mas01mc@292 585 }
mas01mc@292 586 else
mas01mc@292 587 add_point_func = &index_add_point_approximate;
mas01mc@292 588
mas01mc@292 589 if(!index_init_query(dbName)) // sets-up LSH index structures for querying
mas01mc@292 590 return 0;
mas01mc@292 591
mas01mc@292 592 char* database = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 593
mas01mc@292 594 if(query_from_key)
mas01mc@292 595 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
mas01mc@292 596 else
mas01mc@292 597 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors
mas01mc@292 598
mas01mc@292 599 VERB_LOG(1, "retrieving tracks...");
mas01mc@292 600
mas01mc@292 601 assert(pointNN>0 && pointNN<=O2_MAXNN);
mas01mc@292 602 assert(trackNN>0 && trackNN<=O2_MAXNN);
mas01mc@292 603
mas01mc@292 604 gettimeofday(&tv1, NULL);
mas01mc@292 605 // query vector index
mas01mc@292 606 Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1;
mas01mc@292 607 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
mas01mc@327 608 VERB_LOG(1, "Nq=%d", Nq);
mas01mc@292 609 // Construct shingles from query features
mas01mc@292 610 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
mas01mc@292 611 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
mas01mc@292 612
mas01mc@292 613 // Normalize query vectors
mas01mc@292 614 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr );
mas01mc@327 615 VERB_LOG(1, "Nq'=%d\n", numVecsAboveThreshold);
mas01mc@292 616
mas01mc@292 617 // Nq contains number of inspected points in query file,
mas01mc@292 618 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01mc@292 619 double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation
mas01mc@292 620 if(usingQueryPoint && numVecsAboveThreshold){
mas01mc@292 621 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
mas01mc@292 622 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 623 else
mas01mc@292 624 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 625 }
mas01mc@292 626 else if(numVecsAboveThreshold)
mas01mc@292 627 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
mas01cr@370 628 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@370 629 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01mc@292 630 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this);
mas01cr@370 631 } else {
mas01mc@292 632 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this);
mas01cr@370 633 }
mas01cr@370 634 }
mas01mc@292 635
mas01mc@292 636 if(lsh_exact)
mas01mc@292 637 // Perform exact distance computation on point pairs in exact_evaluation_queue
mas01mc@292 638 query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors);
mas01mc@292 639
mas01mc@292 640 gettimeofday(&tv2,NULL);
mas01mc@292 641 VERB_LOG(1,"elapsed time: %ld msec\n",
mas01mc@292 642 (tv2.tv_sec*1000 + tv2.tv_usec/1000) -
mas01mc@292 643 (tv1.tv_sec*1000 + tv1.tv_usec/1000))
mas01mc@292 644
mas01mc@292 645 // Close the index file
mas01mc@292 646 close(lshfid);
mas01mc@292 647
mas01mc@292 648 // Clean up
mas01mc@292 649 if(query_data)
mas01mc@292 650 delete[] query_data;
mas01mc@292 651 if(qNorm)
mas01mc@292 652 delete[] qNorm;
mas01mc@292 653 if(qPower)
mas01mc@292 654 delete[] qPower;
mas01mc@292 655 if(database)
mas01mc@292 656 delete[] database;
mas01mc@292 657
mas01mc@292 658 return Nq;
mas01mc@292 659 }
mas01mc@292 660