annotate index.cpp @ 405:ef4792df8f93 api-inversion

invert audioDB::insert / audiodb_insert(). Start off by removing audioDB::insertDatum, and essentially reusing it as audiodb_insert. We now ignore the fact that the command-line parsing code has "helpfully" opened a std::ifstream for the times file and an fd for the power file, and simply go ahead and do our own dirty work. We can delete audioDB::insertDatum entirely, but unfortunately we can't delete audioDB::insertPowerData and audioDB::insertTimestamps, because the index and query code respectively use them. Instead, move the two methods closer to their single uses. audiodb_insert() is perhaps not as short and simple as it might have been hoped given the existence of audiodb_insert_datum(); some of that is C and its terribly way of making you pay every time you use dynamic memory; some of it is the fact that the three different files (feature, times, power) each requires slightly different treatment. Hey ho. We can implement audiodb_batchinsert() in terms of audiodb_insert(); the function is pleasingly small. We can't quite use it for audioDB::batchinsert yet, as we have to deal with the O2_FLAG_LARGE_ADB case (which codepath is untested in libtests/). This means that we can delete whole swathes of hideous code from audioDB.cpp, including not just the versions of audiodb_insert() and audiodb_batchinsert() but also an entire audioDB constructor. Yay. (audioDB::unitNormAndInsertL2 has also died a deserved death).
author mas01cr
date Fri, 05 Dec 2008 22:32:49 +0000
parents 2d5c3f8e8c22
children 9b13ea44e302
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01mc@292 15 #include "ReporterBase.h"
mas01mc@292 16
mas01mc@292 17
mas01mc@292 18 /************************* LSH point index to audioDB conversion *****************/
mas01mc@324 19 Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){
mas01mc@324 20 assert(nPntBits);
mas01mc@324 21 return lshID>>nPntBits;
mas01mc@292 22 }
mas01mc@292 23
mas01mc@324 24 Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){
mas01mc@324 25 assert(nPntBits);
mas01mc@324 26 return lshID&((1<<nPntBits)-1);
mas01mc@292 27 }
mas01mc@292 28
mas01mc@324 29 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){
mas01mc@324 30 assert(nPntBits);
mas01mc@324 31 return (trackID << nPntBits) | spos;
mas01mc@292 32 }
mas01mc@292 33
mas01mc@292 34 /************************* LSH indexing and query initialization *****************/
mas01mc@292 35
mas01mc@292 36 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
mas01mc@292 37 char* indexName = new char[MAXSTR];
mas01mc@292 38 // Attempt to make new file
mas01mc@292 39 if(strlen(dbName) > (MAXSTR - 32))
mas01mc@292 40 error("dbName is too long for LSH index filename appendages");
mas01mc@292 41 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 42 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 43 return indexName;
mas01mc@292 44 }
mas01mc@292 45
mas01mc@292 46 // return true if index exists else return false
mas01mc@292 47 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
mas01mc@292 48 // Test to see if file exists
mas01mc@292 49 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 50 lshfid = open (indexName, O_RDONLY);
mas01mc@292 51 delete[] indexName;
mas01mc@292 52 close(lshfid);
mas01mc@292 53
mas01mc@292 54 if(lshfid<0)
mas01mc@292 55 return false;
mas01mc@292 56 else
mas01mc@292 57 return true;
mas01mc@292 58 }
mas01mc@292 59
mas01mc@324 60 // If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName())
mas01mc@324 61 // If they match, i.e. path+dbName_resident == path+dbName_requested, use
mas01mc@324 62 // the memory-resident index.
mas01mc@324 63 // Else allocate a new LSH instance and load the index from disk
mas01mc@308 64 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
mas01mc@308 65 LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
mas01mc@308 66 if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
mas01mc@308 67 audioDB::lsh = gIndx; // Use the global SERVER resident index
mas01mc@308 68 else{
mas01mc@308 69 if(audioDB::lsh)
mas01mc@308 70 delete audioDB::lsh;
mas01mc@308 71 audioDB::lsh = new LSH(indexName, load_hashTables);
mas01mc@308 72 }
mas01mc@308 73 assert(audioDB::lsh);
mas01mc@308 74 return audioDB::lsh;
mas01mc@308 75 }
mas01mc@308 76
mas01mc@292 77 vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){
mas01mc@292 78 if(vv)
mas01mc@292 79 delete vv;
mas01mc@292 80 vv = new vector<vector<float> >(sz);
mas01mc@292 81 for(Uns32T i=0 ; i < sz ; i++)
mas01mc@292 82 (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage
mas01mc@292 83 return vv;
mas01mc@292 84 }
mas01mc@292 85
mas01mc@292 86 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 87
mas01mc@292 88 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 89 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 90 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 91 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 92 }
mas01mc@324 93
mas01mc@325 94 double *snpp = 0, *sppp = 0;
mas01mc@324 95
mas01mc@292 96 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 97 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 98 snpp = *snp;
mas01mc@292 99 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 100 sppp = *spp;
mas01mc@325 101
mas01mc@324 102 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 103 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 104
mas01mc@324 105
mas01mc@292 106 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 107 if(trackTable[i] >= sequenceLength) {
mas01mc@292 108 sequence_sum(snpp, trackTable[i], sequenceLength);
mas01mc@292 109 sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 110
mas01mc@292 111 sequence_sum(sppp, trackTable[i], sequenceLength);
mas01mc@292 112 sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 113 }
mas01mc@292 114 snpp += trackTable[i];
mas01mc@292 115 sppp += trackTable[i];
mas01mc@292 116 }
mas01mc@324 117
mas01mc@292 118 *vsnp = *snp;
mas01mc@292 119 *vspp = *spp;
mas01mc@324 120
mas01mc@292 121 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 122 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 123 }
mas01mc@292 124
mas01mc@292 125
mas01mc@292 126 /************************ LSH indexing ***********************************/
mas01mc@292 127 void audioDB::index_index_db(const char* dbName){
mas01mc@292 128 char* newIndexName;
mas01mc@292 129 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 130 Uns32T dbVectors = 0;
mas01mc@292 131
mas01mc@324 132
mas01mc@292 133 printf("INDEX: initializing header\n");
mas01mc@292 134 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 135 forWrite = false;
mas01mc@292 136 initDBHeader(dbName);
mas01mc@292 137
mas01mc@324 138 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 139 usingPower = true;
mas01mc@324 140
mas01mc@324 141 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 142 usingTimes = true;
mas01mc@324 143
mas01mc@292 144 newIndexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 145
mas01mc@292 146 // Set unit norming flag override
mas01mc@292 147 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 148
mas01mc@327 149 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 150 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 151 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 152 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 153 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 154 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 155 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 156 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 157 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 158 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 159
mas01mc@292 160 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 161 printf("INDEX: constructing new LSH index\n");
mas01mc@292 162 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 163 fflush(stdout);
mas01mc@292 164 // Construct new LSH index
mas01mc@292 165 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 166 lsh_param_m,
mas01mc@292 167 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 168 lsh_param_N,
mas01mc@292 169 lsh_param_ncols,
mas01mc@292 170 (float)radius);
mas01mc@292 171 assert(lsh);
mas01mc@292 172
mas01mc@292 173 Uns32T endTrack = lsh_param_b;
mas01mc@292 174 if( endTrack > dbH->numFiles)
mas01mc@292 175 endTrack = dbH->numFiles;
mas01mc@292 176 // Insert up to lsh_param_b tracks
mas01mc@324 177 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 178 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 179 }
mas01mc@324 180 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 181 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 182
mas01mc@292 183 // Clean up
mas01mc@292 184 delete lsh;
mas01mc@308 185 lsh = 0;
mas01mc@292 186 close(lshfid);
mas01mc@292 187 }
mas01mc@292 188
mas01mc@292 189 // Attempt to open LSH file
mas01mc@292 190 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 191 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 192 fflush(stdout);
mas01mc@340 193 char* mergeIndexName = newIndexName;
mas01mc@292 194
mas01mc@292 195 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 196 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 197 assert(lsh);
mas01mc@324 198 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
mas01mc@292 199 delete lsh;
mas01mc@308 200 lsh = 0;
mas01mc@292 201
mas01mc@340 202 // Insert up to lsh_param_b tracks
mas01mc@340 203 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 204 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 205 }
mas01mc@292 206 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 207 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 208
mas01mc@292 209 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 210 if( endTrack > dbH->numFiles)
mas01mc@292 211 endTrack = dbH->numFiles;
mas01mc@292 212 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 213 fflush(stdout);
mas01mc@340 214 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 215 assert(lsh);
mas01mc@292 216
mas01mc@292 217 // Insert up to lsh_param_b database tracks
mas01mc@292 218 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 219
mas01mc@340 220 // Serialize to file (merging is performed here)
mas01mc@340 221 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 222 delete lsh;
mas01mc@308 223 lsh = 0;
mas01mc@340 224 }
mas01mc@292 225
mas01mc@292 226 close(lshfid);
mas01mc@292 227 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 228 fflush(stdout);
mas01mc@292 229
mas01mc@292 230 }
mas01mc@292 231 else{
mas01mc@292 232 error("Something's wrong with LSH index file");
mas01mc@292 233 exit(1);
mas01mc@292 234 }
mas01mc@292 235
mas01mc@324 236 delete[] newIndexName;
mas01mc@324 237 delete[] sNorm;
mas01mc@324 238 delete[] sPower;
mas01mc@324 239 }
mas01mc@292 240
mas01mc@292 241
mas01cr@405 242 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@405 243 if(usingPower){
mas01cr@405 244 int one;
mas01cr@405 245 unsigned int count;
mas01cr@405 246
mas01cr@405 247 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@405 248 if (count != sizeof(unsigned int)) {
mas01cr@405 249 error("powerfd read failed", "int", "read");
mas01cr@405 250 }
mas01cr@405 251 if (one != 1) {
mas01cr@405 252 error("dimensionality of power file not 1", powerFileName);
mas01cr@405 253 }
mas01cr@405 254
mas01cr@405 255 // FIXME: should check that the powerfile is the right size for
mas01cr@405 256 // this. -- CSR, 2007-10-30
mas01cr@405 257 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@405 258 if (count != numVectors * sizeof(double)) {
mas01cr@405 259 error("powerfd read failed", "double", "read");
mas01cr@405 260 }
mas01cr@405 261 }
mas01cr@405 262 }
mas01cr@405 263
mas01mc@324 264 // initialize auxillary track data from filesystem
mas01mc@324 265 // pre-conditions:
mas01mc@324 266 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 267 // feature data allocated and copied (fvp)
mas01mc@324 268 //
mas01mc@324 269 // post-conditions:
mas01mc@324 270 // allocated power data
mas01mc@324 271 // allocated l2norm data
mas01mc@324 272 //
mas01mc@324 273 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 274 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 275 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 276
mas01mc@324 277 // Allocate and read the power sequence
mas01mc@324 278 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 279
mas01mc@324 280 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 281 char* tmpStr = prefixedString;
mas01mc@324 282 // Open and check dimensions of power file
mas01mc@324 283 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 284 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 285 if(prefixedString!=tmpStr)
mas01mc@324 286 delete[] tmpStr;
mas01mc@324 287 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 288 if (powerfd < 0) {
mas01mc@324 289 error("failed to open power file", prefixedString);
mas01mc@324 290 }
mas01mc@324 291 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 292 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 293 }
mas01mc@324 294
mas01mc@324 295 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 296 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 297
mas01mc@324 298 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 299 assert(*sPowerp);
mas01mc@324 300 *spPtrp = *sPowerp;
mas01mc@324 301 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 302 if (0 < powerfd) {
mas01mc@324 303 close(powerfd);
mas01mc@324 304 }
mas01mc@324 305
mas01mc@324 306 sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 307 sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 308 powerTable = 0;
mas01mc@324 309
mas01mc@324 310 // Allocate and calculate the l2norm sequence
mas01mc@324 311 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 312 assert(*sNormpp);
mas01mc@324 313 *snPtrp = *sNormpp;
mas01mc@324 314 unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01mc@324 315 sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 316 sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 317 }
mas01mc@292 318 }
mas01mc@292 319
mas01mc@292 320 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 321 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 322 double** sPowerp, double** spPtrp){
mas01mc@292 323 size_t nfv = 0;
mas01mc@292 324 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 325 Uns32T trackID = 0;
mas01mc@292 326
mas01mc@292 327 VERB_LOG(1, "indexing tracks...");
mas01mc@292 328
mas01mc@324 329 int trackfd = dbfid;
mas01mc@292 330 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 331 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 332 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 333 char* tmpStr = prefixedString;
mas01mc@324 334 // Open and check dimensions of feature file
mas01mc@324 335 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 336 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 337 if(prefixedString!=tmpStr)
mas01mc@324 338 delete[] tmpStr;
mas01mc@324 339 initInputFile(prefixedString, false); // nommap, file pointer at correct position
mas01mc@324 340 trackfd = infid;
mas01mc@324 341 }
mas01mc@324 342 read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv
mas01mc@292 343 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 344
mas01mc@324 345 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 346 // Load power and calculate power and l2norm sequence sums
mas01mc@324 347 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 348
mas01mc@292 349 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 350 break;
mas01mc@324 351 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 352 close(infid);
mas01mc@324 353 delete[] *sNormpp;
mas01mc@324 354 delete[] *sPowerp;
mas01mc@324 355 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 356 }
mas01mc@324 357 } // end for(trackID = start_track ; ... )
mas01mc@292 358 std::cout << "finished inserting." << endl;
mas01mc@292 359 }
mas01mc@292 360
mas01mc@292 361 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 362 // Loop over the current input track's vectors
mas01cr@305 363 Uns32T numVecs = 0;
mas01cr@305 364 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 365 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 366 numVecs = 0;
mas01cr@305 367 } else {
mas01cr@305 368 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 369 }
mas01cr@305 370 } else {
mas01cr@305 371 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 372 numVecs = 0;
mas01cr@305 373 } else {
mas01cr@305 374 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 375 }
mas01cr@305 376 }
mas01mc@292 377
mas01mc@324 378 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 379 if(numVecs){
mas01mc@324 380 vv = index_initialize_shingles(numVecs);
mas01mc@324 381
mas01mc@324 382 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01mc@324 383 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01mc@324 384
mas01mc@324 385 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
mas01mc@324 386 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01mc@324 387 }
mas01mc@292 388 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 389
mas01mc@292 390 /* index_norm_shingles() only goes as far as the end of the
mas01mc@292 391 sequence, which is right, but the space allocated is for the
mas01mc@292 392 whole track. */
mas01mc@292 393
mas01mc@292 394 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 395 * So let's be certain the pointers are in the correct place
mas01mc@292 396 */
mas01mc@292 397
mas01mc@324 398 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 399 *snpp += trackTable[trackID];
mas01mc@324 400 *sppp += trackTable[trackID];
mas01mc@324 401 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 402 }
mas01mc@292 403
mas01mc@292 404 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 405 std::cout.flush();
mas01mc@292 406 return true;
mas01mc@292 407 }
mas01mc@292 408
mas01mc@292 409 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 410 Uns32T collisionCount = 0;
mas01mc@292 411 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 412 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 413 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01mc@324 414 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
mas01mc@324 415 spp+=sequenceHop;
mas01mc@311 416 }
mas01mc@292 417 return collisionCount;
mas01mc@292 418 }
mas01mc@292 419
mas01mc@292 420 /********************* LSH shingle construction ***************************/
mas01mc@292 421
mas01mc@292 422 // Construct shingles out of a feature matrix
mas01mc@292 423 // inputs:
mas01mc@292 424 // idx is vector index in feature matrix
mas01mc@292 425 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01mc@292 426 //
mas01mc@292 427 // pre-conditions:
mas01mc@292 428 // dbH->dim
mas01mc@292 429 // sequenceLength
mas01mc@292 430 // idx < numVectors - sequenceLength + 1
mas01mc@292 431 //
mas01mc@292 432 // post-conditions:
mas01mc@292 433 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01mc@292 434
mas01mc@292 435 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01mc@292 436 assert(idx<(*vv).size());
mas01mc@292 437 vector<float>::iterator ve = (*vv)[idx].end();
mas01mc@292 438 vi=(*vv)[idx].begin(); // shingle iterator
mas01mc@292 439 // First feature vector in shingle
mas01mc@292 440 if(idx==0){
mas01mc@292 441 while(vi!=ve)
mas01mc@292 442 *vi++ = (float)(*fvp++);
mas01mc@292 443 }
mas01mc@292 444 // Not first feature vector in shingle
mas01mc@292 445 else{
mas01mc@292 446 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator
mas01mc@292 447 // Previous seqLen-1 dim-vectors
mas01mc@292 448 while(vi!=ve-dim)
mas01mc@292 449 *vi++=*ui++;
mas01mc@292 450 // Move data pointer to next feature vector
mas01mc@292 451 fvp += ( seqLen + idx - 1 ) * dim ;
mas01mc@292 452 // New d-vector
mas01mc@292 453 while(vi!=ve)
mas01mc@292 454 *vi++ = (float)(*fvp++);
mas01mc@292 455 }
mas01mc@292 456 }
mas01mc@292 457
mas01mc@292 458 // norm shingles
mas01mc@292 459 // in-place norming, no deletions
mas01mc@292 460 // If using power, return number of shingles above power threshold
mas01mc@292 461 int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){
mas01mc@292 462 int z = 0; // number of above-threshold shingles
mas01mc@292 463 float l2norm;
mas01mc@292 464 double power;
mas01mc@292 465 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01mc@292 466 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01mc@292 467 if(!spp)
mas01mc@292 468 error("LSH indexing and query requires a power feature using -w or -W");
mas01mc@292 469 Uns32T shingleSize = sequenceLength*dbH->dim;
mas01mc@292 470 for(Uns32T a=0; a<(*vv).size(); a++){
mas01mc@292 471 l2norm = (float)(*snp++);
mas01mc@292 472 if(audioDB::normalizedDistance)
mas01mc@292 473 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01mc@292 474
mas01mc@292 475 for(Uns32T b=0; b < shingleSize ; b++)
mas01mc@292 476 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01mc@292 477
mas01mc@292 478 power = *spp++;
mas01mc@292 479 if(use_absolute_threshold){
mas01mc@292 480 if ( power >= absolute_threshold )
mas01mc@292 481 z++;
mas01mc@292 482 }
mas01mc@292 483 else
mas01mc@292 484 z++;
mas01mc@292 485 }
mas01mc@292 486 return z;
mas01mc@292 487 }
mas01mc@292 488
mas01mc@292 489
mas01mc@292 490 /*********************** LSH retrieval ****************************/
mas01mc@292 491
mas01mc@292 492
mas01mc@292 493 // return true if indexed query performed else return false
mas01mc@292 494 int audioDB::index_init_query(const char* dbName){
mas01mc@292 495
mas01mc@292 496 if(!(index_exists(dbName, radius, sequenceLength)))
mas01mc@292 497 return false;
mas01mc@292 498
mas01mc@292 499 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 500
mas01mc@292 501 // Test to see if file exists
mas01mc@292 502 if((lshfid = open (indexName, O_RDONLY)) < 0){
mas01mc@292 503 delete[] indexName;
mas01mc@292 504 return false;
mas01mc@292 505 }
mas01mc@292 506
mas01mc@308 507 lsh = index_allocate(indexName, false); // Get the header only here
mas01mc@292 508 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
mas01mc@292 509
mas01mc@311 510 if(lsh!=SERVER_LSH_INDEX_SINGLETON){
mas01mc@308 511 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
mas01mc@308 512 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
mas01mc@327 513 VERB_LOG(1,"INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 514 VERB_LOG(1,"INDEX: R %f\n", lsh->get_radius());
mas01mc@327 515 VERB_LOG(1,"INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 516 VERB_LOG(1,"INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
mas01mc@327 517 VERB_LOG(1,"INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
mas01mc@327 518 VERB_LOG(1,"INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
mas01mc@327 519 VERB_LOG(1,"INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
mas01mc@327 520 VERB_LOG(1,"INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
mas01mc@327 521 VERB_LOG(1,"INDEX: Opened LSH index file %s\n", indexName);
mas01mc@308 522 }
mas01mc@292 523
mas01mc@292 524 // Check to see if we are loading hash tables into core, and do so if true
mas01mc@292 525 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
mas01mc@308 526 if(SERVER_LSH_INDEX_SINGLETON)
mas01mc@308 527 fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName());
mas01mc@308 528 else
mas01mc@327 529 VERB_LOG(1,"INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
mas01mc@308 530 lsh = index_allocate(indexName, true);
mas01mc@292 531 }
mas01mc@292 532
mas01mc@292 533 delete[] indexName;
mas01mc@292 534 return true;
mas01mc@292 535 }
mas01mc@292 536
mas01mc@292 537 // *Static* approximate NN point reporter callback method for lshlib
mas01mc@292 538 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 539 assert(instancePtr); // We need an instance for this callback
mas01mc@292 540 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@324 541 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
mas01mc@324 542 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
mas01mc@292 543 // Skip identity in query_from_key
mas01mc@292 544 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 545 myself->reporter->add_point(trackID, qpos, spos, dist);
mas01mc@292 546 }
mas01mc@292 547
mas01mc@292 548 // *Static* exact NN point reporter callback method for lshlib
mas01mc@292 549 // Maintain a queue of points to pass to query_points() for exact evaluation
mas01mc@292 550 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
mas01mc@292 551 assert(instancePtr); // We need an instance for this callback
mas01mc@292 552 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
mas01mc@324 553 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
mas01mc@324 554 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
mas01mc@292 555 // Skip identity in query_from_key
mas01mc@292 556 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
mas01mc@292 557 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
mas01mc@292 558 }
mas01mc@292 559
mas01mc@292 560 void audioDB::initialize_exact_evalutation_queue(){
mas01mc@292 561 if(exact_evaluation_queue)
mas01mc@292 562 delete exact_evaluation_queue;
mas01mc@292 563 exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >;
mas01mc@292 564 }
mas01mc@292 565
mas01mc@292 566 void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){
mas01mc@292 567 PointPair p(trackID, qpos, spos);
mas01mc@292 568 exact_evaluation_queue->push(p);
mas01mc@292 569 }
mas01mc@292 570
mas01mc@292 571 // return 0: if index does not exist
mas01mc@292 572 // return nqv: if index exists
mas01mc@292 573 int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) {
mas01mc@292 574
mas01mc@324 575 unsigned int numVectors = 0;
mas01mc@324 576 double *query = 0, *query_data = 0;
mas01mc@324 577 double *qNorm = 0, *qnPtr = 0, *qPower = 0, *qpPtr = 0;
mas01mc@324 578 double meanQdur = 0;
mas01mc@292 579 void (*add_point_func)(void*,Uns32T,Uns32T,float);
mas01mc@292 580
mas01mc@292 581 // Set the point-reporter callback based on the value of lsh_exact
mas01mc@292 582 if(lsh_exact){
mas01mc@292 583 initialize_exact_evalutation_queue();
mas01mc@292 584 add_point_func = &index_add_point_exact;
mas01mc@292 585 }
mas01mc@292 586 else
mas01mc@292 587 add_point_func = &index_add_point_approximate;
mas01mc@292 588
mas01mc@292 589 if(!index_init_query(dbName)) // sets-up LSH index structures for querying
mas01mc@292 590 return 0;
mas01mc@292 591
mas01mc@292 592 char* database = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 593
mas01mc@292 594 if(query_from_key)
mas01mc@292 595 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
mas01mc@292 596 else
mas01mc@292 597 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors
mas01mc@292 598
mas01mc@292 599 VERB_LOG(1, "retrieving tracks...");
mas01mc@292 600
mas01mc@292 601 assert(pointNN>0 && pointNN<=O2_MAXNN);
mas01mc@292 602 assert(trackNN>0 && trackNN<=O2_MAXNN);
mas01mc@292 603
mas01mc@292 604 gettimeofday(&tv1, NULL);
mas01mc@292 605 // query vector index
mas01mc@292 606 Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1;
mas01mc@292 607 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
mas01mc@327 608 VERB_LOG(1, "Nq=%d", Nq);
mas01mc@292 609 // Construct shingles from query features
mas01mc@292 610 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
mas01mc@292 611 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
mas01mc@292 612
mas01mc@292 613 // Normalize query vectors
mas01mc@292 614 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr );
mas01mc@327 615 VERB_LOG(1, "Nq'=%d\n", numVecsAboveThreshold);
mas01mc@292 616
mas01mc@292 617 // Nq contains number of inspected points in query file,
mas01mc@292 618 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01mc@292 619 double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation
mas01mc@292 620 if(usingQueryPoint && numVecsAboveThreshold){
mas01mc@292 621 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
mas01mc@292 622 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 623 else
mas01mc@292 624 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this);
mas01mc@292 625 }
mas01mc@292 626 else if(numVecsAboveThreshold)
mas01mc@292 627 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
mas01cr@370 628 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@370 629 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01mc@292 630 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this);
mas01cr@370 631 } else {
mas01mc@292 632 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this);
mas01cr@370 633 }
mas01cr@370 634 }
mas01mc@292 635
mas01mc@292 636 if(lsh_exact)
mas01mc@292 637 // Perform exact distance computation on point pairs in exact_evaluation_queue
mas01mc@292 638 query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors);
mas01mc@292 639
mas01mc@292 640 gettimeofday(&tv2,NULL);
mas01mc@292 641 VERB_LOG(1,"elapsed time: %ld msec\n",
mas01mc@292 642 (tv2.tv_sec*1000 + tv2.tv_usec/1000) -
mas01mc@292 643 (tv1.tv_sec*1000 + tv1.tv_usec/1000))
mas01mc@292 644
mas01mc@292 645 // Close the index file
mas01mc@292 646 close(lshfid);
mas01mc@292 647
mas01mc@292 648 // Clean up
mas01mc@292 649 if(query_data)
mas01mc@292 650 delete[] query_data;
mas01mc@292 651 if(qNorm)
mas01mc@292 652 delete[] qNorm;
mas01mc@292 653 if(qPower)
mas01mc@292 654 delete[] qPower;
mas01mc@292 655 if(database)
mas01mc@292 656 delete[] database;
mas01mc@292 657
mas01mc@292 658 return Nq;
mas01mc@292 659 }
mas01mc@292 660