annotate index.cpp @ 459:fcc6f7c4856b api-inversion

No more global shingle vector of vectors. Convert audioDB::index_initialize_shingles and audioDB::index_norm_shingles to plain old functions. In doing so, the latter in particular acquires a silly argument list; we need that complexity for now because it's called both from audioDB::query (which we're currently inverting) and from audioDB::index (which is out of scope for now). The loss of the global vv thing made me check up on memory discipline [hence the new API function audiodb_query_free_results() as well as the internal audiodb_index_delete_shingles()]. It's not too bad, but there are plenty of leaks for those with time to do AUDIODB="valgrind --leak-check=full ../../audioDB" sh ./run-test.sh on their favourite test case. For example, the Radius reporters leak one triple per hit. (Honestly, C++ memory management is teh suck.)
author mas01cr
date Sun, 28 Dec 2008 22:43:50 +0000
parents 913a95f06998
children 17003dff8127
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01cr@426 15 #include "audioDB-internals.h"
mas01mc@292 16
mas01cr@458 17 typedef struct adb_qcallback {
mas01cr@458 18 adb_t *adb;
mas01cr@458 19 adb_qstate_internal_t *qstate;
mas01cr@458 20 } adb_qcallback_t;
mas01mc@292 21
mas01mc@292 22 /************************* LSH indexing and query initialization *****************/
mas01mc@292 23
mas01mc@292 24 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
mas01mc@292 25 char* indexName = new char[MAXSTR];
mas01mc@292 26 // Attempt to make new file
mas01mc@292 27 if(strlen(dbName) > (MAXSTR - 32))
mas01mc@292 28 error("dbName is too long for LSH index filename appendages");
mas01mc@292 29 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 30 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 31 return indexName;
mas01mc@292 32 }
mas01mc@292 33
mas01mc@292 34 // return true if index exists else return false
mas01mc@292 35 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
mas01mc@292 36 // Test to see if file exists
mas01mc@292 37 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 38 lshfid = open (indexName, O_RDONLY);
mas01mc@292 39 delete[] indexName;
mas01mc@292 40 close(lshfid);
mas01mc@292 41
mas01mc@292 42 if(lshfid<0)
mas01mc@292 43 return false;
mas01mc@292 44 else
mas01mc@292 45 return true;
mas01mc@292 46 }
mas01mc@292 47
mas01mc@324 48 // If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName())
mas01mc@324 49 // If they match, i.e. path+dbName_resident == path+dbName_requested, use
mas01mc@324 50 // the memory-resident index.
mas01mc@324 51 // Else allocate a new LSH instance and load the index from disk
mas01mc@308 52 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
mas01mc@308 53 LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
mas01mc@308 54 if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
mas01mc@308 55 audioDB::lsh = gIndx; // Use the global SERVER resident index
mas01mc@308 56 else{
mas01mc@308 57 if(audioDB::lsh)
mas01mc@308 58 delete audioDB::lsh;
mas01mc@308 59 audioDB::lsh = new LSH(indexName, load_hashTables);
mas01mc@308 60 }
mas01mc@308 61 assert(audioDB::lsh);
mas01mc@308 62 return audioDB::lsh;
mas01mc@308 63 }
mas01mc@308 64
mas01cr@459 65 vector<vector<float> > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) {
mas01cr@459 66 std::vector<std::vector<float> > *vv = new vector<vector<float> >(sz);
mas01cr@459 67 for(Uns32T i=0 ; i < sz ; i++) {
mas01cr@459 68 (*vv)[i]=vector<float>(dim * seqLen);
mas01cr@459 69 }
mas01mc@292 70 return vv;
mas01mc@292 71 }
mas01mc@292 72
mas01cr@459 73 void audiodb_index_delete_shingles(vector<vector<float> > *vv) {
mas01cr@459 74 delete vv;
mas01cr@459 75 }
mas01cr@459 76
mas01mc@292 77 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 78
mas01mc@292 79 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 80 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 81 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 82 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 83 }
mas01mc@324 84
mas01mc@325 85 double *snpp = 0, *sppp = 0;
mas01mc@324 86
mas01mc@292 87 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 88 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 89 snpp = *snp;
mas01mc@292 90 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 91 sppp = *spp;
mas01mc@325 92
mas01mc@324 93 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 94 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 95
mas01mc@324 96
mas01mc@292 97 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 98 if(trackTable[i] >= sequenceLength) {
mas01cr@427 99 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@427 100 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 101
mas01cr@427 102 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@427 103 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 104 }
mas01mc@292 105 snpp += trackTable[i];
mas01mc@292 106 sppp += trackTable[i];
mas01mc@292 107 }
mas01mc@324 108
mas01mc@292 109 *vsnp = *snp;
mas01mc@292 110 *vspp = *spp;
mas01mc@324 111
mas01mc@292 112 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 113 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 114 }
mas01mc@292 115
mas01mc@292 116
mas01cr@456 117 /********************* LSH shingle construction ***************************/
mas01cr@456 118
mas01cr@456 119 // Construct shingles out of a feature matrix
mas01cr@456 120 // inputs:
mas01cr@456 121 // idx is vector index in feature matrix
mas01cr@456 122 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01cr@456 123 //
mas01cr@456 124 // pre-conditions:
mas01cr@456 125 // dbH->dim
mas01cr@456 126 // sequenceLength
mas01cr@456 127 // idx < numVectors - sequenceLength + 1
mas01cr@456 128 //
mas01cr@456 129 // post-conditions:
mas01cr@456 130 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01cr@456 131
mas01cr@456 132 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01cr@456 133 assert(idx<(*vv).size());
mas01cr@456 134 vector<float>::iterator ve = (*vv)[idx].end();
mas01cr@456 135 vector<float>::iterator vi = (*vv)[idx].begin();
mas01cr@456 136 // First feature vector in shingle
mas01cr@456 137 if(idx == 0) {
mas01cr@456 138 while(vi!=ve) {
mas01cr@456 139 *vi++ = (float)(*fvp++);
mas01cr@456 140 }
mas01cr@456 141 } else {
mas01cr@456 142 // Not first feature vector in shingle
mas01cr@456 143 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
mas01cr@456 144 // Previous seqLen-1 dim-vectors
mas01cr@456 145 while(vi!=ve-dim) {
mas01cr@456 146 *vi++ = *ui++;
mas01cr@456 147 }
mas01cr@456 148 // Move data pointer to next feature vector
mas01cr@456 149 fvp += ( seqLen + idx - 1 ) * dim ;
mas01cr@456 150 // New d-vector
mas01cr@456 151 while(vi!=ve) {
mas01cr@456 152 *vi++ = (float)(*fvp++);
mas01cr@456 153 }
mas01cr@456 154 }
mas01cr@456 155 }
mas01cr@456 156
mas01cr@456 157 // norm shingles
mas01cr@456 158 // in-place norming, no deletions
mas01cr@456 159 // If using power, return number of shingles above power threshold
mas01cr@459 160 int audiodb_index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) {
mas01cr@456 161 int z = 0; // number of above-threshold shingles
mas01cr@456 162 float l2norm;
mas01cr@456 163 double power;
mas01cr@456 164 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01cr@456 165 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01cr@459 166 Uns32T shingleSize = seqLen * dim;
mas01cr@459 167
mas01cr@459 168 if(!spp) {
mas01cr@459 169 return -1;
mas01cr@459 170 }
mas01cr@456 171 for(Uns32T a=0; a<(*vv).size(); a++){
mas01cr@456 172 l2norm = (float)(*snp++);
mas01cr@459 173 if(normed_vectors)
mas01cr@456 174 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01cr@456 175
mas01cr@456 176 for(Uns32T b=0; b < shingleSize ; b++)
mas01cr@456 177 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01cr@456 178
mas01cr@456 179 power = *spp++;
mas01cr@459 180 if(use_pthreshold){
mas01cr@459 181 if (power >= pthreshold)
mas01cr@456 182 z++;
mas01cr@456 183 }
mas01cr@456 184 else
mas01cr@456 185 z++;
mas01cr@456 186 }
mas01cr@456 187 return z;
mas01cr@456 188 }
mas01cr@456 189
mas01cr@456 190
mas01mc@292 191 /************************ LSH indexing ***********************************/
mas01mc@292 192 void audioDB::index_index_db(const char* dbName){
mas01mc@292 193 char* newIndexName;
mas01mc@292 194 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 195 Uns32T dbVectors = 0;
mas01mc@292 196
mas01mc@324 197
mas01mc@292 198 printf("INDEX: initializing header\n");
mas01mc@292 199 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 200 forWrite = false;
mas01mc@292 201 initDBHeader(dbName);
mas01mc@292 202
mas01mc@324 203 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 204 usingPower = true;
mas01mc@324 205
mas01mc@324 206 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 207 usingTimes = true;
mas01mc@324 208
mas01mc@292 209 newIndexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 210
mas01mc@292 211 // Set unit norming flag override
mas01mc@292 212 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 213
mas01mc@327 214 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 215 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 216 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 217 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 218 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 219 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 220 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 221 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 222 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 223 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 224
mas01mc@292 225 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 226 printf("INDEX: constructing new LSH index\n");
mas01mc@292 227 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 228 fflush(stdout);
mas01mc@292 229 // Construct new LSH index
mas01mc@292 230 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 231 lsh_param_m,
mas01mc@292 232 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 233 lsh_param_N,
mas01mc@292 234 lsh_param_ncols,
mas01mc@292 235 (float)radius);
mas01mc@292 236 assert(lsh);
mas01mc@292 237
mas01mc@292 238 Uns32T endTrack = lsh_param_b;
mas01mc@292 239 if( endTrack > dbH->numFiles)
mas01mc@292 240 endTrack = dbH->numFiles;
mas01mc@292 241 // Insert up to lsh_param_b tracks
mas01mc@324 242 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 243 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 244 }
mas01mc@324 245 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 246 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 247
mas01mc@292 248 // Clean up
mas01mc@292 249 delete lsh;
mas01mc@308 250 lsh = 0;
mas01mc@292 251 close(lshfid);
mas01mc@292 252 }
mas01mc@292 253
mas01mc@292 254 // Attempt to open LSH file
mas01mc@292 255 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 256 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 257 fflush(stdout);
mas01mc@340 258 char* mergeIndexName = newIndexName;
mas01mc@292 259
mas01mc@292 260 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 261 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 262 assert(lsh);
mas01cr@458 263 Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
mas01mc@292 264 delete lsh;
mas01mc@308 265 lsh = 0;
mas01mc@292 266
mas01mc@340 267 // Insert up to lsh_param_b tracks
mas01mc@340 268 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 269 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 270 }
mas01mc@292 271 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 272 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 273
mas01mc@292 274 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 275 if( endTrack > dbH->numFiles)
mas01mc@292 276 endTrack = dbH->numFiles;
mas01mc@292 277 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 278 fflush(stdout);
mas01mc@340 279 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 280 assert(lsh);
mas01mc@292 281
mas01mc@292 282 // Insert up to lsh_param_b database tracks
mas01mc@292 283 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 284
mas01mc@340 285 // Serialize to file (merging is performed here)
mas01mc@340 286 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 287 delete lsh;
mas01mc@308 288 lsh = 0;
mas01mc@340 289 }
mas01mc@292 290
mas01mc@292 291 close(lshfid);
mas01mc@292 292 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 293 fflush(stdout);
mas01mc@292 294
mas01mc@292 295 }
mas01mc@292 296 else{
mas01mc@292 297 error("Something's wrong with LSH index file");
mas01mc@292 298 exit(1);
mas01mc@292 299 }
mas01mc@292 300
mas01mc@324 301 delete[] newIndexName;
mas01mc@324 302 delete[] sNorm;
mas01mc@324 303 delete[] sPower;
mas01mc@324 304 }
mas01mc@292 305
mas01mc@292 306
mas01cr@405 307 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@405 308 if(usingPower){
mas01cr@405 309 int one;
mas01cr@405 310 unsigned int count;
mas01cr@405 311
mas01cr@405 312 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@405 313 if (count != sizeof(unsigned int)) {
mas01cr@405 314 error("powerfd read failed", "int", "read");
mas01cr@405 315 }
mas01cr@405 316 if (one != 1) {
mas01cr@405 317 error("dimensionality of power file not 1", powerFileName);
mas01cr@405 318 }
mas01cr@405 319
mas01cr@405 320 // FIXME: should check that the powerfile is the right size for
mas01cr@405 321 // this. -- CSR, 2007-10-30
mas01cr@405 322 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@405 323 if (count != numVectors * sizeof(double)) {
mas01cr@405 324 error("powerfd read failed", "double", "read");
mas01cr@405 325 }
mas01cr@405 326 }
mas01cr@405 327 }
mas01cr@405 328
mas01mc@324 329 // initialize auxillary track data from filesystem
mas01mc@324 330 // pre-conditions:
mas01mc@324 331 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 332 // feature data allocated and copied (fvp)
mas01mc@324 333 //
mas01mc@324 334 // post-conditions:
mas01mc@324 335 // allocated power data
mas01mc@324 336 // allocated l2norm data
mas01mc@324 337 //
mas01mc@324 338 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 339 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 340 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 341
mas01mc@324 342 // Allocate and read the power sequence
mas01mc@324 343 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 344
mas01mc@324 345 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 346 char* tmpStr = prefixedString;
mas01mc@324 347 // Open and check dimensions of power file
mas01mc@324 348 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 349 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 350 if(prefixedString!=tmpStr)
mas01mc@324 351 delete[] tmpStr;
mas01mc@324 352 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 353 if (powerfd < 0) {
mas01mc@324 354 error("failed to open power file", prefixedString);
mas01mc@324 355 }
mas01mc@324 356 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 357 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 358 }
mas01mc@324 359
mas01mc@324 360 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 361 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 362
mas01mc@324 363 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 364 assert(*sPowerp);
mas01mc@324 365 *spPtrp = *sPowerp;
mas01mc@324 366 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 367 if (0 < powerfd) {
mas01mc@324 368 close(powerfd);
mas01mc@324 369 }
mas01mc@324 370
mas01cr@427 371 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@427 372 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 373 powerTable = 0;
mas01mc@324 374
mas01mc@324 375 // Allocate and calculate the l2norm sequence
mas01mc@324 376 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 377 assert(*sNormpp);
mas01mc@324 378 *snPtrp = *sNormpp;
mas01cr@426 379 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@427 380 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@427 381 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 382 }
mas01mc@292 383 }
mas01mc@292 384
mas01mc@292 385 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 386 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 387 double** sPowerp, double** spPtrp){
mas01mc@292 388 size_t nfv = 0;
mas01mc@292 389 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 390 Uns32T trackID = 0;
mas01mc@292 391
mas01mc@292 392 VERB_LOG(1, "indexing tracks...");
mas01mc@292 393
mas01mc@324 394 int trackfd = dbfid;
mas01mc@292 395 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 396 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 397 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 398 char* tmpStr = prefixedString;
mas01mc@324 399 // Open and check dimensions of feature file
mas01mc@324 400 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 401 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 402 if(prefixedString!=tmpStr)
mas01mc@324 403 delete[] tmpStr;
mas01cr@454 404 initInputFile(prefixedString);
mas01mc@324 405 trackfd = infid;
mas01mc@324 406 }
mas01cr@433 407 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@433 408 error("failed to read data");
mas01mc@292 409 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 410
mas01mc@324 411 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 412 // Load power and calculate power and l2norm sequence sums
mas01mc@324 413 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 414
mas01mc@292 415 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 416 break;
mas01mc@324 417 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 418 close(infid);
mas01mc@324 419 delete[] *sNormpp;
mas01mc@324 420 delete[] *sPowerp;
mas01mc@324 421 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 422 }
mas01mc@324 423 } // end for(trackID = start_track ; ... )
mas01mc@292 424 std::cout << "finished inserting." << endl;
mas01mc@292 425 }
mas01mc@292 426
mas01mc@292 427 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 428 // Loop over the current input track's vectors
mas01cr@305 429 Uns32T numVecs = 0;
mas01cr@305 430 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 431 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 432 numVecs = 0;
mas01cr@305 433 } else {
mas01cr@305 434 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 435 }
mas01cr@305 436 } else {
mas01cr@305 437 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 438 numVecs = 0;
mas01cr@305 439 } else {
mas01cr@305 440 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 441 }
mas01cr@305 442 }
mas01mc@292 443
mas01mc@324 444 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 445 if(numVecs){
mas01cr@459 446 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324 447
mas01mc@324 448 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@456 449 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@459 450 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@459 451 if(vcount == -1) {
mas01cr@459 452 audiodb_index_delete_shingles(vv);
mas01cr@459 453 error("failed to norm shingles");
mas01cr@459 454 }
mas01cr@459 455 numVecsAboveThreshold = vcount;
mas01mc@324 456 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@459 457 audiodb_index_delete_shingles(vv);
mas01mc@324 458 }
mas01cr@459 459
mas01mc@292 460 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 461
mas01cr@459 462 /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292 463 sequence, which is right, but the space allocated is for the
mas01mc@292 464 whole track. */
mas01mc@292 465
mas01mc@292 466 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 467 * So let's be certain the pointers are in the correct place
mas01mc@292 468 */
mas01mc@292 469
mas01mc@324 470 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 471 *snpp += trackTable[trackID];
mas01mc@324 472 *sppp += trackTable[trackID];
mas01mc@324 473 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 474 }
mas01mc@292 475
mas01mc@292 476 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 477 std::cout.flush();
mas01mc@292 478 return true;
mas01mc@292 479 }
mas01mc@292 480
mas01mc@292 481 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 482 Uns32T collisionCount = 0;
mas01mc@292 483 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 484 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 485 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01cr@458 486 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
mas01mc@324 487 spp+=sequenceHop;
mas01mc@311 488 }
mas01mc@292 489 return collisionCount;
mas01mc@292 490 }
mas01mc@292 491
mas01mc@292 492 /*********************** LSH retrieval ****************************/
mas01mc@292 493
mas01mc@292 494
mas01mc@292 495 // return true if indexed query performed else return false
mas01mc@292 496 int audioDB::index_init_query(const char* dbName){
mas01mc@292 497
mas01mc@292 498 if(!(index_exists(dbName, radius, sequenceLength)))
mas01mc@292 499 return false;
mas01mc@292 500
mas01mc@292 501 char* indexName = index_get_name(dbName, radius, sequenceLength);
mas01mc@292 502
mas01mc@292 503 // Test to see if file exists
mas01mc@292 504 if((lshfid = open (indexName, O_RDONLY)) < 0){
mas01mc@292 505 delete[] indexName;
mas01mc@292 506 return false;
mas01mc@292 507 }
mas01mc@292 508
mas01mc@308 509 lsh = index_allocate(indexName, false); // Get the header only here
mas01mc@292 510 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
mas01mc@292 511
mas01mc@311 512 if(lsh!=SERVER_LSH_INDEX_SINGLETON){
mas01mc@308 513 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
mas01mc@308 514 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
mas01mc@327 515 VERB_LOG(1,"INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 516 VERB_LOG(1,"INDEX: R %f\n", lsh->get_radius());
mas01mc@327 517 VERB_LOG(1,"INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 518 VERB_LOG(1,"INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
mas01mc@327 519 VERB_LOG(1,"INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
mas01mc@327 520 VERB_LOG(1,"INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
mas01mc@327 521 VERB_LOG(1,"INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
mas01cr@458 522 VERB_LOG(1,"INDEX: s %d\n", audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb)));
mas01mc@327 523 VERB_LOG(1,"INDEX: Opened LSH index file %s\n", indexName);
mas01mc@308 524 }
mas01mc@292 525
mas01mc@292 526 // Check to see if we are loading hash tables into core, and do so if true
mas01mc@292 527 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
mas01mc@308 528 if(SERVER_LSH_INDEX_SINGLETON)
mas01mc@308 529 fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName());
mas01mc@308 530 else
mas01mc@327 531 VERB_LOG(1,"INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
mas01mc@308 532 lsh = index_allocate(indexName, true);
mas01mc@292 533 }
mas01mc@292 534
mas01mc@292 535 delete[] indexName;
mas01mc@292 536 return true;
mas01mc@292 537 }
mas01mc@292 538
mas01cr@458 539 void audiodb_index_add_point_approximate(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@458 540 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@458 541 adb_t *adb = data->adb;
mas01cr@458 542 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@458 543 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@458 544 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@458 545 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@458 546 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@458 547 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@424 548 adb_result_t r;
mas01cr@458 549 r.key = (*adb->keys)[trackID].c_str();
mas01cr@424 550 r.dist = dist;
mas01cr@424 551 r.qpos = qpos;
mas01cr@424 552 r.ipos = spos;
mas01cr@458 553 qstate->accumulator->add_point(&r);
mas01cr@424 554 }
mas01mc@292 555 }
mas01mc@292 556
mas01cr@458 557 // Maintain a queue of points to pass to query_loop_points() for exact
mas01cr@458 558 // evaluation
mas01cr@458 559 void audiodb_index_add_point_exact(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@458 560 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@458 561 adb_t *adb = data->adb;
mas01cr@458 562 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@458 563 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@458 564 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@458 565 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@458 566 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@458 567 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@458 568 PointPair p(trackID, qpos, spos);
mas01cr@458 569 qstate->exact_evaluation_queue->push(p);
mas01cr@458 570 }
mas01mc@292 571 }
mas01mc@292 572
mas01mc@292 573 // return 0: if index does not exist
mas01mc@292 574 // return nqv: if index exists
mas01cr@458 575 int audioDB::index_query_loop(adb_t *adb, adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
mas01mc@292 576
mas01mc@324 577 double *query = 0, *query_data = 0;
mas01cr@437 578 adb_qpointers_internal_t qpointers = {0};
mas01cr@437 579
mas01cr@458 580 adb_qcallback_t callback_data;
mas01cr@458 581 callback_data.adb = adb;
mas01cr@458 582 callback_data.qstate = qstate;
mas01cr@458 583
mas01mc@292 584 void (*add_point_func)(void*,Uns32T,Uns32T,float);
mas01mc@292 585
mas01cr@436 586 sequenceLength = spec->qid.sequence_length;
mas01cr@435 587 normalizedDistance = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED);
mas01cr@431 588
mas01mc@292 589 // Set the point-reporter callback based on the value of lsh_exact
mas01cr@458 590 if(lsh_exact) {
mas01cr@458 591 qstate->exact_evaluation_queue = new std::priority_queue<PointPair>;
mas01cr@458 592 add_point_func = &audiodb_index_add_point_exact;
mas01cr@458 593 } else {
mas01cr@458 594 add_point_func = &audiodb_index_add_point_approximate;
mas01mc@292 595 }
mas01mc@292 596
mas01cr@455 597 if(!index_init_query(adb->path)) // sets-up LSH index structures for querying
mas01mc@292 598 return 0;
mas01mc@292 599
mas01cr@455 600 char* database = index_get_name(adb->path, radius, sequenceLength);
mas01mc@292 601
mas01cr@444 602 if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
mas01cr@444 603 error("failed to set up qpointers");
mas01cr@444 604 }
mas01mc@292 605
mas01mc@292 606 // query vector index
mas01cr@437 607 Uns32T Nq = (qpointers.nvectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:qpointers.nvectors) - sequenceLength + 1;
mas01cr@459 608 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequenceLength); // allocate memory to copy query vectors to shingles
mas01cr@447 609
mas01mc@292 610 // Construct shingles from query features
mas01mc@292 611 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
mas01cr@456 612 audiodb_index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
mas01mc@292 613
mas01mc@292 614 // Normalize query vectors
mas01cr@459 615 int vcount = audiodb_index_norm_shingles(vv, qpointers.l2norm, qpointers.power, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@459 616 if(vcount == -1) {
mas01cr@459 617 audiodb_index_delete_shingles(vv);
mas01cr@459 618 error("failed to norm shingles");
mas01cr@459 619 }
mas01cr@459 620 Uns32T numVecsAboveThreshold = vcount;
mas01mc@292 621
mas01mc@292 622 // Nq contains number of inspected points in query file,
mas01mc@292 623 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01cr@437 624 double* qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation
mas01mc@292 625 if(usingQueryPoint && numVecsAboveThreshold){
mas01mc@292 626 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
mas01cr@458 627 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, &callback_data);
mas01mc@292 628 else
mas01cr@458 629 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, &callback_data);
mas01mc@292 630 }
mas01mc@292 631 else if(numVecsAboveThreshold)
mas01mc@292 632 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
mas01cr@370 633 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@370 634 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@458 635 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 636 } else {
mas01cr@458 637 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 638 }
mas01cr@370 639 }
mas01cr@459 640 audiodb_index_delete_shingles(vv);
mas01mc@292 641
mas01mc@292 642 if(lsh_exact)
mas01mc@292 643 // Perform exact distance computation on point pairs in exact_evaluation_queue
mas01cr@458 644 query_loop_points(adb, spec, qstate, query, &qpointers);
mas01mc@292 645
mas01mc@292 646 // Close the index file
mas01mc@292 647 close(lshfid);
mas01mc@292 648
mas01mc@292 649 // Clean up
mas01mc@292 650 if(query_data)
mas01mc@292 651 delete[] query_data;
mas01cr@437 652 if(qpointers.l2norm_data)
mas01cr@437 653 delete[] qpointers.l2norm_data;
mas01cr@437 654 if(qpointers.power_data)
mas01cr@437 655 delete[] qpointers.power_data;
mas01cr@437 656 if(qpointers.mean_duration)
mas01cr@437 657 delete[] qpointers.mean_duration;
mas01mc@292 658 if(database)
mas01mc@292 659 delete[] database;
mas01mc@292 660
mas01mc@292 661 return Nq;
mas01mc@292 662 }
mas01mc@292 663