annotate index.cpp @ 497:9d8aee621afb api-inversion

More libtests fixups. Include audiodb_close() calls everywhere (whoops). Add the facility to run tests under valgrind. Unfortunately the error-exitcode flag doesn't actually cause an error exit if the only thing wrong is memory leaks, but it will if there are actual memory errors, which is a start.
author mas01cr
date Sat, 10 Jan 2009 16:07:43 +0000
parents b2fd8113d8bc
children
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01cr@426 15 #include "audioDB-internals.h"
mas01mc@292 16
mas01cr@458 17 typedef struct adb_qcallback {
mas01cr@458 18 adb_t *adb;
mas01cr@458 19 adb_qstate_internal_t *qstate;
mas01cr@458 20 } adb_qcallback_t;
mas01mc@292 21
mas01mc@292 22 /************************* LSH indexing and query initialization *****************/
mas01mc@292 23
mas01cr@460 24 /* FIXME: there are several things wrong with this: the memory
mas01cr@460 25 * discipline isn't ideal, the radius printing is a bit lame, the name
mas01cr@460 26 * getting will succeed or fail depending on whether the path was
mas01cr@460 27 * relative or absolute -- but most importantly encoding all that
mas01cr@460 28 * information in a filename is going to lose: it's impossible to
mas01cr@460 29 * maintain backwards-compatibility. Instead we should probably store
mas01cr@460 30 * the index metadata inside the audiodb instance. */
mas01cr@460 31 char *audiodb_index_get_name(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@460 32 char *indexName;
mas01cr@460 33 if(strlen(dbName) > (MAXSTR - 32)) {
mas01cr@460 34 return NULL;
mas01cr@460 35 }
mas01cr@460 36 indexName = new char[MAXSTR];
mas01mc@292 37 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 38 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 39 return indexName;
mas01mc@292 40 }
mas01mc@292 41
mas01cr@460 42 bool audiodb_index_exists(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@460 43 char *indexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@460 44 if(!indexName) {
mas01cr@460 45 return false;
mas01cr@460 46 }
mas01cr@460 47 struct stat st;
mas01cr@460 48 if(stat(indexName, &st)) {
mas01cr@460 49 delete [] indexName;
mas01cr@460 50 return false;
mas01cr@460 51 }
mas01cr@460 52 /* FIXME: other stat checks here? */
mas01cr@460 53 /* FIXME: is there any better way to check whether we can open a
mas01cr@460 54 * file for reading than by opening a file for reading? */
mas01cr@460 55 int fd = open(indexName, O_RDONLY);
mas01cr@460 56 delete [] indexName;
mas01cr@460 57 if(fd < 0) {
mas01cr@460 58 return false;
mas01cr@460 59 } else {
mas01cr@460 60 close(fd);
mas01mc@292 61 return true;
mas01cr@460 62 }
mas01mc@292 63 }
mas01mc@292 64
mas01cr@465 65 /* FIXME: the indexName arg should be "const char *", but the LSH
mas01cr@465 66 * library doesn't like that.
mas01cr@465 67 */
mas01cr@465 68 LSH *audiodb_index_allocate(adb_t *adb, char *indexName, bool load_tables) {
mas01cr@465 69 LSH *lsh;
mas01cr@465 70 if(adb->cached_lsh) {
mas01cr@465 71 if(!strncmp(adb->cached_lsh->get_indexName(), indexName, MAXSTR)) {
mas01cr@465 72 return adb->cached_lsh;
mas01cr@465 73 } else {
mas01cr@465 74 delete adb->cached_lsh;
mas01cr@465 75 }
mas01mc@308 76 }
mas01cr@465 77 lsh = new LSH(indexName, load_tables);
mas01cr@465 78 if(load_tables) {
mas01cr@465 79 adb->cached_lsh = lsh;
mas01cr@465 80 }
mas01cr@465 81 return lsh;
mas01mc@308 82 }
mas01mc@308 83
mas01cr@459 84 vector<vector<float> > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) {
mas01cr@459 85 std::vector<std::vector<float> > *vv = new vector<vector<float> >(sz);
mas01cr@459 86 for(Uns32T i=0 ; i < sz ; i++) {
mas01cr@459 87 (*vv)[i]=vector<float>(dim * seqLen);
mas01cr@459 88 }
mas01mc@292 89 return vv;
mas01mc@292 90 }
mas01mc@292 91
mas01cr@459 92 void audiodb_index_delete_shingles(vector<vector<float> > *vv) {
mas01cr@459 93 delete vv;
mas01cr@459 94 }
mas01cr@459 95
mas01mc@292 96 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 97
mas01mc@292 98 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 99 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 100 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 101 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 102 }
mas01mc@324 103
mas01mc@325 104 double *snpp = 0, *sppp = 0;
mas01mc@324 105
mas01mc@292 106 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 107 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 108 snpp = *snp;
mas01mc@292 109 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 110 sppp = *spp;
mas01mc@325 111
mas01mc@324 112 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 113 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 114
mas01mc@324 115
mas01mc@292 116 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 117 if(trackTable[i] >= sequenceLength) {
mas01cr@427 118 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@427 119 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 120
mas01cr@427 121 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@427 122 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 123 }
mas01mc@292 124 snpp += trackTable[i];
mas01mc@292 125 sppp += trackTable[i];
mas01mc@292 126 }
mas01mc@324 127
mas01mc@292 128 *vsnp = *snp;
mas01mc@292 129 *vspp = *spp;
mas01mc@324 130
mas01mc@292 131 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 132 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 133 }
mas01mc@292 134
mas01mc@292 135
mas01cr@456 136 /********************* LSH shingle construction ***************************/
mas01cr@456 137
mas01cr@456 138 // Construct shingles out of a feature matrix
mas01cr@456 139 // inputs:
mas01cr@456 140 // idx is vector index in feature matrix
mas01cr@456 141 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01cr@456 142 //
mas01cr@456 143 // pre-conditions:
mas01cr@456 144 // dbH->dim
mas01cr@456 145 // sequenceLength
mas01cr@456 146 // idx < numVectors - sequenceLength + 1
mas01cr@456 147 //
mas01cr@456 148 // post-conditions:
mas01cr@456 149 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01cr@456 150
mas01cr@456 151 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01cr@456 152 assert(idx<(*vv).size());
mas01cr@456 153 vector<float>::iterator ve = (*vv)[idx].end();
mas01cr@456 154 vector<float>::iterator vi = (*vv)[idx].begin();
mas01cr@456 155 // First feature vector in shingle
mas01cr@456 156 if(idx == 0) {
mas01cr@456 157 while(vi!=ve) {
mas01cr@456 158 *vi++ = (float)(*fvp++);
mas01cr@456 159 }
mas01cr@456 160 } else {
mas01cr@456 161 // Not first feature vector in shingle
mas01cr@456 162 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
mas01cr@456 163 // Previous seqLen-1 dim-vectors
mas01cr@456 164 while(vi!=ve-dim) {
mas01cr@456 165 *vi++ = *ui++;
mas01cr@456 166 }
mas01cr@456 167 // Move data pointer to next feature vector
mas01cr@456 168 fvp += ( seqLen + idx - 1 ) * dim ;
mas01cr@456 169 // New d-vector
mas01cr@456 170 while(vi!=ve) {
mas01cr@456 171 *vi++ = (float)(*fvp++);
mas01cr@456 172 }
mas01cr@456 173 }
mas01cr@456 174 }
mas01cr@456 175
mas01cr@456 176 // norm shingles
mas01cr@456 177 // in-place norming, no deletions
mas01cr@456 178 // If using power, return number of shingles above power threshold
mas01cr@459 179 int audiodb_index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) {
mas01cr@456 180 int z = 0; // number of above-threshold shingles
mas01cr@456 181 float l2norm;
mas01cr@456 182 double power;
mas01cr@456 183 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01cr@456 184 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01cr@459 185 Uns32T shingleSize = seqLen * dim;
mas01cr@459 186
mas01cr@459 187 if(!spp) {
mas01cr@459 188 return -1;
mas01cr@459 189 }
mas01cr@456 190 for(Uns32T a=0; a<(*vv).size(); a++){
mas01cr@456 191 l2norm = (float)(*snp++);
mas01cr@459 192 if(normed_vectors)
mas01cr@456 193 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01cr@456 194
mas01cr@456 195 for(Uns32T b=0; b < shingleSize ; b++)
mas01cr@456 196 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01cr@456 197
mas01cr@456 198 power = *spp++;
mas01cr@459 199 if(use_pthreshold){
mas01cr@459 200 if (power >= pthreshold)
mas01cr@456 201 z++;
mas01cr@456 202 }
mas01cr@456 203 else
mas01cr@456 204 z++;
mas01cr@456 205 }
mas01cr@456 206 return z;
mas01cr@456 207 }
mas01cr@456 208
mas01cr@456 209
mas01mc@292 210 /************************ LSH indexing ***********************************/
mas01mc@292 211 void audioDB::index_index_db(const char* dbName){
mas01mc@292 212 char* newIndexName;
mas01mc@292 213 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 214 Uns32T dbVectors = 0;
mas01mc@292 215
mas01mc@324 216
mas01mc@292 217 printf("INDEX: initializing header\n");
mas01mc@292 218 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 219 forWrite = false;
mas01mc@292 220 initDBHeader(dbName);
mas01mc@292 221
mas01mc@324 222 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 223 usingPower = true;
mas01mc@324 224
mas01mc@324 225 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 226 usingTimes = true;
mas01mc@324 227
mas01cr@460 228 newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@460 229 if(!newIndexName) {
mas01cr@460 230 error("failed to get index name", dbName);
mas01cr@460 231 }
mas01mc@292 232
mas01mc@292 233 // Set unit norming flag override
mas01mc@292 234 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 235
mas01mc@327 236 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 237 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 238 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 239 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 240 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 241 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 242 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 243 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 244 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 245 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 246
mas01mc@292 247 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 248 printf("INDEX: constructing new LSH index\n");
mas01mc@292 249 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 250 fflush(stdout);
mas01mc@292 251 // Construct new LSH index
mas01mc@292 252 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 253 lsh_param_m,
mas01mc@292 254 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 255 lsh_param_N,
mas01mc@292 256 lsh_param_ncols,
mas01mc@292 257 (float)radius);
mas01mc@292 258 assert(lsh);
mas01mc@292 259
mas01mc@292 260 Uns32T endTrack = lsh_param_b;
mas01mc@292 261 if( endTrack > dbH->numFiles)
mas01mc@292 262 endTrack = dbH->numFiles;
mas01mc@292 263 // Insert up to lsh_param_b tracks
mas01mc@324 264 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 265 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 266 }
mas01mc@324 267 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 268 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 269
mas01mc@292 270 // Clean up
mas01mc@292 271 delete lsh;
mas01mc@308 272 lsh = 0;
mas01cr@465 273 } else {
mas01mc@292 274 close(lshfid);
mas01mc@292 275 }
mas01mc@292 276
mas01mc@292 277 // Attempt to open LSH file
mas01mc@292 278 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 279 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 280 fflush(stdout);
mas01mc@340 281 char* mergeIndexName = newIndexName;
mas01mc@292 282
mas01mc@292 283 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 284 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 285 assert(lsh);
mas01cr@458 286 Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
mas01mc@292 287 delete lsh;
mas01mc@308 288 lsh = 0;
mas01mc@292 289
mas01mc@340 290 // Insert up to lsh_param_b tracks
mas01mc@340 291 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 292 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 293 }
mas01mc@292 294 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 295 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 296
mas01mc@292 297 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 298 if( endTrack > dbH->numFiles)
mas01mc@292 299 endTrack = dbH->numFiles;
mas01mc@292 300 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 301 fflush(stdout);
mas01mc@340 302 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 303 assert(lsh);
mas01mc@292 304
mas01mc@292 305 // Insert up to lsh_param_b database tracks
mas01mc@292 306 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 307
mas01mc@340 308 // Serialize to file (merging is performed here)
mas01mc@340 309 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 310 delete lsh;
mas01mc@308 311 lsh = 0;
mas01mc@340 312 }
mas01mc@292 313
mas01mc@292 314 close(lshfid);
mas01mc@292 315 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 316 fflush(stdout);
mas01mc@292 317
mas01mc@292 318 }
mas01mc@292 319 else{
mas01mc@292 320 error("Something's wrong with LSH index file");
mas01mc@292 321 exit(1);
mas01mc@292 322 }
mas01mc@292 323
mas01mc@324 324 delete[] newIndexName;
mas01mc@324 325 delete[] sNorm;
mas01mc@324 326 delete[] sPower;
mas01mc@324 327 }
mas01mc@292 328
mas01mc@292 329
mas01cr@405 330 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@405 331 if(usingPower){
mas01cr@405 332 int one;
mas01cr@405 333 unsigned int count;
mas01cr@405 334
mas01cr@405 335 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@405 336 if (count != sizeof(unsigned int)) {
mas01cr@405 337 error("powerfd read failed", "int", "read");
mas01cr@405 338 }
mas01cr@405 339 if (one != 1) {
mas01cr@405 340 error("dimensionality of power file not 1", powerFileName);
mas01cr@405 341 }
mas01cr@405 342
mas01cr@405 343 // FIXME: should check that the powerfile is the right size for
mas01cr@405 344 // this. -- CSR, 2007-10-30
mas01cr@405 345 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@405 346 if (count != numVectors * sizeof(double)) {
mas01cr@405 347 error("powerfd read failed", "double", "read");
mas01cr@405 348 }
mas01cr@405 349 }
mas01cr@405 350 }
mas01cr@405 351
mas01mc@324 352 // initialize auxillary track data from filesystem
mas01mc@324 353 // pre-conditions:
mas01mc@324 354 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 355 // feature data allocated and copied (fvp)
mas01mc@324 356 //
mas01mc@324 357 // post-conditions:
mas01mc@324 358 // allocated power data
mas01mc@324 359 // allocated l2norm data
mas01mc@324 360 //
mas01mc@324 361 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 362 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 363 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 364
mas01mc@324 365 // Allocate and read the power sequence
mas01mc@324 366 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 367
mas01mc@324 368 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 369 char* tmpStr = prefixedString;
mas01mc@324 370 // Open and check dimensions of power file
mas01mc@324 371 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 372 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 373 if(prefixedString!=tmpStr)
mas01mc@324 374 delete[] tmpStr;
mas01mc@324 375 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 376 if (powerfd < 0) {
mas01mc@324 377 error("failed to open power file", prefixedString);
mas01mc@324 378 }
mas01mc@324 379 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 380 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 381 }
mas01mc@324 382
mas01mc@324 383 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 384 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 385
mas01mc@324 386 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 387 assert(*sPowerp);
mas01mc@324 388 *spPtrp = *sPowerp;
mas01mc@324 389 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 390 if (0 < powerfd) {
mas01mc@324 391 close(powerfd);
mas01mc@324 392 }
mas01mc@324 393
mas01cr@427 394 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@427 395 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 396 powerTable = 0;
mas01mc@324 397
mas01mc@324 398 // Allocate and calculate the l2norm sequence
mas01mc@324 399 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 400 assert(*sNormpp);
mas01mc@324 401 *snPtrp = *sNormpp;
mas01cr@426 402 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@427 403 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@427 404 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 405 }
mas01mc@292 406 }
mas01mc@292 407
mas01mc@292 408 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 409 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 410 double** sPowerp, double** spPtrp){
mas01mc@292 411 size_t nfv = 0;
mas01mc@292 412 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 413 Uns32T trackID = 0;
mas01mc@292 414
mas01mc@292 415 VERB_LOG(1, "indexing tracks...");
mas01mc@292 416
mas01mc@324 417 int trackfd = dbfid;
mas01mc@292 418 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 419 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 420 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 421 char* tmpStr = prefixedString;
mas01mc@324 422 // Open and check dimensions of feature file
mas01mc@324 423 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 424 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 425 if(prefixedString!=tmpStr)
mas01mc@324 426 delete[] tmpStr;
mas01cr@454 427 initInputFile(prefixedString);
mas01mc@324 428 trackfd = infid;
mas01mc@324 429 }
mas01cr@433 430 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@433 431 error("failed to read data");
mas01mc@292 432 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 433
mas01mc@324 434 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 435 // Load power and calculate power and l2norm sequence sums
mas01mc@324 436 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 437
mas01mc@292 438 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 439 break;
mas01mc@324 440 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 441 close(infid);
mas01mc@324 442 delete[] *sNormpp;
mas01mc@324 443 delete[] *sPowerp;
mas01mc@324 444 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 445 }
mas01mc@324 446 } // end for(trackID = start_track ; ... )
mas01mc@292 447 std::cout << "finished inserting." << endl;
mas01mc@292 448 }
mas01mc@292 449
mas01mc@292 450 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 451 // Loop over the current input track's vectors
mas01cr@305 452 Uns32T numVecs = 0;
mas01cr@305 453 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 454 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 455 numVecs = 0;
mas01cr@305 456 } else {
mas01cr@305 457 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 458 }
mas01cr@305 459 } else {
mas01cr@305 460 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 461 numVecs = 0;
mas01cr@305 462 } else {
mas01cr@305 463 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 464 }
mas01cr@305 465 }
mas01mc@292 466
mas01mc@324 467 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 468 if(numVecs){
mas01cr@459 469 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324 470
mas01mc@324 471 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@456 472 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@459 473 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@459 474 if(vcount == -1) {
mas01cr@459 475 audiodb_index_delete_shingles(vv);
mas01cr@459 476 error("failed to norm shingles");
mas01cr@459 477 }
mas01cr@459 478 numVecsAboveThreshold = vcount;
mas01mc@324 479 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@459 480 audiodb_index_delete_shingles(vv);
mas01mc@324 481 }
mas01cr@459 482
mas01mc@292 483 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 484
mas01cr@459 485 /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292 486 sequence, which is right, but the space allocated is for the
mas01mc@292 487 whole track. */
mas01mc@292 488
mas01mc@292 489 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 490 * So let's be certain the pointers are in the correct place
mas01mc@292 491 */
mas01mc@292 492
mas01mc@324 493 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 494 *snpp += trackTable[trackID];
mas01mc@324 495 *sppp += trackTable[trackID];
mas01mc@324 496 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 497 }
mas01mc@292 498
mas01mc@292 499 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 500 std::cout.flush();
mas01mc@292 501 return true;
mas01mc@292 502 }
mas01mc@292 503
mas01mc@292 504 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 505 Uns32T collisionCount = 0;
mas01mc@292 506 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 507 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 508 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01cr@458 509 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
mas01mc@324 510 spp+=sequenceHop;
mas01mc@311 511 }
mas01mc@292 512 return collisionCount;
mas01mc@292 513 }
mas01mc@292 514
mas01mc@292 515 /*********************** LSH retrieval ****************************/
mas01mc@292 516
mas01mc@292 517
mas01mc@292 518 // return true if indexed query performed else return false
mas01cr@473 519 int audiodb_index_init_query(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate, bool corep) {
mas01mc@292 520
mas01cr@465 521 uint32_t sequence_length = spec->qid.sequence_length;
mas01cr@465 522 double radius = spec->refine.radius;
mas01cr@465 523 if(!(audiodb_index_exists(adb->path, radius, sequence_length)))
mas01mc@292 524 return false;
mas01mc@292 525
mas01cr@465 526 char *indexName = audiodb_index_get_name(adb->path, radius, sequence_length);
mas01cr@460 527 if(!indexName) {
mas01cr@465 528 return false;
mas01cr@460 529 }
mas01mc@292 530
mas01cr@465 531 qstate->lsh = audiodb_index_allocate(adb, indexName, corep);
mas01cr@465 532
mas01cr@465 533 /* FIXME: it would be nice if the LSH library didn't make me do
mas01cr@465 534 * this. */
mas01cr@465 535 if((!corep) && (qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2)) {
mas01cr@465 536 delete qstate->lsh;
mas01cr@465 537 qstate->lsh = audiodb_index_allocate(adb, indexName, true);
mas01mc@292 538 }
mas01mc@292 539
mas01mc@292 540 delete[] indexName;
mas01mc@292 541 return true;
mas01mc@292 542 }
mas01mc@292 543
mas01cr@458 544 void audiodb_index_add_point_approximate(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@458 545 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@458 546 adb_t *adb = data->adb;
mas01cr@458 547 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@458 548 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@458 549 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@458 550 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@458 551 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@458 552 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@424 553 adb_result_t r;
mas01cr@458 554 r.key = (*adb->keys)[trackID].c_str();
mas01cr@424 555 r.dist = dist;
mas01cr@424 556 r.qpos = qpos;
mas01cr@424 557 r.ipos = spos;
mas01cr@458 558 qstate->accumulator->add_point(&r);
mas01cr@424 559 }
mas01mc@292 560 }
mas01mc@292 561
mas01cr@463 562 // Maintain a queue of points to pass to audiodb_query_queue_loop()
mas01cr@463 563 // for exact evaluation
mas01cr@458 564 void audiodb_index_add_point_exact(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@458 565 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@458 566 adb_t *adb = data->adb;
mas01cr@458 567 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@458 568 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@458 569 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@458 570 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@458 571 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@458 572 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@458 573 PointPair p(trackID, qpos, spos);
mas01cr@458 574 qstate->exact_evaluation_queue->push(p);
mas01cr@458 575 }
mas01mc@292 576 }
mas01mc@292 577
mas01cr@466 578 // return -1 on error
mas01mc@292 579 // return 0: if index does not exist
mas01mc@292 580 // return nqv: if index exists
mas01cr@473 581 int audiodb_index_query_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
mas01mc@292 582
mas01mc@324 583 double *query = 0, *query_data = 0;
mas01cr@437 584 adb_qpointers_internal_t qpointers = {0};
mas01cr@437 585
mas01cr@458 586 adb_qcallback_t callback_data;
mas01cr@458 587 callback_data.adb = adb;
mas01cr@458 588 callback_data.qstate = qstate;
mas01cr@458 589
mas01cr@466 590 void (*add_point_func)(void *, uint32_t, uint32_t, float);
mas01mc@292 591
mas01cr@466 592 uint32_t sequence_length = spec->qid.sequence_length;
mas01cr@466 593 bool normalized = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED);
mas01cr@466 594 double radius = spec->refine.radius;
mas01cr@466 595 bool use_absolute_threshold = spec->refine.flags & ADB_REFINE_ABSOLUTE_THRESHOLD;
mas01cr@466 596 double absolute_threshold = spec->refine.absolute_threshold;
mas01cr@431 597
mas01cr@468 598 if(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES) {
mas01cr@468 599 add_point_func = &audiodb_index_add_point_approximate;
mas01cr@468 600 } else {
mas01cr@458 601 qstate->exact_evaluation_queue = new std::priority_queue<PointPair>;
mas01cr@458 602 add_point_func = &audiodb_index_add_point_exact;
mas01mc@292 603 }
mas01mc@292 604
mas01cr@468 605 /* FIXME: this hardwired lsh_in_core is here to allow for a
mas01cr@468 606 * transition period while the need for the argument is worked
mas01cr@468 607 * through. Hopefully it will disappear again eventually. */
mas01cr@468 608 bool lsh_in_core = true;
mas01cr@468 609
mas01cr@465 610 if(!audiodb_index_init_query(adb, spec, qstate, lsh_in_core)) {
mas01mc@292 611 return 0;
mas01cr@465 612 }
mas01mc@292 613
mas01cr@466 614 char *database = audiodb_index_get_name(adb->path, radius, sequence_length);
mas01cr@460 615 if(!database) {
mas01cr@466 616 return -1;
mas01cr@460 617 }
mas01mc@292 618
mas01cr@444 619 if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
mas01cr@466 620 delete [] database;
mas01cr@466 621 return -1;
mas01cr@444 622 }
mas01mc@292 623
mas01cr@466 624 uint32_t Nq = (qpointers.nvectors > O2_MAXTRACKLEN ? O2_MAXTRACKLEN : qpointers.nvectors) - sequence_length + 1;
mas01cr@466 625 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequence_length);
mas01cr@447 626
mas01mc@292 627 // Construct shingles from query features
mas01cr@468 628 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
mas01cr@466 629 audiodb_index_make_shingle(vv, pointID, query, adb->header->dim, sequence_length);
mas01cr@468 630 }
mas01mc@292 631
mas01mc@292 632 // Normalize query vectors
mas01cr@466 633 int vcount = audiodb_index_norm_shingles(vv, qpointers.l2norm, qpointers.power, adb->header->dim, sequence_length, radius, normalized, use_absolute_threshold, absolute_threshold);
mas01cr@459 634 if(vcount == -1) {
mas01cr@459 635 audiodb_index_delete_shingles(vv);
mas01cr@466 636 delete [] database;
mas01cr@466 637 return -1;
mas01cr@459 638 }
mas01cr@466 639 uint32_t numVecsAboveThreshold = vcount;
mas01mc@292 640
mas01mc@292 641 // Nq contains number of inspected points in query file,
mas01mc@292 642 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01cr@466 643 double *qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation
mas01cr@468 644 if(!(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) && numVecsAboveThreshold) {
mas01cr@466 645 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@466 646 qstate->lsh->retrieve_point((*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
mas01cr@466 647 } else {
mas01cr@466 648 qstate->lsh->serial_retrieve_point(database, (*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
mas01cr@466 649 }
mas01cr@466 650 } else if(numVecsAboveThreshold) {
mas01cr@466 651 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
mas01cr@370 652 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@466 653 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@465 654 qstate->lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 655 } else {
mas01cr@465 656 qstate->lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 657 }
mas01cr@370 658 }
mas01cr@466 659 }
mas01cr@466 660 }
mas01cr@459 661 audiodb_index_delete_shingles(vv);
mas01mc@292 662
mas01cr@468 663 if(!(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES)) {
mas01cr@463 664 audiodb_query_queue_loop(adb, spec, qstate, query, &qpointers);
mas01cr@466 665 }
mas01mc@292 666
mas01mc@292 667 // Clean up
mas01mc@292 668 if(query_data)
mas01mc@292 669 delete[] query_data;
mas01cr@437 670 if(qpointers.l2norm_data)
mas01cr@437 671 delete[] qpointers.l2norm_data;
mas01cr@437 672 if(qpointers.power_data)
mas01cr@437 673 delete[] qpointers.power_data;
mas01cr@437 674 if(qpointers.mean_duration)
mas01cr@437 675 delete[] qpointers.mean_duration;
mas01mc@292 676 if(database)
mas01mc@292 677 delete[] database;
mas01cr@465 678 if(qstate->lsh != adb->cached_lsh)
mas01cr@465 679 delete qstate->lsh;
mas01mc@292 680
mas01mc@292 681 return Nq;
mas01mc@292 682 }
mas01mc@292 683