annotate index.cpp @ 473:b2fd8113d8bc api-inversion

const declarations for some API arguments. This should make it slightly clearer whose responsibility (the user's) it is to manage the memory pointed to by the corresponding arguments. Suggested by Chris Cannam.
author mas01cr
date Tue, 06 Jan 2009 16:27:01 +0000
parents 4dbd7917bf9e
children
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01cr@426 15 #include "audioDB-internals.h"
mas01mc@292 16
mas01cr@458 17 typedef struct adb_qcallback {
mas01cr@458 18 adb_t *adb;
mas01cr@458 19 adb_qstate_internal_t *qstate;
mas01cr@458 20 } adb_qcallback_t;
mas01mc@292 21
mas01mc@292 22 /************************* LSH indexing and query initialization *****************/
mas01mc@292 23
mas01cr@460 24 /* FIXME: there are several things wrong with this: the memory
mas01cr@460 25 * discipline isn't ideal, the radius printing is a bit lame, the name
mas01cr@460 26 * getting will succeed or fail depending on whether the path was
mas01cr@460 27 * relative or absolute -- but most importantly encoding all that
mas01cr@460 28 * information in a filename is going to lose: it's impossible to
mas01cr@460 29 * maintain backwards-compatibility. Instead we should probably store
mas01cr@460 30 * the index metadata inside the audiodb instance. */
mas01cr@460 31 char *audiodb_index_get_name(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@460 32 char *indexName;
mas01cr@460 33 if(strlen(dbName) > (MAXSTR - 32)) {
mas01cr@460 34 return NULL;
mas01cr@460 35 }
mas01cr@460 36 indexName = new char[MAXSTR];
mas01mc@292 37 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 38 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 39 return indexName;
mas01mc@292 40 }
mas01mc@292 41
mas01cr@460 42 bool audiodb_index_exists(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@460 43 char *indexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@460 44 if(!indexName) {
mas01cr@460 45 return false;
mas01cr@460 46 }
mas01cr@460 47 struct stat st;
mas01cr@460 48 if(stat(indexName, &st)) {
mas01cr@460 49 delete [] indexName;
mas01cr@460 50 return false;
mas01cr@460 51 }
mas01cr@460 52 /* FIXME: other stat checks here? */
mas01cr@460 53 /* FIXME: is there any better way to check whether we can open a
mas01cr@460 54 * file for reading than by opening a file for reading? */
mas01cr@460 55 int fd = open(indexName, O_RDONLY);
mas01cr@460 56 delete [] indexName;
mas01cr@460 57 if(fd < 0) {
mas01cr@460 58 return false;
mas01cr@460 59 } else {
mas01cr@460 60 close(fd);
mas01mc@292 61 return true;
mas01cr@460 62 }
mas01mc@292 63 }
mas01mc@292 64
mas01cr@465 65 /* FIXME: the indexName arg should be "const char *", but the LSH
mas01cr@465 66 * library doesn't like that.
mas01cr@465 67 */
mas01cr@465 68 LSH *audiodb_index_allocate(adb_t *adb, char *indexName, bool load_tables) {
mas01cr@465 69 LSH *lsh;
mas01cr@465 70 if(adb->cached_lsh) {
mas01cr@465 71 if(!strncmp(adb->cached_lsh->get_indexName(), indexName, MAXSTR)) {
mas01cr@465 72 return adb->cached_lsh;
mas01cr@465 73 } else {
mas01cr@465 74 delete adb->cached_lsh;
mas01cr@465 75 }
mas01mc@308 76 }
mas01cr@465 77 lsh = new LSH(indexName, load_tables);
mas01cr@465 78 if(load_tables) {
mas01cr@465 79 adb->cached_lsh = lsh;
mas01cr@465 80 }
mas01cr@465 81 return lsh;
mas01mc@308 82 }
mas01mc@308 83
mas01cr@459 84 vector<vector<float> > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) {
mas01cr@459 85 std::vector<std::vector<float> > *vv = new vector<vector<float> >(sz);
mas01cr@459 86 for(Uns32T i=0 ; i < sz ; i++) {
mas01cr@459 87 (*vv)[i]=vector<float>(dim * seqLen);
mas01cr@459 88 }
mas01mc@292 89 return vv;
mas01mc@292 90 }
mas01mc@292 91
mas01cr@459 92 void audiodb_index_delete_shingles(vector<vector<float> > *vv) {
mas01cr@459 93 delete vv;
mas01cr@459 94 }
mas01cr@459 95
mas01mc@292 96 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 97
mas01mc@292 98 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 99 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 100 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 101 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 102 }
mas01mc@324 103
mas01mc@325 104 double *snpp = 0, *sppp = 0;
mas01mc@324 105
mas01mc@292 106 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 107 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 108 snpp = *snp;
mas01mc@292 109 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 110 sppp = *spp;
mas01mc@325 111
mas01mc@324 112 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 113 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 114
mas01mc@324 115
mas01mc@292 116 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 117 if(trackTable[i] >= sequenceLength) {
mas01cr@427 118 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@427 119 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 120
mas01cr@427 121 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@427 122 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 123 }
mas01mc@292 124 snpp += trackTable[i];
mas01mc@292 125 sppp += trackTable[i];
mas01mc@292 126 }
mas01mc@324 127
mas01mc@292 128 *vsnp = *snp;
mas01mc@292 129 *vspp = *spp;
mas01mc@324 130
mas01mc@292 131 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 132 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 133 }
mas01mc@292 134
mas01mc@292 135
mas01cr@456 136 /********************* LSH shingle construction ***************************/
mas01cr@456 137
mas01cr@456 138 // Construct shingles out of a feature matrix
mas01cr@456 139 // inputs:
mas01cr@456 140 // idx is vector index in feature matrix
mas01cr@456 141 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01cr@456 142 //
mas01cr@456 143 // pre-conditions:
mas01cr@456 144 // dbH->dim
mas01cr@456 145 // sequenceLength
mas01cr@456 146 // idx < numVectors - sequenceLength + 1
mas01cr@456 147 //
mas01cr@456 148 // post-conditions:
mas01cr@456 149 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01cr@456 150
mas01cr@456 151 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01cr@456 152 assert(idx<(*vv).size());
mas01cr@456 153 vector<float>::iterator ve = (*vv)[idx].end();
mas01cr@456 154 vector<float>::iterator vi = (*vv)[idx].begin();
mas01cr@456 155 // First feature vector in shingle
mas01cr@456 156 if(idx == 0) {
mas01cr@456 157 while(vi!=ve) {
mas01cr@456 158 *vi++ = (float)(*fvp++);
mas01cr@456 159 }
mas01cr@456 160 } else {
mas01cr@456 161 // Not first feature vector in shingle
mas01cr@456 162 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
mas01cr@456 163 // Previous seqLen-1 dim-vectors
mas01cr@456 164 while(vi!=ve-dim) {
mas01cr@456 165 *vi++ = *ui++;
mas01cr@456 166 }
mas01cr@456 167 // Move data pointer to next feature vector
mas01cr@456 168 fvp += ( seqLen + idx - 1 ) * dim ;
mas01cr@456 169 // New d-vector
mas01cr@456 170 while(vi!=ve) {
mas01cr@456 171 *vi++ = (float)(*fvp++);
mas01cr@456 172 }
mas01cr@456 173 }
mas01cr@456 174 }
mas01cr@456 175
mas01cr@456 176 // norm shingles
mas01cr@456 177 // in-place norming, no deletions
mas01cr@456 178 // If using power, return number of shingles above power threshold
mas01cr@459 179 int audiodb_index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) {
mas01cr@456 180 int z = 0; // number of above-threshold shingles
mas01cr@456 181 float l2norm;
mas01cr@456 182 double power;
mas01cr@456 183 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01cr@456 184 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01cr@459 185 Uns32T shingleSize = seqLen * dim;
mas01cr@459 186
mas01cr@459 187 if(!spp) {
mas01cr@459 188 return -1;
mas01cr@459 189 }
mas01cr@456 190 for(Uns32T a=0; a<(*vv).size(); a++){
mas01cr@456 191 l2norm = (float)(*snp++);
mas01cr@459 192 if(normed_vectors)
mas01cr@456 193 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01cr@456 194
mas01cr@456 195 for(Uns32T b=0; b < shingleSize ; b++)
mas01cr@456 196 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01cr@456 197
mas01cr@456 198 power = *spp++;
mas01cr@459 199 if(use_pthreshold){
mas01cr@459 200 if (power >= pthreshold)
mas01cr@456 201 z++;
mas01cr@456 202 }
mas01cr@456 203 else
mas01cr@456 204 z++;
mas01cr@456 205 }
mas01cr@456 206 return z;
mas01cr@456 207 }
mas01cr@456 208
mas01cr@456 209
mas01mc@292 210 /************************ LSH indexing ***********************************/
mas01mc@292 211 void audioDB::index_index_db(const char* dbName){
mas01mc@292 212 char* newIndexName;
mas01mc@292 213 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 214 Uns32T dbVectors = 0;
mas01mc@292 215
mas01mc@324 216
mas01mc@292 217 printf("INDEX: initializing header\n");
mas01mc@292 218 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 219 forWrite = false;
mas01mc@292 220 initDBHeader(dbName);
mas01mc@292 221
mas01mc@324 222 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 223 usingPower = true;
mas01mc@324 224
mas01mc@324 225 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 226 usingTimes = true;
mas01mc@324 227
mas01cr@460 228 newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@460 229 if(!newIndexName) {
mas01cr@460 230 error("failed to get index name", dbName);
mas01cr@460 231 }
mas01mc@292 232
mas01mc@292 233 // Set unit norming flag override
mas01mc@292 234 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 235
mas01mc@327 236 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 237 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 238 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 239 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 240 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 241 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 242 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 243 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 244 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 245 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 246
mas01mc@292 247 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 248 printf("INDEX: constructing new LSH index\n");
mas01mc@292 249 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 250 fflush(stdout);
mas01mc@292 251 // Construct new LSH index
mas01mc@292 252 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 253 lsh_param_m,
mas01mc@292 254 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 255 lsh_param_N,
mas01mc@292 256 lsh_param_ncols,
mas01mc@292 257 (float)radius);
mas01mc@292 258 assert(lsh);
mas01mc@292 259
mas01mc@292 260 Uns32T endTrack = lsh_param_b;
mas01mc@292 261 if( endTrack > dbH->numFiles)
mas01mc@292 262 endTrack = dbH->numFiles;
mas01mc@292 263 // Insert up to lsh_param_b tracks
mas01mc@324 264 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 265 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 266 }
mas01mc@324 267 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 268 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 269
mas01mc@292 270 // Clean up
mas01mc@292 271 delete lsh;
mas01mc@308 272 lsh = 0;
mas01cr@465 273 } else {
mas01mc@292 274 close(lshfid);
mas01mc@292 275 }
mas01mc@292 276
mas01mc@292 277 // Attempt to open LSH file
mas01mc@292 278 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 279 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 280 fflush(stdout);
mas01mc@340 281 char* mergeIndexName = newIndexName;
mas01mc@292 282
mas01mc@292 283 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 284 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 285 assert(lsh);
mas01cr@458 286 Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
mas01mc@292 287 delete lsh;
mas01mc@308 288 lsh = 0;
mas01mc@292 289
mas01mc@340 290 // Insert up to lsh_param_b tracks
mas01mc@340 291 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 292 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 293 }
mas01mc@292 294 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 295 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 296
mas01mc@292 297 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 298 if( endTrack > dbH->numFiles)
mas01mc@292 299 endTrack = dbH->numFiles;
mas01mc@292 300 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 301 fflush(stdout);
mas01mc@340 302 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 303 assert(lsh);
mas01mc@292 304
mas01mc@292 305 // Insert up to lsh_param_b database tracks
mas01mc@292 306 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 307
mas01mc@340 308 // Serialize to file (merging is performed here)
mas01mc@340 309 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 310 delete lsh;
mas01mc@308 311 lsh = 0;
mas01mc@340 312 }
mas01mc@292 313
mas01mc@292 314 close(lshfid);
mas01mc@292 315 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 316 fflush(stdout);
mas01mc@292 317
mas01mc@292 318 }
mas01mc@292 319 else{
mas01mc@292 320 error("Something's wrong with LSH index file");
mas01mc@292 321 exit(1);
mas01mc@292 322 }
mas01mc@292 323
mas01mc@324 324 delete[] newIndexName;
mas01mc@324 325 delete[] sNorm;
mas01mc@324 326 delete[] sPower;
mas01mc@324 327 }
mas01mc@292 328
mas01mc@292 329
mas01cr@405 330 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@405 331 if(usingPower){
mas01cr@405 332 int one;
mas01cr@405 333 unsigned int count;
mas01cr@405 334
mas01cr@405 335 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@405 336 if (count != sizeof(unsigned int)) {
mas01cr@405 337 error("powerfd read failed", "int", "read");
mas01cr@405 338 }
mas01cr@405 339 if (one != 1) {
mas01cr@405 340 error("dimensionality of power file not 1", powerFileName);
mas01cr@405 341 }
mas01cr@405 342
mas01cr@405 343 // FIXME: should check that the powerfile is the right size for
mas01cr@405 344 // this. -- CSR, 2007-10-30
mas01cr@405 345 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@405 346 if (count != numVectors * sizeof(double)) {
mas01cr@405 347 error("powerfd read failed", "double", "read");
mas01cr@405 348 }
mas01cr@405 349 }
mas01cr@405 350 }
mas01cr@405 351
mas01mc@324 352 // initialize auxillary track data from filesystem
mas01mc@324 353 // pre-conditions:
mas01mc@324 354 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 355 // feature data allocated and copied (fvp)
mas01mc@324 356 //
mas01mc@324 357 // post-conditions:
mas01mc@324 358 // allocated power data
mas01mc@324 359 // allocated l2norm data
mas01mc@324 360 //
mas01mc@324 361 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 362 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 363 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 364
mas01mc@324 365 // Allocate and read the power sequence
mas01mc@324 366 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 367
mas01mc@324 368 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 369 char* tmpStr = prefixedString;
mas01mc@324 370 // Open and check dimensions of power file
mas01mc@324 371 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 372 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 373 if(prefixedString!=tmpStr)
mas01mc@324 374 delete[] tmpStr;
mas01mc@324 375 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 376 if (powerfd < 0) {
mas01mc@324 377 error("failed to open power file", prefixedString);
mas01mc@324 378 }
mas01mc@324 379 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 380 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 381 }
mas01mc@324 382
mas01mc@324 383 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 384 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 385
mas01mc@324 386 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 387 assert(*sPowerp);
mas01mc@324 388 *spPtrp = *sPowerp;
mas01mc@324 389 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 390 if (0 < powerfd) {
mas01mc@324 391 close(powerfd);
mas01mc@324 392 }
mas01mc@324 393
mas01cr@427 394 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@427 395 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 396 powerTable = 0;
mas01mc@324 397
mas01mc@324 398 // Allocate and calculate the l2norm sequence
mas01mc@324 399 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 400 assert(*sNormpp);
mas01mc@324 401 *snPtrp = *sNormpp;
mas01cr@426 402 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@427 403 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@427 404 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 405 }
mas01mc@292 406 }
mas01mc@292 407
mas01mc@292 408 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 409 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 410 double** sPowerp, double** spPtrp){
mas01mc@292 411 size_t nfv = 0;
mas01mc@292 412 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 413 Uns32T trackID = 0;
mas01mc@292 414
mas01mc@292 415 VERB_LOG(1, "indexing tracks...");
mas01mc@292 416
mas01mc@324 417 int trackfd = dbfid;
mas01mc@292 418 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 419 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 420 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 421 char* tmpStr = prefixedString;
mas01mc@324 422 // Open and check dimensions of feature file
mas01mc@324 423 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 424 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 425 if(prefixedString!=tmpStr)
mas01mc@324 426 delete[] tmpStr;
mas01cr@454 427 initInputFile(prefixedString);
mas01mc@324 428 trackfd = infid;
mas01mc@324 429 }
mas01cr@433 430 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@433 431 error("failed to read data");
mas01mc@292 432 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 433
mas01mc@324 434 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 435 // Load power and calculate power and l2norm sequence sums
mas01mc@324 436 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 437
mas01mc@292 438 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 439 break;
mas01mc@324 440 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 441 close(infid);
mas01mc@324 442 delete[] *sNormpp;
mas01mc@324 443 delete[] *sPowerp;
mas01mc@324 444 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 445 }
mas01mc@324 446 } // end for(trackID = start_track ; ... )
mas01mc@292 447 std::cout << "finished inserting." << endl;
mas01mc@292 448 }
mas01mc@292 449
mas01mc@292 450 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 451 // Loop over the current input track's vectors
mas01cr@305 452 Uns32T numVecs = 0;
mas01cr@305 453 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 454 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 455 numVecs = 0;
mas01cr@305 456 } else {
mas01cr@305 457 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 458 }
mas01cr@305 459 } else {
mas01cr@305 460 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 461 numVecs = 0;
mas01cr@305 462 } else {
mas01cr@305 463 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 464 }
mas01cr@305 465 }
mas01mc@292 466
mas01mc@324 467 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 468 if(numVecs){
mas01cr@459 469 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324 470
mas01mc@324 471 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@456 472 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@459 473 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@459 474 if(vcount == -1) {
mas01cr@459 475 audiodb_index_delete_shingles(vv);
mas01cr@459 476 error("failed to norm shingles");
mas01cr@459 477 }
mas01cr@459 478 numVecsAboveThreshold = vcount;
mas01mc@324 479 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@459 480 audiodb_index_delete_shingles(vv);
mas01mc@324 481 }
mas01cr@459 482
mas01mc@292 483 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 484
mas01cr@459 485 /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292 486 sequence, which is right, but the space allocated is for the
mas01mc@292 487 whole track. */
mas01mc@292 488
mas01mc@292 489 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 490 * So let's be certain the pointers are in the correct place
mas01mc@292 491 */
mas01mc@292 492
mas01mc@324 493 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 494 *snpp += trackTable[trackID];
mas01mc@324 495 *sppp += trackTable[trackID];
mas01mc@324 496 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 497 }
mas01mc@292 498
mas01mc@292 499 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 500 std::cout.flush();
mas01mc@292 501 return true;
mas01mc@292 502 }
mas01mc@292 503
mas01mc@292 504 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 505 Uns32T collisionCount = 0;
mas01mc@292 506 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 507 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 508 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01cr@458 509 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
mas01mc@324 510 spp+=sequenceHop;
mas01mc@311 511 }
mas01mc@292 512 return collisionCount;
mas01mc@292 513 }
mas01mc@292 514
mas01mc@292 515 /*********************** LSH retrieval ****************************/
mas01mc@292 516
mas01mc@292 517
mas01mc@292 518 // return true if indexed query performed else return false
mas01cr@473 519 int audiodb_index_init_query(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate, bool corep) {
mas01mc@292 520
mas01cr@465 521 uint32_t sequence_length = spec->qid.sequence_length;
mas01cr@465 522 double radius = spec->refine.radius;
mas01cr@465 523 if(!(audiodb_index_exists(adb->path, radius, sequence_length)))
mas01mc@292 524 return false;
mas01mc@292 525
mas01cr@465 526 char *indexName = audiodb_index_get_name(adb->path, radius, sequence_length);
mas01cr@460 527 if(!indexName) {
mas01cr@465 528 return false;
mas01cr@460 529 }
mas01mc@292 530
mas01cr@465 531 qstate->lsh = audiodb_index_allocate(adb, indexName, corep);
mas01cr@465 532
mas01cr@465 533 /* FIXME: it would be nice if the LSH library didn't make me do
mas01cr@465 534 * this. */
mas01cr@465 535 if((!corep) && (qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2)) {
mas01cr@465 536 delete qstate->lsh;
mas01cr@465 537 qstate->lsh = audiodb_index_allocate(adb, indexName, true);
mas01mc@292 538 }
mas01mc@292 539
mas01mc@292 540 delete[] indexName;
mas01mc@292 541 return true;
mas01mc@292 542 }
mas01mc@292 543
mas01cr@458 544 void audiodb_index_add_point_approximate(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@458 545 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@458 546 adb_t *adb = data->adb;
mas01cr@458 547 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@458 548 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@458 549 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@458 550 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@458 551 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@458 552 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@424 553 adb_result_t r;
mas01cr@458 554 r.key = (*adb->keys)[trackID].c_str();
mas01cr@424 555 r.dist = dist;
mas01cr@424 556 r.qpos = qpos;
mas01cr@424 557 r.ipos = spos;
mas01cr@458 558 qstate->accumulator->add_point(&r);
mas01cr@424 559 }
mas01mc@292 560 }
mas01mc@292 561
mas01cr@463 562 // Maintain a queue of points to pass to audiodb_query_queue_loop()
mas01cr@463 563 // for exact evaluation
mas01cr@458 564 void audiodb_index_add_point_exact(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@458 565 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@458 566 adb_t *adb = data->adb;
mas01cr@458 567 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@458 568 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@458 569 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@458 570 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@458 571 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@458 572 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@458 573 PointPair p(trackID, qpos, spos);
mas01cr@458 574 qstate->exact_evaluation_queue->push(p);
mas01cr@458 575 }
mas01mc@292 576 }
mas01mc@292 577
mas01cr@466 578 // return -1 on error
mas01mc@292 579 // return 0: if index does not exist
mas01mc@292 580 // return nqv: if index exists
mas01cr@473 581 int audiodb_index_query_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
mas01mc@292 582
mas01mc@324 583 double *query = 0, *query_data = 0;
mas01cr@437 584 adb_qpointers_internal_t qpointers = {0};
mas01cr@437 585
mas01cr@458 586 adb_qcallback_t callback_data;
mas01cr@458 587 callback_data.adb = adb;
mas01cr@458 588 callback_data.qstate = qstate;
mas01cr@458 589
mas01cr@466 590 void (*add_point_func)(void *, uint32_t, uint32_t, float);
mas01mc@292 591
mas01cr@466 592 uint32_t sequence_length = spec->qid.sequence_length;
mas01cr@466 593 bool normalized = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED);
mas01cr@466 594 double radius = spec->refine.radius;
mas01cr@466 595 bool use_absolute_threshold = spec->refine.flags & ADB_REFINE_ABSOLUTE_THRESHOLD;
mas01cr@466 596 double absolute_threshold = spec->refine.absolute_threshold;
mas01cr@431 597
mas01cr@468 598 if(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES) {
mas01cr@468 599 add_point_func = &audiodb_index_add_point_approximate;
mas01cr@468 600 } else {
mas01cr@458 601 qstate->exact_evaluation_queue = new std::priority_queue<PointPair>;
mas01cr@458 602 add_point_func = &audiodb_index_add_point_exact;
mas01mc@292 603 }
mas01mc@292 604
mas01cr@468 605 /* FIXME: this hardwired lsh_in_core is here to allow for a
mas01cr@468 606 * transition period while the need for the argument is worked
mas01cr@468 607 * through. Hopefully it will disappear again eventually. */
mas01cr@468 608 bool lsh_in_core = true;
mas01cr@468 609
mas01cr@465 610 if(!audiodb_index_init_query(adb, spec, qstate, lsh_in_core)) {
mas01mc@292 611 return 0;
mas01cr@465 612 }
mas01mc@292 613
mas01cr@466 614 char *database = audiodb_index_get_name(adb->path, radius, sequence_length);
mas01cr@460 615 if(!database) {
mas01cr@466 616 return -1;
mas01cr@460 617 }
mas01mc@292 618
mas01cr@444 619 if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
mas01cr@466 620 delete [] database;
mas01cr@466 621 return -1;
mas01cr@444 622 }
mas01mc@292 623
mas01cr@466 624 uint32_t Nq = (qpointers.nvectors > O2_MAXTRACKLEN ? O2_MAXTRACKLEN : qpointers.nvectors) - sequence_length + 1;
mas01cr@466 625 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequence_length);
mas01cr@447 626
mas01mc@292 627 // Construct shingles from query features
mas01cr@468 628 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
mas01cr@466 629 audiodb_index_make_shingle(vv, pointID, query, adb->header->dim, sequence_length);
mas01cr@468 630 }
mas01mc@292 631
mas01mc@292 632 // Normalize query vectors
mas01cr@466 633 int vcount = audiodb_index_norm_shingles(vv, qpointers.l2norm, qpointers.power, adb->header->dim, sequence_length, radius, normalized, use_absolute_threshold, absolute_threshold);
mas01cr@459 634 if(vcount == -1) {
mas01cr@459 635 audiodb_index_delete_shingles(vv);
mas01cr@466 636 delete [] database;
mas01cr@466 637 return -1;
mas01cr@459 638 }
mas01cr@466 639 uint32_t numVecsAboveThreshold = vcount;
mas01mc@292 640
mas01mc@292 641 // Nq contains number of inspected points in query file,
mas01mc@292 642 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01cr@466 643 double *qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation
mas01cr@468 644 if(!(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) && numVecsAboveThreshold) {
mas01cr@466 645 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@466 646 qstate->lsh->retrieve_point((*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
mas01cr@466 647 } else {
mas01cr@466 648 qstate->lsh->serial_retrieve_point(database, (*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
mas01cr@466 649 }
mas01cr@466 650 } else if(numVecsAboveThreshold) {
mas01cr@466 651 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
mas01cr@370 652 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@466 653 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@465 654 qstate->lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 655 } else {
mas01cr@465 656 qstate->lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 657 }
mas01cr@370 658 }
mas01cr@466 659 }
mas01cr@466 660 }
mas01cr@459 661 audiodb_index_delete_shingles(vv);
mas01mc@292 662
mas01cr@468 663 if(!(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES)) {
mas01cr@463 664 audiodb_query_queue_loop(adb, spec, qstate, query, &qpointers);
mas01cr@466 665 }
mas01mc@292 666
mas01mc@292 667 // Clean up
mas01mc@292 668 if(query_data)
mas01mc@292 669 delete[] query_data;
mas01cr@437 670 if(qpointers.l2norm_data)
mas01cr@437 671 delete[] qpointers.l2norm_data;
mas01cr@437 672 if(qpointers.power_data)
mas01cr@437 673 delete[] qpointers.power_data;
mas01cr@437 674 if(qpointers.mean_duration)
mas01cr@437 675 delete[] qpointers.mean_duration;
mas01mc@292 676 if(database)
mas01mc@292 677 delete[] database;
mas01cr@465 678 if(qstate->lsh != adb->cached_lsh)
mas01cr@465 679 delete qstate->lsh;
mas01mc@292 680
mas01mc@292 681 return Nq;
mas01mc@292 682 }
mas01mc@292 683