annotate index.cpp @ 507:e7fd50483311

Free bits of the datum constructed in audioDB::query. We're not quite safe: error calls between allocation of some of these bits and pieces and their use will cause failure... but not freeing things here is definitely wrong.
author mas01cr
date Tue, 13 Jan 2009 21:37:10 +0000
parents 342822c2d49a
children cc2b97d020b1
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01cr@498 15 #include "audioDB-internals.h"
mas01mc@292 16
mas01cr@498 17 typedef struct adb_qcallback {
mas01cr@498 18 adb_t *adb;
mas01cr@498 19 adb_qstate_internal_t *qstate;
mas01cr@498 20 } adb_qcallback_t;
mas01mc@292 21
mas01mc@292 22 /************************* LSH indexing and query initialization *****************/
mas01mc@292 23
mas01cr@498 24 /* FIXME: there are several things wrong with this: the memory
mas01cr@498 25 * discipline isn't ideal, the radius printing is a bit lame, the name
mas01cr@498 26 * getting will succeed or fail depending on whether the path was
mas01cr@498 27 * relative or absolute -- but most importantly encoding all that
mas01cr@498 28 * information in a filename is going to lose: it's impossible to
mas01cr@498 29 * maintain backwards-compatibility. Instead we should probably store
mas01cr@498 30 * the index metadata inside the audiodb instance. */
mas01cr@498 31 char *audiodb_index_get_name(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@498 32 char *indexName;
mas01cr@498 33 if(strlen(dbName) > (MAXSTR - 32)) {
mas01cr@498 34 return NULL;
mas01cr@498 35 }
mas01cr@498 36 indexName = new char[MAXSTR];
mas01mc@292 37 strncpy(indexName, dbName, MAXSTR);
mas01mc@292 38 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01mc@292 39 return indexName;
mas01mc@292 40 }
mas01mc@292 41
mas01cr@498 42 bool audiodb_index_exists(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@498 43 char *indexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@498 44 if(!indexName) {
mas01cr@498 45 return false;
mas01cr@498 46 }
mas01cr@498 47 struct stat st;
mas01cr@498 48 if(stat(indexName, &st)) {
mas01cr@498 49 delete [] indexName;
mas01cr@498 50 return false;
mas01cr@498 51 }
mas01cr@498 52 /* FIXME: other stat checks here? */
mas01cr@498 53 /* FIXME: is there any better way to check whether we can open a
mas01cr@498 54 * file for reading than by opening a file for reading? */
mas01cr@498 55 int fd = open(indexName, O_RDONLY);
mas01cr@498 56 delete [] indexName;
mas01cr@498 57 if(fd < 0) {
mas01cr@498 58 return false;
mas01cr@498 59 } else {
mas01cr@498 60 close(fd);
mas01mc@292 61 return true;
mas01cr@498 62 }
mas01mc@292 63 }
mas01mc@292 64
mas01cr@498 65 /* FIXME: the indexName arg should be "const char *", but the LSH
mas01cr@498 66 * library doesn't like that.
mas01cr@498 67 */
mas01cr@498 68 LSH *audiodb_index_allocate(adb_t *adb, char *indexName, bool load_tables) {
mas01cr@498 69 LSH *lsh;
mas01cr@498 70 if(adb->cached_lsh) {
mas01cr@498 71 if(!strncmp(adb->cached_lsh->get_indexName(), indexName, MAXSTR)) {
mas01cr@498 72 return adb->cached_lsh;
mas01cr@498 73 } else {
mas01cr@498 74 delete adb->cached_lsh;
mas01cr@498 75 }
mas01mc@308 76 }
mas01cr@498 77 lsh = new LSH(indexName, load_tables);
mas01cr@498 78 if(load_tables) {
mas01cr@498 79 adb->cached_lsh = lsh;
mas01cr@498 80 }
mas01cr@498 81 return lsh;
mas01mc@308 82 }
mas01mc@308 83
mas01cr@498 84 vector<vector<float> > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) {
mas01cr@498 85 std::vector<std::vector<float> > *vv = new vector<vector<float> >(sz);
mas01cr@498 86 for(Uns32T i=0 ; i < sz ; i++) {
mas01cr@498 87 (*vv)[i]=vector<float>(dim * seqLen);
mas01cr@498 88 }
mas01mc@292 89 return vv;
mas01mc@292 90 }
mas01mc@292 91
mas01cr@498 92 void audiodb_index_delete_shingles(vector<vector<float> > *vv) {
mas01cr@498 93 delete vv;
mas01cr@498 94 }
mas01cr@498 95
mas01mc@292 96 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
mas01mc@292 97
mas01mc@292 98 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 99 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 100 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 101 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 102 }
mas01mc@324 103
mas01mc@325 104 double *snpp = 0, *sppp = 0;
mas01mc@324 105
mas01mc@292 106 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 107 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 108 snpp = *snp;
mas01mc@292 109 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 110 sppp = *spp;
mas01mc@325 111
mas01mc@324 112 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 113 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 114
mas01mc@324 115
mas01mc@292 116 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 117 if(trackTable[i] >= sequenceLength) {
mas01cr@498 118 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@498 119 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 120
mas01cr@498 121 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@498 122 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 123 }
mas01mc@292 124 snpp += trackTable[i];
mas01mc@292 125 sppp += trackTable[i];
mas01mc@292 126 }
mas01mc@324 127
mas01mc@292 128 *vsnp = *snp;
mas01mc@292 129 *vspp = *spp;
mas01mc@324 130
mas01mc@292 131 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 132 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 133 }
mas01mc@292 134
mas01mc@292 135
mas01cr@498 136 /********************* LSH shingle construction ***************************/
mas01cr@498 137
mas01cr@498 138 // Construct shingles out of a feature matrix
mas01cr@498 139 // inputs:
mas01cr@498 140 // idx is vector index in feature matrix
mas01cr@498 141 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
mas01cr@498 142 //
mas01cr@498 143 // pre-conditions:
mas01cr@498 144 // dbH->dim
mas01cr@498 145 // sequenceLength
mas01cr@498 146 // idx < numVectors - sequenceLength + 1
mas01cr@498 147 //
mas01cr@498 148 // post-conditions:
mas01cr@498 149 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
mas01cr@498 150
mas01cr@498 151 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01cr@498 152 assert(idx<(*vv).size());
mas01cr@498 153 vector<float>::iterator ve = (*vv)[idx].end();
mas01cr@498 154 vector<float>::iterator vi = (*vv)[idx].begin();
mas01cr@498 155 // First feature vector in shingle
mas01cr@498 156 if(idx == 0) {
mas01cr@498 157 while(vi!=ve) {
mas01cr@498 158 *vi++ = (float)(*fvp++);
mas01cr@498 159 }
mas01cr@498 160 } else {
mas01cr@498 161 // Not first feature vector in shingle
mas01cr@498 162 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
mas01cr@498 163 // Previous seqLen-1 dim-vectors
mas01cr@498 164 while(vi!=ve-dim) {
mas01cr@498 165 *vi++ = *ui++;
mas01cr@498 166 }
mas01cr@498 167 // Move data pointer to next feature vector
mas01cr@498 168 fvp += ( seqLen + idx - 1 ) * dim ;
mas01cr@498 169 // New d-vector
mas01cr@498 170 while(vi!=ve) {
mas01cr@498 171 *vi++ = (float)(*fvp++);
mas01cr@498 172 }
mas01cr@498 173 }
mas01cr@498 174 }
mas01cr@498 175
mas01cr@498 176 // norm shingles
mas01cr@498 177 // in-place norming, no deletions
mas01cr@498 178 // If using power, return number of shingles above power threshold
mas01cr@498 179 int audiodb_index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) {
mas01cr@498 180 int z = 0; // number of above-threshold shingles
mas01cr@498 181 float l2norm;
mas01cr@498 182 double power;
mas01cr@498 183 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01cr@498 184 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01cr@498 185 Uns32T shingleSize = seqLen * dim;
mas01cr@498 186
mas01cr@498 187 if(!spp) {
mas01cr@498 188 return -1;
mas01cr@498 189 }
mas01cr@498 190 for(Uns32T a=0; a<(*vv).size(); a++){
mas01cr@498 191 l2norm = (float)(*snp++);
mas01cr@498 192 if(normed_vectors)
mas01cr@498 193 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01cr@498 194
mas01cr@498 195 for(Uns32T b=0; b < shingleSize ; b++)
mas01cr@498 196 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01cr@498 197
mas01cr@498 198 power = *spp++;
mas01cr@498 199 if(use_pthreshold){
mas01cr@498 200 if (power >= pthreshold)
mas01cr@498 201 z++;
mas01cr@498 202 }
mas01cr@498 203 else
mas01cr@498 204 z++;
mas01cr@498 205 }
mas01cr@498 206 return z;
mas01cr@498 207 }
mas01cr@498 208
mas01cr@498 209
mas01mc@292 210 /************************ LSH indexing ***********************************/
mas01mc@292 211 void audioDB::index_index_db(const char* dbName){
mas01mc@292 212 char* newIndexName;
mas01mc@292 213 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 214 Uns32T dbVectors = 0;
mas01mc@292 215
mas01mc@324 216
mas01mc@292 217 printf("INDEX: initializing header\n");
mas01mc@292 218 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 219 forWrite = false;
mas01mc@292 220 initDBHeader(dbName);
mas01mc@292 221
mas01mc@324 222 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 223 usingPower = true;
mas01mc@324 224
mas01mc@324 225 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 226 usingTimes = true;
mas01mc@324 227
mas01cr@498 228 newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@498 229 if(!newIndexName) {
mas01cr@498 230 error("failed to get index name", dbName);
mas01cr@498 231 }
mas01mc@292 232
mas01mc@292 233 // Set unit norming flag override
mas01mc@292 234 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 235
mas01mc@327 236 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 237 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 238 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 239 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 240 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 241 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 242 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 243 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 244 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 245 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 246
mas01mc@292 247 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 248 printf("INDEX: constructing new LSH index\n");
mas01mc@292 249 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 250 fflush(stdout);
mas01mc@292 251 // Construct new LSH index
mas01mc@292 252 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 253 lsh_param_m,
mas01mc@292 254 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 255 lsh_param_N,
mas01mc@292 256 lsh_param_ncols,
mas01mc@292 257 (float)radius);
mas01mc@292 258 assert(lsh);
mas01mc@292 259
mas01mc@292 260 Uns32T endTrack = lsh_param_b;
mas01mc@292 261 if( endTrack > dbH->numFiles)
mas01mc@292 262 endTrack = dbH->numFiles;
mas01mc@292 263 // Insert up to lsh_param_b tracks
mas01mc@324 264 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 265 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 266 }
mas01mc@324 267 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 268 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 269
mas01mc@292 270 // Clean up
mas01mc@292 271 delete lsh;
mas01mc@308 272 lsh = 0;
mas01cr@498 273 } else {
mas01mc@292 274 close(lshfid);
mas01mc@292 275 }
mas01mc@292 276
mas01mc@292 277 // Attempt to open LSH file
mas01mc@292 278 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 279 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 280 fflush(stdout);
mas01mc@340 281 char* mergeIndexName = newIndexName;
mas01mc@292 282
mas01mc@292 283 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 284 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 285 assert(lsh);
mas01cr@498 286 Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
mas01mc@292 287 delete lsh;
mas01mc@308 288 lsh = 0;
mas01mc@292 289
mas01mc@340 290 // Insert up to lsh_param_b tracks
mas01mc@340 291 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 292 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 293 }
mas01mc@292 294 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 295 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 296
mas01mc@292 297 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 298 if( endTrack > dbH->numFiles)
mas01mc@292 299 endTrack = dbH->numFiles;
mas01mc@292 300 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 301 fflush(stdout);
mas01mc@340 302 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 303 assert(lsh);
mas01mc@292 304
mas01mc@292 305 // Insert up to lsh_param_b database tracks
mas01mc@292 306 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 307
mas01mc@340 308 // Serialize to file (merging is performed here)
mas01mc@340 309 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 310 delete lsh;
mas01mc@308 311 lsh = 0;
mas01mc@340 312 }
mas01mc@292 313
mas01mc@292 314 close(lshfid);
mas01mc@292 315 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 316 fflush(stdout);
mas01mc@292 317
mas01mc@292 318 }
mas01mc@292 319 else{
mas01mc@292 320 error("Something's wrong with LSH index file");
mas01mc@292 321 exit(1);
mas01mc@292 322 }
mas01mc@292 323
mas01mc@324 324 delete[] newIndexName;
mas01mc@324 325 delete[] sNorm;
mas01mc@324 326 delete[] sPower;
mas01mc@324 327 }
mas01mc@292 328
mas01mc@292 329
mas01cr@498 330 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@498 331 if(usingPower){
mas01cr@498 332 int one;
mas01cr@498 333 unsigned int count;
mas01cr@498 334
mas01cr@498 335 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@498 336 if (count != sizeof(unsigned int)) {
mas01cr@498 337 error("powerfd read failed", "int", "read");
mas01cr@498 338 }
mas01cr@498 339 if (one != 1) {
mas01cr@498 340 error("dimensionality of power file not 1", powerFileName);
mas01cr@498 341 }
mas01cr@498 342
mas01cr@498 343 // FIXME: should check that the powerfile is the right size for
mas01cr@498 344 // this. -- CSR, 2007-10-30
mas01cr@498 345 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@498 346 if (count != numVectors * sizeof(double)) {
mas01cr@498 347 error("powerfd read failed", "double", "read");
mas01cr@498 348 }
mas01cr@498 349 }
mas01cr@498 350 }
mas01cr@498 351
mas01mc@324 352 // initialize auxillary track data from filesystem
mas01mc@324 353 // pre-conditions:
mas01mc@324 354 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 355 // feature data allocated and copied (fvp)
mas01mc@324 356 //
mas01mc@324 357 // post-conditions:
mas01mc@324 358 // allocated power data
mas01mc@324 359 // allocated l2norm data
mas01mc@324 360 //
mas01mc@324 361 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 362 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 363 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 364
mas01mc@324 365 // Allocate and read the power sequence
mas01mc@324 366 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 367
mas01mc@324 368 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 369 char* tmpStr = prefixedString;
mas01mc@324 370 // Open and check dimensions of power file
mas01mc@324 371 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 372 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 373 if(prefixedString!=tmpStr)
mas01mc@324 374 delete[] tmpStr;
mas01mc@324 375 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 376 if (powerfd < 0) {
mas01mc@324 377 error("failed to open power file", prefixedString);
mas01mc@324 378 }
mas01mc@324 379 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 380 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 381 }
mas01mc@324 382
mas01mc@324 383 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 384 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 385
mas01mc@324 386 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 387 assert(*sPowerp);
mas01mc@324 388 *spPtrp = *sPowerp;
mas01mc@324 389 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 390 if (0 < powerfd) {
mas01mc@324 391 close(powerfd);
mas01mc@324 392 }
mas01mc@324 393
mas01cr@498 394 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@498 395 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 396 powerTable = 0;
mas01mc@324 397
mas01mc@324 398 // Allocate and calculate the l2norm sequence
mas01mc@324 399 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 400 assert(*sNormpp);
mas01mc@324 401 *snPtrp = *sNormpp;
mas01cr@498 402 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@498 403 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@498 404 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 405 }
mas01mc@292 406 }
mas01mc@292 407
mas01mc@292 408 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 409 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 410 double** sPowerp, double** spPtrp){
mas01mc@292 411 size_t nfv = 0;
mas01mc@292 412 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 413 Uns32T trackID = 0;
mas01mc@292 414
mas01mc@292 415 VERB_LOG(1, "indexing tracks...");
mas01mc@292 416
mas01mc@324 417 int trackfd = dbfid;
mas01mc@292 418 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 419 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 420 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 421 char* tmpStr = prefixedString;
mas01mc@324 422 // Open and check dimensions of feature file
mas01mc@324 423 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 424 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 425 if(prefixedString!=tmpStr)
mas01mc@324 426 delete[] tmpStr;
mas01cr@498 427 initInputFile(prefixedString);
mas01mc@324 428 trackfd = infid;
mas01mc@324 429 }
mas01cr@498 430 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@498 431 error("failed to read data");
mas01mc@292 432 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 433
mas01mc@324 434 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 435 // Load power and calculate power and l2norm sequence sums
mas01mc@324 436 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 437
mas01mc@292 438 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 439 break;
mas01mc@324 440 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 441 close(infid);
mas01mc@324 442 delete[] *sNormpp;
mas01mc@324 443 delete[] *sPowerp;
mas01mc@324 444 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 445 }
mas01mc@324 446 } // end for(trackID = start_track ; ... )
mas01mc@292 447 std::cout << "finished inserting." << endl;
mas01mc@292 448 }
mas01mc@292 449
mas01mc@292 450 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 451 // Loop over the current input track's vectors
mas01cr@305 452 Uns32T numVecs = 0;
mas01cr@305 453 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 454 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 455 numVecs = 0;
mas01cr@305 456 } else {
mas01cr@305 457 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 458 }
mas01cr@305 459 } else {
mas01cr@305 460 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 461 numVecs = 0;
mas01cr@305 462 } else {
mas01cr@305 463 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 464 }
mas01cr@305 465 }
mas01mc@292 466
mas01mc@324 467 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 468 if(numVecs){
mas01cr@498 469 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324 470
mas01mc@324 471 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@498 472 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@498 473 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@498 474 if(vcount == -1) {
mas01cr@498 475 audiodb_index_delete_shingles(vv);
mas01cr@498 476 error("failed to norm shingles");
mas01cr@498 477 }
mas01cr@498 478 numVecsAboveThreshold = vcount;
mas01mc@324 479 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@498 480 audiodb_index_delete_shingles(vv);
mas01mc@324 481 }
mas01cr@498 482
mas01mc@292 483 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 484
mas01cr@498 485 /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292 486 sequence, which is right, but the space allocated is for the
mas01mc@292 487 whole track. */
mas01mc@292 488
mas01mc@292 489 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 490 * So let's be certain the pointers are in the correct place
mas01mc@292 491 */
mas01mc@292 492
mas01mc@324 493 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 494 *snpp += trackTable[trackID];
mas01mc@324 495 *sppp += trackTable[trackID];
mas01mc@324 496 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 497 }
mas01mc@292 498
mas01mc@292 499 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 500 std::cout.flush();
mas01mc@292 501 return true;
mas01mc@292 502 }
mas01mc@292 503
mas01mc@292 504 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 505 Uns32T collisionCount = 0;
mas01mc@292 506 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 507 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 508 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01cr@498 509 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
mas01mc@324 510 spp+=sequenceHop;
mas01mc@311 511 }
mas01mc@292 512 return collisionCount;
mas01mc@292 513 }
mas01mc@292 514
mas01mc@292 515 /*********************** LSH retrieval ****************************/
mas01mc@292 516
mas01mc@292 517
mas01mc@292 518 // return true if indexed query performed else return false
mas01cr@498 519 int audiodb_index_init_query(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate, bool corep) {
mas01mc@292 520
mas01cr@498 521 uint32_t sequence_length = spec->qid.sequence_length;
mas01cr@498 522 double radius = spec->refine.radius;
mas01cr@498 523 if(!(audiodb_index_exists(adb->path, radius, sequence_length)))
mas01mc@292 524 return false;
mas01mc@292 525
mas01cr@498 526 char *indexName = audiodb_index_get_name(adb->path, radius, sequence_length);
mas01cr@498 527 if(!indexName) {
mas01cr@498 528 return false;
mas01mc@292 529 }
mas01mc@292 530
mas01cr@498 531 qstate->lsh = audiodb_index_allocate(adb, indexName, corep);
mas01cr@498 532
mas01cr@498 533 /* FIXME: it would be nice if the LSH library didn't make me do
mas01cr@498 534 * this. */
mas01cr@498 535 if((!corep) && (qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2)) {
mas01cr@498 536 delete qstate->lsh;
mas01cr@498 537 qstate->lsh = audiodb_index_allocate(adb, indexName, true);
mas01mc@308 538 }
mas01mc@292 539
mas01mc@292 540 delete[] indexName;
mas01mc@292 541 return true;
mas01mc@292 542 }
mas01mc@292 543
mas01cr@498 544 void audiodb_index_add_point_approximate(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@498 545 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@498 546 adb_t *adb = data->adb;
mas01cr@498 547 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@498 548 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@498 549 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@498 550 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@498 551 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@498 552 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@498 553 adb_result_t r;
mas01cr@498 554 r.key = (*adb->keys)[trackID].c_str();
mas01cr@498 555 r.dist = dist;
mas01cr@498 556 r.qpos = qpos;
mas01cr@498 557 r.ipos = spos;
mas01cr@498 558 qstate->accumulator->add_point(&r);
mas01cr@498 559 }
mas01mc@292 560 }
mas01mc@292 561
mas01cr@498 562 // Maintain a queue of points to pass to audiodb_query_queue_loop()
mas01cr@498 563 // for exact evaluation
mas01cr@498 564 void audiodb_index_add_point_exact(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
mas01cr@498 565 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
mas01cr@498 566 adb_t *adb = data->adb;
mas01cr@498 567 adb_qstate_internal_t *qstate = data->qstate;
mas01cr@498 568 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
mas01cr@498 569 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
mas01cr@498 570 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
mas01cr@498 571 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
mas01cr@498 572 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
mas01cr@498 573 PointPair p(trackID, qpos, spos);
mas01cr@498 574 qstate->exact_evaluation_queue->push(p);
mas01cr@498 575 }
mas01mc@292 576 }
mas01mc@292 577
mas01cr@498 578 // return -1 on error
mas01mc@292 579 // return 0: if index does not exist
mas01mc@292 580 // return nqv: if index exists
mas01cr@498 581 int audiodb_index_query_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
mas01mc@292 582
mas01mc@324 583 double *query = 0, *query_data = 0;
mas01cr@498 584 adb_qpointers_internal_t qpointers = {0};
mas01mc@414 585
mas01cr@498 586 adb_qcallback_t callback_data;
mas01cr@498 587 callback_data.adb = adb;
mas01cr@498 588 callback_data.qstate = qstate;
mas01mc@292 589
mas01cr@498 590 void (*add_point_func)(void *, uint32_t, uint32_t, float);
mas01cr@498 591
mas01cr@498 592 uint32_t sequence_length = spec->qid.sequence_length;
mas01cr@498 593 bool normalized = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED);
mas01cr@498 594 double radius = spec->refine.radius;
mas01cr@498 595 bool use_absolute_threshold = spec->refine.flags & ADB_REFINE_ABSOLUTE_THRESHOLD;
mas01cr@498 596 double absolute_threshold = spec->refine.absolute_threshold;
mas01cr@498 597
mas01cr@498 598 if(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES) {
mas01cr@498 599 add_point_func = &audiodb_index_add_point_approximate;
mas01cr@498 600 } else {
mas01cr@498 601 qstate->exact_evaluation_queue = new std::priority_queue<PointPair>;
mas01cr@498 602 add_point_func = &audiodb_index_add_point_exact;
mas01cr@498 603 }
mas01cr@498 604
mas01cr@498 605 /* FIXME: this hardwired lsh_in_core is here to allow for a
mas01cr@498 606 * transition period while the need for the argument is worked
mas01cr@498 607 * through. Hopefully it will disappear again eventually. */
mas01cr@498 608 bool lsh_in_core = true;
mas01cr@498 609
mas01cr@498 610 if(!audiodb_index_init_query(adb, spec, qstate, lsh_in_core)) {
mas01mc@292 611 return 0;
mas01cr@498 612 }
mas01mc@292 613
mas01cr@498 614 char *database = audiodb_index_get_name(adb->path, radius, sequence_length);
mas01cr@498 615 if(!database) {
mas01cr@498 616 return -1;
mas01cr@498 617 }
mas01mc@292 618
mas01cr@498 619 if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
mas01cr@498 620 delete [] database;
mas01cr@498 621 return -1;
mas01cr@498 622 }
mas01mc@292 623
mas01cr@498 624 uint32_t Nq = (qpointers.nvectors > O2_MAXTRACKLEN ? O2_MAXTRACKLEN : qpointers.nvectors) - sequence_length + 1;
mas01cr@498 625 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequence_length);
mas01mc@414 626
mas01mc@292 627 // Construct shingles from query features
mas01cr@498 628 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
mas01cr@498 629 audiodb_index_make_shingle(vv, pointID, query, adb->header->dim, sequence_length);
mas01cr@498 630 }
mas01mc@292 631
mas01mc@292 632 // Normalize query vectors
mas01cr@498 633 int vcount = audiodb_index_norm_shingles(vv, qpointers.l2norm, qpointers.power, adb->header->dim, sequence_length, radius, normalized, use_absolute_threshold, absolute_threshold);
mas01cr@498 634 if(vcount == -1) {
mas01cr@498 635 audiodb_index_delete_shingles(vv);
mas01cr@498 636 delete [] database;
mas01cr@498 637 return -1;
mas01cr@498 638 }
mas01cr@498 639 uint32_t numVecsAboveThreshold = vcount;
mas01mc@292 640
mas01mc@292 641 // Nq contains number of inspected points in query file,
mas01mc@292 642 // numVecsAboveThreshold is number of points with power >= absolute_threshold
mas01cr@498 643 double *qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation
mas01cr@498 644 if(!(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) && numVecsAboveThreshold) {
mas01cr@498 645 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@498 646 qstate->lsh->retrieve_point((*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
mas01cr@498 647 } else {
mas01cr@498 648 qstate->lsh->serial_retrieve_point(database, (*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
mas01cr@498 649 }
mas01cr@498 650 } else if(numVecsAboveThreshold) {
mas01cr@498 651 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
mas01cr@370 652 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
mas01cr@498 653 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
mas01cr@498 654 qstate->lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 655 } else {
mas01cr@498 656 qstate->lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, &callback_data);
mas01cr@370 657 }
mas01cr@370 658 }
mas01cr@498 659 }
mas01cr@498 660 }
mas01cr@498 661 audiodb_index_delete_shingles(vv);
mas01mc@292 662
mas01cr@498 663 if(!(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES)) {
mas01cr@498 664 audiodb_query_queue_loop(adb, spec, qstate, query, &qpointers);
mas01cr@498 665 }
mas01mc@292 666
mas01mc@292 667 // Clean up
mas01mc@292 668 if(query_data)
mas01mc@292 669 delete[] query_data;
mas01cr@498 670 if(qpointers.l2norm_data)
mas01cr@498 671 delete[] qpointers.l2norm_data;
mas01cr@498 672 if(qpointers.power_data)
mas01cr@498 673 delete[] qpointers.power_data;
mas01cr@498 674 if(qpointers.mean_duration)
mas01cr@498 675 delete[] qpointers.mean_duration;
mas01mc@292 676 if(database)
mas01mc@292 677 delete[] database;
mas01cr@498 678 if(qstate->lsh != adb->cached_lsh)
mas01cr@498 679 delete qstate->lsh;
mas01mc@292 680
mas01mc@292 681 return Nq;
mas01mc@292 682 }
mas01mc@292 683