annotate index.cpp @ 770:c54bc2ffbf92 tip

update tags
author convert-repo
date Fri, 16 Dec 2011 11:34:01 +0000
parents 57e459f62788
children
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01mc@292 15
mas01cr@509 16 /******* LSH indexing audioDB database access forall s \in {S} *******/
mas01mc@292 17
mas01mc@292 18 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 19 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 20 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 21 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 22 }
mas01mc@324 23
mas01mc@325 24 double *snpp = 0, *sppp = 0;
mas01mc@324 25
mas01mc@292 26 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 27 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 28 snpp = *snp;
mas01mc@292 29 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 30 sppp = *spp;
mas01mc@325 31
mas01mc@324 32 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 33 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 34
mas01mc@324 35
mas01mc@292 36 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 37 if(trackTable[i] >= sequenceLength) {
mas01cr@498 38 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@498 39 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 40
mas01cr@498 41 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@498 42 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 43 }
mas01mc@292 44 snpp += trackTable[i];
mas01mc@292 45 sppp += trackTable[i];
mas01mc@292 46 }
mas01mc@324 47
mas01mc@292 48 *vsnp = *snp;
mas01mc@292 49 *vspp = *spp;
mas01mc@324 50
mas01mc@292 51 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 52 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 53 }
mas01mc@292 54
mas01mc@292 55 /************************ LSH indexing ***********************************/
mas01mc@292 56 void audioDB::index_index_db(const char* dbName){
mas01mc@292 57 char* newIndexName;
mas01mc@292 58 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 59 Uns32T dbVectors = 0;
mas01mc@292 60
mas01mc@324 61
mas01mc@292 62 printf("INDEX: initializing header\n");
mas01mc@292 63 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 64 forWrite = false;
mas01mc@292 65 initDBHeader(dbName);
mas01mc@292 66
mas01mc@324 67 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 68 usingPower = true;
mas01mc@324 69
mas01mc@324 70 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 71 usingTimes = true;
mas01mc@324 72
mas01cr@498 73 newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@498 74 if(!newIndexName) {
mas01cr@498 75 error("failed to get index name", dbName);
mas01cr@498 76 }
mas01mc@292 77
mas01mc@292 78 // Set unit norming flag override
mas01mc@292 79 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 80
mas01mc@327 81 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 82 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 83 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 84 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 85 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 86 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 87 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 88 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 89 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 90 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 91
mas01mc@292 92 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 93 printf("INDEX: constructing new LSH index\n");
mas01mc@292 94 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 95 fflush(stdout);
mas01mc@292 96 // Construct new LSH index
mas01mc@292 97 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 98 lsh_param_m,
mas01mc@292 99 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 100 lsh_param_N,
mas01mc@292 101 lsh_param_ncols,
mas01mc@292 102 (float)radius);
mas01mc@292 103 assert(lsh);
mas01mc@292 104
mas01mc@292 105 Uns32T endTrack = lsh_param_b;
mas01mc@292 106 if( endTrack > dbH->numFiles)
mas01mc@292 107 endTrack = dbH->numFiles;
mas01mc@292 108 // Insert up to lsh_param_b tracks
mas01mc@324 109 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 110 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 111 }
mas01mc@324 112 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 113 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 114
mas01mc@292 115 // Clean up
mas01mc@292 116 delete lsh;
mas01mc@308 117 lsh = 0;
mas01cr@498 118 } else {
mas01mc@292 119 close(lshfid);
mas01mc@292 120 }
mas01mc@292 121
mas01mc@292 122 // Attempt to open LSH file
mas01mc@292 123 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 124 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 125 fflush(stdout);
mas01mc@340 126 char* mergeIndexName = newIndexName;
mas01mc@292 127
mas01mc@292 128 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 129 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 130 assert(lsh);
mas01mc@534 131 Uns32T maxs = audiodb_index_to_track_id(adb, lsh->get_maxp())+1;
mas01mc@292 132 delete lsh;
mas01mc@308 133 lsh = 0;
mas01mc@292 134
mas01mc@340 135 // Insert up to lsh_param_b tracks
mas01mc@340 136 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 137 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 138 }
mas01mc@292 139 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 140 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 141
mas01mc@292 142 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 143 if( endTrack > dbH->numFiles)
mas01mc@292 144 endTrack = dbH->numFiles;
mas01mc@292 145 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 146 fflush(stdout);
mas01mc@340 147 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 148 assert(lsh);
mas01mc@292 149
mas01mc@292 150 // Insert up to lsh_param_b database tracks
mas01mc@292 151 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 152
mas01mc@340 153 // Serialize to file (merging is performed here)
mas01mc@340 154 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 155 delete lsh;
mas01mc@308 156 lsh = 0;
mas01mc@340 157 }
mas01mc@292 158
mas01mc@292 159 close(lshfid);
mas01mc@292 160 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 161 fflush(stdout);
mas01mc@292 162
mas01mc@292 163 }
mas01mc@292 164 else{
mas01mc@292 165 error("Something's wrong with LSH index file");
mas01mc@292 166 exit(1);
mas01mc@292 167 }
mas01mc@292 168
mas01mc@324 169 delete[] newIndexName;
mas01mc@324 170 delete[] sNorm;
mas01mc@324 171 delete[] sPower;
mas01mc@324 172 }
mas01mc@292 173
mas01mc@292 174
mas01cr@498 175 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@498 176 if(usingPower){
mas01cr@498 177 int one;
mas01cr@498 178 unsigned int count;
mas01cr@498 179
mas01cr@498 180 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@498 181 if (count != sizeof(unsigned int)) {
mas01cr@498 182 error("powerfd read failed", "int", "read");
mas01cr@498 183 }
mas01cr@498 184 if (one != 1) {
mas01cr@498 185 error("dimensionality of power file not 1", powerFileName);
mas01cr@498 186 }
mas01cr@498 187
mas01cr@498 188 // FIXME: should check that the powerfile is the right size for
mas01cr@498 189 // this. -- CSR, 2007-10-30
mas01cr@498 190 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@498 191 if (count != numVectors * sizeof(double)) {
mas01cr@498 192 error("powerfd read failed", "double", "read");
mas01cr@498 193 }
mas01cr@498 194 }
mas01cr@498 195 }
mas01cr@498 196
mas01mc@324 197 // initialize auxillary track data from filesystem
mas01mc@324 198 // pre-conditions:
mas01mc@324 199 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 200 // feature data allocated and copied (fvp)
mas01mc@324 201 //
mas01mc@324 202 // post-conditions:
mas01mc@324 203 // allocated power data
mas01mc@324 204 // allocated l2norm data
mas01mc@324 205 //
mas01mc@324 206 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 207 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 208 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 209
mas01mc@324 210 // Allocate and read the power sequence
mas01mc@324 211 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 212
mas01mc@324 213 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 214 char* tmpStr = prefixedString;
mas01mc@324 215 // Open and check dimensions of power file
mas01mc@324 216 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 217 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 218 if(prefixedString!=tmpStr)
mas01mc@324 219 delete[] tmpStr;
mas01mc@324 220 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 221 if (powerfd < 0) {
mas01mc@324 222 error("failed to open power file", prefixedString);
mas01mc@324 223 }
mas01mc@324 224 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 225 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 226 }
mas01mc@324 227
mas01mc@324 228 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 229 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 230
mas01mc@324 231 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 232 assert(*sPowerp);
mas01mc@324 233 *spPtrp = *sPowerp;
mas01mc@324 234 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 235 if (0 < powerfd) {
mas01mc@324 236 close(powerfd);
mas01mc@324 237 }
mas01mc@324 238
mas01cr@498 239 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@498 240 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 241 powerTable = 0;
mas01mc@324 242
mas01mc@324 243 // Allocate and calculate the l2norm sequence
mas01mc@324 244 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 245 assert(*sNormpp);
mas01mc@324 246 *snPtrp = *sNormpp;
mas01cr@498 247 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@498 248 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@498 249 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 250 }
mas01mc@292 251 }
mas01mc@292 252
mas01mc@292 253 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 254 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 255 double** sPowerp, double** spPtrp){
mas01mc@292 256 size_t nfv = 0;
mas01mc@292 257 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 258 Uns32T trackID = 0;
mas01mc@292 259
mas01mc@292 260 VERB_LOG(1, "indexing tracks...");
mas01mc@292 261
mas01mc@324 262 int trackfd = dbfid;
mas01mc@292 263 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 264 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 265 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 266 char* tmpStr = prefixedString;
mas01mc@324 267 // Open and check dimensions of feature file
mas01mc@324 268 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 269 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 270 if(prefixedString!=tmpStr)
mas01mc@324 271 delete[] tmpStr;
mas01cr@498 272 initInputFile(prefixedString);
mas01mc@324 273 trackfd = infid;
mas01mc@324 274 }
mas01cr@498 275 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@498 276 error("failed to read data");
mas01mc@292 277 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 278
mas01mc@324 279 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 280 // Load power and calculate power and l2norm sequence sums
mas01mc@324 281 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 282
mas01mc@292 283 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 284 break;
mas01mc@324 285 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 286 close(infid);
mas01mc@324 287 delete[] *sNormpp;
mas01mc@324 288 delete[] *sPowerp;
mas01mc@324 289 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 290 }
mas01mc@324 291 } // end for(trackID = start_track ; ... )
mas01mc@292 292 std::cout << "finished inserting." << endl;
mas01mc@292 293 }
mas01mc@292 294
mas01mc@292 295 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 296 // Loop over the current input track's vectors
mas01cr@305 297 Uns32T numVecs = 0;
mas01mc@534 298
mas01mc@534 299 if (trackTable[trackID] < sequenceLength - 1) {
mas01mc@534 300 numVecs = 0;
mas01cr@305 301 } else {
mas01mc@534 302 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 303 }
mas01mc@534 304
mas01mc@292 305
mas01mc@324 306 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 307 if(numVecs){
mas01cr@498 308 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324 309
mas01mc@324 310 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@498 311 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@498 312 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@498 313 if(vcount == -1) {
mas01cr@498 314 audiodb_index_delete_shingles(vv);
mas01cr@498 315 error("failed to norm shingles");
mas01cr@498 316 }
mas01cr@498 317 numVecsAboveThreshold = vcount;
mas01mc@324 318 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@498 319 audiodb_index_delete_shingles(vv);
mas01mc@324 320 }
mas01cr@498 321
mas01mc@292 322 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 323
mas01cr@498 324 /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292 325 sequence, which is right, but the space allocated is for the
mas01mc@292 326 whole track. */
mas01mc@292 327
mas01mc@292 328 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 329 * So let's be certain the pointers are in the correct place
mas01mc@292 330 */
mas01mc@292 331
mas01mc@324 332 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 333 *snpp += trackTable[trackID];
mas01mc@324 334 *sppp += trackTable[trackID];
mas01mc@324 335 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 336 }
mas01mc@292 337
mas01mc@292 338 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 339 std::cout.flush();
mas01mc@292 340 return true;
mas01mc@292 341 }
mas01mc@292 342
mas01mc@292 343 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 344 Uns32T collisionCount = 0;
mas01mc@292 345 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 346 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 347 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01mc@534 348 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(adb, trackID, pointID));
mas01mc@324 349 spp+=sequenceHop;
mas01mc@311 350 }
mas01mc@292 351 return collisionCount;
mas01mc@292 352 }