annotate index.cpp @ 509:cc2b97d020b1

Code rearrangements to tease apart library code from C++ audioDB code. There should be precisely no functional changes in this commit. Instead, the only thing that has happened is that all the abstraction violation and other horribleness is concentrated in one place: the include of "audioDB-internals.h" in audioDB.h -- the separation will be complete once that include can be removed. This include is necessary because the command-line binary / SOAP server still does some things directly rather than through an API: not least of which the operations that have not yet been integrated into the API yet, but also some messing around with constants, flags and nominally internal functions. The intent is to remove as many of these as possible and think quite hard about the rest. In the meantime, the library is now much more self-contained: the only things it uses are in the audioDB_API.h and audioDB-internals.h headers; thus there are fewer nasty surprises lurking for readers of the code. The Makefile has been adjusted to take advantage of this rearrangement in the dependencies.
author mas01cr
date Thu, 15 Jan 2009 13:57:33 +0000
parents 342822c2d49a
children 06409b6e268f
rev   line source
mas01mc@292 1 // LSH indexing
mas01mc@292 2 //
mas01mc@292 3 // Construct a persistent LSH table structure
mas01mc@292 4 // Store at the same location as dbName
mas01mc@292 5 // Naming convention:
mas01mc@292 6 // dbName.lsh.${radius}.${sequenceLength}
mas01mc@292 7 //
mas01mc@292 8 //
mas01mc@292 9 // Author: Michael Casey
mas01mc@292 10 // Date: 23 June 2008
mas01mc@324 11 //
mas01mc@324 12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
mas01mc@292 13
mas01mc@292 14 #include "audioDB.h"
mas01mc@292 15
mas01cr@509 16 /******* LSH indexing audioDB database access forall s \in {S} *******/
mas01mc@292 17
mas01mc@292 18 // Prepare the AudioDB database for read access and allocate auxillary memory
mas01mc@292 19 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
mas01mc@324 20 if (!(dbH->flags & O2_FLAG_POWER)) {
mas01mc@324 21 error("INDEXed database must be power-enabled", dbName);
mas01mc@324 22 }
mas01mc@324 23
mas01mc@325 24 double *snpp = 0, *sppp = 0;
mas01mc@324 25
mas01mc@292 26 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
mas01mc@292 27 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
mas01mc@325 28 snpp = *snp;
mas01mc@292 29 *spp = new double[*dvp]; // song powertable pointer
mas01mc@292 30 sppp = *spp;
mas01mc@325 31
mas01mc@324 32 memcpy(*snp, l2normTable, *dvp * sizeof(double));
mas01mc@292 33 memcpy(*spp, powerTable, *dvp * sizeof(double));
mas01mc@324 34
mas01mc@324 35
mas01mc@292 36 for(Uns32T i = 0; i < dbH->numFiles; i++){
mas01mc@292 37 if(trackTable[i] >= sequenceLength) {
mas01cr@498 38 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
mas01cr@498 39 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
mas01mc@292 40
mas01cr@498 41 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
mas01cr@498 42 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
mas01mc@292 43 }
mas01mc@292 44 snpp += trackTable[i];
mas01mc@292 45 sppp += trackTable[i];
mas01mc@292 46 }
mas01mc@324 47
mas01mc@292 48 *vsnp = *snp;
mas01mc@292 49 *vspp = *spp;
mas01mc@324 50
mas01mc@292 51 // Move the feature vector read pointer to start of fetures in database
mas01mc@292 52 lseek(dbfid, dbH->dataOffset, SEEK_SET);
mas01mc@292 53 }
mas01mc@292 54
mas01mc@292 55 /************************ LSH indexing ***********************************/
mas01mc@292 56 void audioDB::index_index_db(const char* dbName){
mas01mc@292 57 char* newIndexName;
mas01mc@292 58 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
mas01mc@292 59 Uns32T dbVectors = 0;
mas01mc@292 60
mas01mc@324 61
mas01mc@292 62 printf("INDEX: initializing header\n");
mas01mc@292 63 // Check if audioDB exists, initialize header and open database for read
mas01mc@292 64 forWrite = false;
mas01mc@292 65 initDBHeader(dbName);
mas01mc@292 66
mas01mc@324 67 if(dbH->flags & O2_FLAG_POWER)
mas01mc@324 68 usingPower = true;
mas01mc@324 69
mas01mc@324 70 if(dbH->flags & O2_FLAG_TIMES)
mas01mc@324 71 usingTimes = true;
mas01mc@324 72
mas01cr@498 73 newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@498 74 if(!newIndexName) {
mas01cr@498 75 error("failed to get index name", dbName);
mas01cr@498 76 }
mas01mc@292 77
mas01mc@292 78 // Set unit norming flag override
mas01mc@292 79 audioDB::normalizedDistance = !audioDB::no_unit_norming;
mas01mc@292 80
mas01mc@327 81 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
mas01mc@327 82 VERB_LOG(1, "INDEX: R %f\n", radius);
mas01mc@327 83 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
mas01mc@327 84 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
mas01mc@327 85 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
mas01mc@327 86 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
mas01mc@327 87 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
mas01mc@327 88 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
mas01mc@327 89 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
mas01mc@327 90 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
mas01mc@292 91
mas01mc@292 92 if((lshfid = open(newIndexName,O_RDONLY))<0){
mas01mc@292 93 printf("INDEX: constructing new LSH index\n");
mas01mc@292 94 printf("INDEX: making index file %s\n", newIndexName);
mas01mc@292 95 fflush(stdout);
mas01mc@292 96 // Construct new LSH index
mas01mc@292 97 lsh = new LSH((float)lsh_param_w, lsh_param_k,
mas01mc@292 98 lsh_param_m,
mas01mc@292 99 (Uns32T)(sequenceLength*dbH->dim),
mas01mc@292 100 lsh_param_N,
mas01mc@292 101 lsh_param_ncols,
mas01mc@292 102 (float)radius);
mas01mc@292 103 assert(lsh);
mas01mc@292 104
mas01mc@292 105 Uns32T endTrack = lsh_param_b;
mas01mc@292 106 if( endTrack > dbH->numFiles)
mas01mc@292 107 endTrack = dbH->numFiles;
mas01mc@292 108 // Insert up to lsh_param_b tracks
mas01mc@324 109 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 110 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@324 111 }
mas01mc@324 112 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 113 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
mas01mc@292 114
mas01mc@292 115 // Clean up
mas01mc@292 116 delete lsh;
mas01mc@308 117 lsh = 0;
mas01cr@498 118 } else {
mas01mc@292 119 close(lshfid);
mas01mc@292 120 }
mas01mc@292 121
mas01mc@292 122 // Attempt to open LSH file
mas01mc@292 123 if((lshfid = open(newIndexName,O_RDONLY))>0){
mas01mc@292 124 printf("INDEX: merging with existing LSH index\n");
mas01mc@292 125 fflush(stdout);
mas01mc@340 126 char* mergeIndexName = newIndexName;
mas01mc@292 127
mas01mc@292 128 // Get the lsh header info and find how many tracks are inserted already
mas01mc@340 129 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
mas01mc@292 130 assert(lsh);
mas01cr@498 131 Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
mas01mc@292 132 delete lsh;
mas01mc@308 133 lsh = 0;
mas01mc@292 134
mas01mc@340 135 // Insert up to lsh_param_b tracks
mas01mc@340 136 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@340 137 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
mas01mc@340 138 }
mas01mc@292 139 // This allows for updating index after more tracks are inserted into audioDB
mas01mc@292 140 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
mas01mc@292 141
mas01mc@292 142 Uns32T endTrack = startTrack + lsh_param_b;
mas01mc@292 143 if( endTrack > dbH->numFiles)
mas01mc@292 144 endTrack = dbH->numFiles;
mas01mc@292 145 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
mas01mc@292 146 fflush(stdout);
mas01mc@340 147 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
mas01mc@292 148 assert(lsh);
mas01mc@292 149
mas01mc@292 150 // Insert up to lsh_param_b database tracks
mas01mc@292 151 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
mas01mc@292 152
mas01mc@340 153 // Serialize to file (merging is performed here)
mas01mc@340 154 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
mas01mc@292 155 delete lsh;
mas01mc@308 156 lsh = 0;
mas01mc@340 157 }
mas01mc@292 158
mas01mc@292 159 close(lshfid);
mas01mc@292 160 printf("INDEX: done constructing LSH index.\n");
mas01mc@292 161 fflush(stdout);
mas01mc@292 162
mas01mc@292 163 }
mas01mc@292 164 else{
mas01mc@292 165 error("Something's wrong with LSH index file");
mas01mc@292 166 exit(1);
mas01mc@292 167 }
mas01mc@292 168
mas01mc@324 169 delete[] newIndexName;
mas01mc@324 170 delete[] sNorm;
mas01mc@324 171 delete[] sPower;
mas01mc@324 172 }
mas01mc@292 173
mas01mc@292 174
mas01cr@498 175 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
mas01cr@498 176 if(usingPower){
mas01cr@498 177 int one;
mas01cr@498 178 unsigned int count;
mas01cr@498 179
mas01cr@498 180 count = read(powerfd, &one, sizeof(unsigned int));
mas01cr@498 181 if (count != sizeof(unsigned int)) {
mas01cr@498 182 error("powerfd read failed", "int", "read");
mas01cr@498 183 }
mas01cr@498 184 if (one != 1) {
mas01cr@498 185 error("dimensionality of power file not 1", powerFileName);
mas01cr@498 186 }
mas01cr@498 187
mas01cr@498 188 // FIXME: should check that the powerfile is the right size for
mas01cr@498 189 // this. -- CSR, 2007-10-30
mas01cr@498 190 count = read(powerfd, powerdata, numVectors * sizeof(double));
mas01cr@498 191 if (count != numVectors * sizeof(double)) {
mas01cr@498 192 error("powerfd read failed", "double", "read");
mas01cr@498 193 }
mas01cr@498 194 }
mas01cr@498 195 }
mas01cr@498 196
mas01mc@324 197 // initialize auxillary track data from filesystem
mas01mc@324 198 // pre-conditions:
mas01mc@324 199 // dbH->flags & O2_FLAG_LARGE_ADB
mas01mc@324 200 // feature data allocated and copied (fvp)
mas01mc@324 201 //
mas01mc@324 202 // post-conditions:
mas01mc@324 203 // allocated power data
mas01mc@324 204 // allocated l2norm data
mas01mc@324 205 //
mas01mc@324 206 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
mas01mc@324 207 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
mas01mc@324 208 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
mas01mc@292 209
mas01mc@324 210 // Allocate and read the power sequence
mas01mc@324 211 if(trackTable[trackID]>=sequenceLength){
mas01mc@324 212
mas01mc@324 213 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 214 char* tmpStr = prefixedString;
mas01mc@324 215 // Open and check dimensions of power file
mas01mc@324 216 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 217 prefix_name((char ** const)&prefixedString, adb_feature_root);
mas01mc@324 218 if(prefixedString!=tmpStr)
mas01mc@324 219 delete[] tmpStr;
mas01mc@324 220 powerfd = open(prefixedString, O_RDONLY);
mas01mc@324 221 if (powerfd < 0) {
mas01mc@324 222 error("failed to open power file", prefixedString);
mas01mc@324 223 }
mas01mc@324 224 if (fstat(powerfd, &statbuf) < 0) {
mas01mc@324 225 error("fstat error finding size of power file", prefixedString, "fstat");
mas01mc@324 226 }
mas01mc@324 227
mas01mc@324 228 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
mas01mc@324 229 error("Dimension mismatch: numPowers != numVectors", prefixedString);
mas01mc@324 230
mas01mc@324 231 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
mas01mc@324 232 assert(*sPowerp);
mas01mc@324 233 *spPtrp = *sPowerp;
mas01mc@324 234 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
mas01mc@324 235 if (0 < powerfd) {
mas01mc@324 236 close(powerfd);
mas01mc@324 237 }
mas01mc@324 238
mas01cr@498 239 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
mas01cr@498 240 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
mas01mc@324 241 powerTable = 0;
mas01mc@324 242
mas01mc@324 243 // Allocate and calculate the l2norm sequence
mas01mc@324 244 *sNormpp = new double[trackTable[trackID]];
mas01mc@324 245 assert(*sNormpp);
mas01mc@324 246 *snPtrp = *sNormpp;
mas01cr@498 247 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
mas01cr@498 248 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
mas01cr@498 249 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
mas01mc@324 250 }
mas01mc@292 251 }
mas01mc@292 252
mas01mc@292 253 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
mas01mc@292 254 double** fvpp, double** sNormpp,double** snPtrp,
mas01mc@292 255 double** sPowerp, double** spPtrp){
mas01mc@292 256 size_t nfv = 0;
mas01mc@292 257 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
mas01mc@292 258 Uns32T trackID = 0;
mas01mc@292 259
mas01mc@292 260 VERB_LOG(1, "indexing tracks...");
mas01mc@292 261
mas01mc@324 262 int trackfd = dbfid;
mas01mc@292 263 for(trackID = start_track ; trackID < end_track ; trackID++ ){
mas01mc@324 264 if( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 265 char* prefixedString = new char[O2_MAXFILESTR];
mas01mc@324 266 char* tmpStr = prefixedString;
mas01mc@324 267 // Open and check dimensions of feature file
mas01mc@324 268 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
mas01mc@324 269 prefix_name((char ** const) &prefixedString, adb_feature_root);
mas01mc@324 270 if(prefixedString!=tmpStr)
mas01mc@324 271 delete[] tmpStr;
mas01cr@498 272 initInputFile(prefixedString);
mas01mc@324 273 trackfd = infid;
mas01mc@324 274 }
mas01cr@498 275 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
mas01cr@498 276 error("failed to read data");
mas01mc@292 277 *fvpp = fvp; // Protect memory allocation and free() for track data
mas01mc@324 278
mas01mc@324 279 if( dbH->flags & O2_FLAG_LARGE_ADB )
mas01mc@324 280 // Load power and calculate power and l2norm sequence sums
mas01mc@324 281 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
mas01mc@324 282
mas01mc@292 283 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
mas01mc@292 284 break;
mas01mc@324 285 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
mas01mc@324 286 close(infid);
mas01mc@324 287 delete[] *sNormpp;
mas01mc@324 288 delete[] *sPowerp;
mas01mc@324 289 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
mas01mc@324 290 }
mas01mc@324 291 } // end for(trackID = start_track ; ... )
mas01mc@292 292 std::cout << "finished inserting." << endl;
mas01mc@292 293 }
mas01mc@292 294
mas01mc@292 295 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
mas01mc@292 296 // Loop over the current input track's vectors
mas01cr@305 297 Uns32T numVecs = 0;
mas01cr@305 298 if (trackTable[trackID] > O2_MAXTRACKLEN) {
mas01cr@305 299 if (O2_MAXTRACKLEN < sequenceLength - 1) {
mas01cr@305 300 numVecs = 0;
mas01cr@305 301 } else {
mas01cr@305 302 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
mas01cr@305 303 }
mas01cr@305 304 } else {
mas01cr@305 305 if (trackTable[trackID] < sequenceLength - 1) {
mas01cr@305 306 numVecs = 0;
mas01cr@305 307 } else {
mas01cr@305 308 numVecs = trackTable[trackID] - sequenceLength + 1;
mas01cr@305 309 }
mas01cr@305 310 }
mas01mc@292 311
mas01mc@324 312 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
mas01mc@324 313 if(numVecs){
mas01cr@498 314 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
mas01mc@324 315
mas01mc@324 316 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
mas01cr@498 317 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
mas01cr@498 318 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
mas01cr@498 319 if(vcount == -1) {
mas01cr@498 320 audiodb_index_delete_shingles(vv);
mas01cr@498 321 error("failed to norm shingles");
mas01cr@498 322 }
mas01cr@498 323 numVecsAboveThreshold = vcount;
mas01mc@324 324 collisionCount = index_insert_shingles(vv, trackID, *sppp);
mas01cr@498 325 audiodb_index_delete_shingles(vv);
mas01mc@324 326 }
mas01cr@498 327
mas01mc@292 328 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
mas01mc@292 329
mas01cr@498 330 /* audiodb_index_norm_shingles() only goes as far as the end of the
mas01mc@292 331 sequence, which is right, but the space allocated is for the
mas01mc@292 332 whole track. */
mas01mc@292 333
mas01mc@292 334 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
mas01mc@292 335 * So let's be certain the pointers are in the correct place
mas01mc@292 336 */
mas01mc@292 337
mas01mc@324 338 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
mas01mc@324 339 *snpp += trackTable[trackID];
mas01mc@324 340 *sppp += trackTable[trackID];
mas01mc@324 341 *fvpp += trackTable[trackID] * dbH->dim;
mas01mc@324 342 }
mas01mc@292 343
mas01mc@292 344 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
mas01mc@292 345 std::cout.flush();
mas01mc@292 346 return true;
mas01mc@292 347 }
mas01mc@292 348
mas01mc@292 349 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
mas01mc@292 350 Uns32T collisionCount = 0;
mas01mc@292 351 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
mas01mc@324 352 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
mas01mc@324 353 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
mas01cr@498 354 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
mas01mc@324 355 spp+=sequenceHop;
mas01mc@311 356 }
mas01mc@292 357 return collisionCount;
mas01mc@292 358 }