comparison index.cpp @ 319:b9eff6896943 large_adb

Added indexing support for O2_FLAG_LARGE_ADB. Tested on indexed query by features. No indexed query-by-key yet. No --lsh_exact yet.
author mas01mc
date Tue, 19 Aug 2008 20:27:15 +0000
parents cac5b3465318
children a995e5ad999a
comparison
equal deleted inserted replaced
318:c270d9e4659a 319:b9eff6896943
6 // dbName.lsh.${radius}.${sequenceLength} 6 // dbName.lsh.${radius}.${sequenceLength}
7 // 7 //
8 // 8 //
9 // Author: Michael Casey 9 // Author: Michael Casey
10 // Date: 23 June 2008 10 // Date: 23 June 2008
11 //
12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
11 13
12 #include "audioDB.h" 14 #include "audioDB.h"
13 #include "ReporterBase.h" 15 #include "ReporterBase.h"
14 16
15 17
16 /************************* LSH point index to audioDB conversion *****************/ 18 /************************* LSH point index to audioDB conversion *****************/
17 Uns32T audioDB::index_to_trackID(Uns32T lshID){ 19 Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){
18 return lshID>>LSH_N_POINT_BITS; 20 assert(nPntBits);
19 } 21 return lshID>>nPntBits;
20 22 }
21 Uns32T audioDB::index_to_trackPos(Uns32T lshID){ 23
22 return lshID&LSH_POINT_MASK; 24 Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){
23 } 25 assert(nPntBits);
24 26 return lshID&((1<<nPntBits)-1);
25 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){ 27 }
26 return (trackID << LSH_N_POINT_BITS) | spos; 28
29 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){
30 assert(nPntBits);
31 return (trackID << nPntBits) | spos;
27 } 32 }
28 33
29 /************************* LSH indexing and query initialization *****************/ 34 /************************* LSH indexing and query initialization *****************/
30 35
31 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){ 36 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
76 81
77 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/ 82 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
78 83
79 // Prepare the AudioDB database for read access and allocate auxillary memory 84 // Prepare the AudioDB database for read access and allocate auxillary memory
80 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { 85 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
86 if (!(dbH->flags & O2_FLAG_POWER)) {
87 error("INDEXed database must be power-enabled", dbName);
88 }
89
90 double *snpp = *snp, *sppp = 0;
91
81 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors 92 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
82 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector 93 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
83
84 double *snpp = *snp, *sppp = 0;
85 memcpy(*snp, l2normTable, *dvp * sizeof(double));
86
87 if (!(dbH->flags & O2_FLAG_POWER)) {
88 error("database not power-enabled", dbName);
89 }
90 *spp = new double[*dvp]; // song powertable pointer 94 *spp = new double[*dvp]; // song powertable pointer
91 sppp = *spp; 95 sppp = *spp;
96 memcpy(*snp, l2normTable, *dvp * sizeof(double));
92 memcpy(*spp, powerTable, *dvp * sizeof(double)); 97 memcpy(*spp, powerTable, *dvp * sizeof(double));
93 98
99
94 for(Uns32T i = 0; i < dbH->numFiles; i++){ 100 for(Uns32T i = 0; i < dbH->numFiles; i++){
95 if(trackTable[i] >= sequenceLength) { 101 if(trackTable[i] >= sequenceLength) {
96 sequence_sum(snpp, trackTable[i], sequenceLength); 102 sequence_sum(snpp, trackTable[i], sequenceLength);
97 sequence_sqrt(snpp, trackTable[i], sequenceLength); 103 sequence_sqrt(snpp, trackTable[i], sequenceLength);
98 104
100 sequence_average(sppp, trackTable[i], sequenceLength); 106 sequence_average(sppp, trackTable[i], sequenceLength);
101 } 107 }
102 snpp += trackTable[i]; 108 snpp += trackTable[i];
103 sppp += trackTable[i]; 109 sppp += trackTable[i];
104 } 110 }
105 111
106 *vsnp = *snp; 112 *vsnp = *snp;
107 *vspp = *spp; 113 *vspp = *spp;
108 114
109 // Move the feature vector read pointer to start of fetures in database 115 // Move the feature vector read pointer to start of fetures in database
110 lseek(dbfid, dbH->dataOffset, SEEK_SET); 116 lseek(dbfid, dbH->dataOffset, SEEK_SET);
111 } 117 }
112 118
113 119
139 printf("INDEX: lsh_b %d\n", lsh_param_b); 145 printf("INDEX: lsh_b %d\n", lsh_param_b);
140 printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); 146 printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false");
141 fflush(stdout); 147 fflush(stdout);
142 148
143 149
144 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
145
146 if((lshfid = open(newIndexName,O_RDONLY))<0){ 150 if((lshfid = open(newIndexName,O_RDONLY))<0){
147 printf("INDEX: constructing new LSH index\n"); 151 printf("INDEX: constructing new LSH index\n");
148 printf("INDEX: making index file %s\n", newIndexName); 152 printf("INDEX: making index file %s\n", newIndexName);
149 fflush(stdout); 153 fflush(stdout);
150 // Construct new LSH index 154 // Construct new LSH index
158 162
159 Uns32T endTrack = lsh_param_b; 163 Uns32T endTrack = lsh_param_b;
160 if( endTrack > dbH->numFiles) 164 if( endTrack > dbH->numFiles)
161 endTrack = dbH->numFiles; 165 endTrack = dbH->numFiles;
162 // Insert up to lsh_param_b tracks 166 // Insert up to lsh_param_b tracks
163 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); 167 if( dbH->flags & O2_FLAG_LARGE_ADB ){
168 }
169 else{
170 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
171 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
172 }
164 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); 173 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
165 174
166 // Clean up 175 // Clean up
167 delete lsh; 176 delete lsh;
168 lsh = 0; 177 lsh = 0;
175 fflush(stdout); 184 fflush(stdout);
176 185
177 // Get the lsh header info and find how many tracks are inserted already 186 // Get the lsh header info and find how many tracks are inserted already
178 lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here 187 lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here
179 assert(lsh); 188 assert(lsh);
180 Uns32T maxs = index_to_trackID(lsh->get_maxp())+1; 189 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
181 delete lsh; 190 delete lsh;
182 lsh = 0; 191 lsh = 0;
183 192
184 // This allows for updating index after more tracks are inserted into audioDB 193 // This allows for updating index after more tracks are inserted into audioDB
185 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ 194 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
219 delete[] sPower; 228 delete[] sPower;
220 229
221 230
222 } 231 }
223 232
233
234 // initialize auxillary track data from filesystem
235 // pre-conditions:
236 // dbH->flags & O2_FLAG_LARGE_ADB
237 // feature data allocated and copied (fvp)
238 //
239 // post-conditions:
240 // allocated power data
241 // allocated l2norm data
242 //
243 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
244 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
245 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
246
247 // Allocate and read the power sequence
248 if(trackTable[trackID]>=sequenceLength){
249
250 // Open and check dimensions of power file
251 powerfd = open(powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O_RDONLY);
252 if (powerfd < 0) {
253 error("failed to open power file", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE);
254 }
255 if (fstat(powerfd, &statbuf) < 0) {
256 error("fstat error finding size of power file", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, "fstat");
257 }
258
259 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
260 error("Dimension mismatch: numPowers != numVectors", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE);
261
262 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
263 assert(*sPowerp);
264 *spPtrp = *sPowerp;
265 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
266 if (0 < powerfd) {
267 close(powerfd);
268 }
269
270 sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
271 sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
272 powerTable = 0;
273
274 // Allocate and calculate the l2norm sequence
275 *sNormpp = new double[trackTable[trackID]];
276 assert(*sNormpp);
277 *snPtrp = *sNormpp;
278 unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp);
279 sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
280 sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
281 }
282 }
283
224 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, 284 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
225 double** fvpp, double** sNormpp,double** snPtrp, 285 double** fvpp, double** sNormpp,double** snPtrp,
226 double** sPowerp, double** spPtrp){ 286 double** sPowerp, double** spPtrp){
227 size_t nfv = 0; 287 size_t nfv = 0;
228 double* fvp = 0; // Keep pointer for memory allocation and free() for track data 288 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
229 Uns32T trackID = 0; 289 Uns32T trackID = 0;
230 290
231 VERB_LOG(1, "indexing tracks..."); 291 VERB_LOG(1, "indexing tracks...");
232 292
233 293 int trackfd = dbfid;
234 for(trackID = start_track ; trackID < end_track ; trackID++ ){ 294 for(trackID = start_track ; trackID < end_track ; trackID++ ){
235 read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv 295 if( dbH->flags & O2_FLAG_LARGE_ADB ){
296 // Open and check dimensions of feature file
297 initInputFile(featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, false); // nommap, file pointer at correct position
298 trackfd = infid;
299 }
300 read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv
236 *fvpp = fvp; // Protect memory allocation and free() for track data 301 *fvpp = fvp; // Protect memory allocation and free() for track data
302
303 if( dbH->flags & O2_FLAG_LARGE_ADB )
304 // Load power and calculate power and l2norm sequence sums
305 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
306
237 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) 307 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
238 break; 308 break;
239 } 309 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
310 close(infid);
311 delete *sNormpp;
312 delete *sPowerp;
313 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
314 }
315 } // end for(trackID = start_track ; ... )
240 std::cout << "finished inserting." << endl; 316 std::cout << "finished inserting." << endl;
241 } 317 }
242 318
243 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ 319 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
244 // Loop over the current input track's vectors 320 // Loop over the current input track's vectors
254 numVecs = 0; 330 numVecs = 0;
255 } else { 331 } else {
256 numVecs = trackTable[trackID] - sequenceLength + 1; 332 numVecs = trackTable[trackID] - sequenceLength + 1;
257 } 333 }
258 } 334 }
259 vv = index_initialize_shingles(numVecs); 335
260 336 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
261 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) 337 if(numVecs){
262 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); 338 vv = index_initialize_shingles(numVecs);
263 339
264 Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); 340 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
265 Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp); 341 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
342
343 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
344 collisionCount = index_insert_shingles(vv, trackID, *sppp);
345 }
266 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; 346 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
267 347
268 /* index_norm_shingles() only goes as far as the end of the 348 /* index_norm_shingles() only goes as far as the end of the
269 sequence, which is right, but the space allocated is for the 349 sequence, which is right, but the space allocated is for the
270 whole track. */ 350 whole track. */
271 351
272 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN 352 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
273 * So let's be certain the pointers are in the correct place 353 * So let's be certain the pointers are in the correct place
274 */ 354 */
275 355
276 *snpp += trackTable[trackID]; 356 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
277 *sppp += trackTable[trackID]; 357 *snpp += trackTable[trackID];
278 *fvpp += trackTable[trackID] * dbH->dim; 358 *sppp += trackTable[trackID];
359 *fvpp += trackTable[trackID] * dbH->dim;
360 }
279 361
280 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; 362 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
281 std::cout.flush(); 363 std::cout.flush();
282 return true; 364 return true;
283 } 365 }
285 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){ 367 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
286 Uns32T collisionCount = 0; 368 Uns32T collisionCount = 0;
287 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; 369 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
288 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop) 370 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop)
289 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){ 371 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){
290 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID)); 372 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
291 spp+=sequenceHop; 373 spp+=sequenceHop;
292 } 374 }
293 return collisionCount; 375 return collisionCount;
294 } 376 }
295 377
391 printf("INDEX: seqlen %d\n", sequenceLength); 473 printf("INDEX: seqlen %d\n", sequenceLength);
392 printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth()); 474 printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
393 printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns()); 475 printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
394 printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables()); 476 printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
395 printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows()); 477 printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
396 printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp())); 478 printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
397 printf("INDEX: Opened LSH index file %s\n", indexName); 479 printf("INDEX: Opened LSH index file %s\n", indexName);
398 fflush(stdout); 480 fflush(stdout);
399 } 481 }
400 482
401 // Check to see if we are loading hash tables into core, and do so if true 483 // Check to see if we are loading hash tables into core, and do so if true
413 495
414 // *Static* approximate NN point reporter callback method for lshlib 496 // *Static* approximate NN point reporter callback method for lshlib
415 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ 497 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
416 assert(instancePtr); // We need an instance for this callback 498 assert(instancePtr); // We need an instance for this callback
417 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance 499 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
418 Uns32T trackID = index_to_trackID(pointID); 500 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
419 Uns32T spos = index_to_trackPos(pointID); 501 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
420 // Skip identity in query_from_key 502 // Skip identity in query_from_key
421 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) 503 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
422 myself->reporter->add_point(trackID, qpos, spos, dist); 504 myself->reporter->add_point(trackID, qpos, spos, dist);
423 } 505 }
424 506
425 // *Static* exact NN point reporter callback method for lshlib 507 // *Static* exact NN point reporter callback method for lshlib
426 // Maintain a queue of points to pass to query_points() for exact evaluation 508 // Maintain a queue of points to pass to query_points() for exact evaluation
427 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ 509 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
428 assert(instancePtr); // We need an instance for this callback 510 assert(instancePtr); // We need an instance for this callback
429 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance 511 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
430 Uns32T trackID = index_to_trackID(pointID); 512 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
431 Uns32T spos = index_to_trackPos(pointID); 513 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
432 // Skip identity in query_from_key 514 // Skip identity in query_from_key
433 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) 515 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
434 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos); 516 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
435 } 517 }
436 518