Mercurial > hg > audiodb
comparison index.cpp @ 319:b9eff6896943 large_adb
Added indexing support for O2_FLAG_LARGE_ADB. Tested on indexed query by features. No indexed query-by-key yet. No --lsh_exact yet.
author | mas01mc |
---|---|
date | Tue, 19 Aug 2008 20:27:15 +0000 |
parents | cac5b3465318 |
children | a995e5ad999a |
comparison
equal
deleted
inserted
replaced
318:c270d9e4659a | 319:b9eff6896943 |
---|---|
6 // dbName.lsh.${radius}.${sequenceLength} | 6 // dbName.lsh.${radius}.${sequenceLength} |
7 // | 7 // |
8 // | 8 // |
9 // Author: Michael Casey | 9 // Author: Michael Casey |
10 // Date: 23 June 2008 | 10 // Date: 23 June 2008 |
11 // | |
12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support | |
11 | 13 |
12 #include "audioDB.h" | 14 #include "audioDB.h" |
13 #include "ReporterBase.h" | 15 #include "ReporterBase.h" |
14 | 16 |
15 | 17 |
16 /************************* LSH point index to audioDB conversion *****************/ | 18 /************************* LSH point index to audioDB conversion *****************/ |
17 Uns32T audioDB::index_to_trackID(Uns32T lshID){ | 19 Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){ |
18 return lshID>>LSH_N_POINT_BITS; | 20 assert(nPntBits); |
19 } | 21 return lshID>>nPntBits; |
20 | 22 } |
21 Uns32T audioDB::index_to_trackPos(Uns32T lshID){ | 23 |
22 return lshID&LSH_POINT_MASK; | 24 Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){ |
23 } | 25 assert(nPntBits); |
24 | 26 return lshID&((1<<nPntBits)-1); |
25 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){ | 27 } |
26 return (trackID << LSH_N_POINT_BITS) | spos; | 28 |
29 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){ | |
30 assert(nPntBits); | |
31 return (trackID << nPntBits) | spos; | |
27 } | 32 } |
28 | 33 |
29 /************************* LSH indexing and query initialization *****************/ | 34 /************************* LSH indexing and query initialization *****************/ |
30 | 35 |
31 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){ | 36 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){ |
76 | 81 |
77 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/ | 82 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/ |
78 | 83 |
79 // Prepare the AudioDB database for read access and allocate auxillary memory | 84 // Prepare the AudioDB database for read access and allocate auxillary memory |
80 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { | 85 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) { |
86 if (!(dbH->flags & O2_FLAG_POWER)) { | |
87 error("INDEXed database must be power-enabled", dbName); | |
88 } | |
89 | |
90 double *snpp = *snp, *sppp = 0; | |
91 | |
81 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors | 92 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors |
82 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector | 93 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector |
83 | |
84 double *snpp = *snp, *sppp = 0; | |
85 memcpy(*snp, l2normTable, *dvp * sizeof(double)); | |
86 | |
87 if (!(dbH->flags & O2_FLAG_POWER)) { | |
88 error("database not power-enabled", dbName); | |
89 } | |
90 *spp = new double[*dvp]; // song powertable pointer | 94 *spp = new double[*dvp]; // song powertable pointer |
91 sppp = *spp; | 95 sppp = *spp; |
96 memcpy(*snp, l2normTable, *dvp * sizeof(double)); | |
92 memcpy(*spp, powerTable, *dvp * sizeof(double)); | 97 memcpy(*spp, powerTable, *dvp * sizeof(double)); |
93 | 98 |
99 | |
94 for(Uns32T i = 0; i < dbH->numFiles; i++){ | 100 for(Uns32T i = 0; i < dbH->numFiles; i++){ |
95 if(trackTable[i] >= sequenceLength) { | 101 if(trackTable[i] >= sequenceLength) { |
96 sequence_sum(snpp, trackTable[i], sequenceLength); | 102 sequence_sum(snpp, trackTable[i], sequenceLength); |
97 sequence_sqrt(snpp, trackTable[i], sequenceLength); | 103 sequence_sqrt(snpp, trackTable[i], sequenceLength); |
98 | 104 |
100 sequence_average(sppp, trackTable[i], sequenceLength); | 106 sequence_average(sppp, trackTable[i], sequenceLength); |
101 } | 107 } |
102 snpp += trackTable[i]; | 108 snpp += trackTable[i]; |
103 sppp += trackTable[i]; | 109 sppp += trackTable[i]; |
104 } | 110 } |
105 | 111 |
106 *vsnp = *snp; | 112 *vsnp = *snp; |
107 *vspp = *spp; | 113 *vspp = *spp; |
108 | 114 |
109 // Move the feature vector read pointer to start of fetures in database | 115 // Move the feature vector read pointer to start of fetures in database |
110 lseek(dbfid, dbH->dataOffset, SEEK_SET); | 116 lseek(dbfid, dbH->dataOffset, SEEK_SET); |
111 } | 117 } |
112 | 118 |
113 | 119 |
139 printf("INDEX: lsh_b %d\n", lsh_param_b); | 145 printf("INDEX: lsh_b %d\n", lsh_param_b); |
140 printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); | 146 printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false"); |
141 fflush(stdout); | 147 fflush(stdout); |
142 | 148 |
143 | 149 |
144 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); | |
145 | |
146 if((lshfid = open(newIndexName,O_RDONLY))<0){ | 150 if((lshfid = open(newIndexName,O_RDONLY))<0){ |
147 printf("INDEX: constructing new LSH index\n"); | 151 printf("INDEX: constructing new LSH index\n"); |
148 printf("INDEX: making index file %s\n", newIndexName); | 152 printf("INDEX: making index file %s\n", newIndexName); |
149 fflush(stdout); | 153 fflush(stdout); |
150 // Construct new LSH index | 154 // Construct new LSH index |
158 | 162 |
159 Uns32T endTrack = lsh_param_b; | 163 Uns32T endTrack = lsh_param_b; |
160 if( endTrack > dbH->numFiles) | 164 if( endTrack > dbH->numFiles) |
161 endTrack = dbH->numFiles; | 165 endTrack = dbH->numFiles; |
162 // Insert up to lsh_param_b tracks | 166 // Insert up to lsh_param_b tracks |
163 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); | 167 if( dbH->flags & O2_FLAG_LARGE_ADB ){ |
168 } | |
169 else{ | |
170 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); | |
171 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); | |
172 } | |
164 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); | 173 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); |
165 | 174 |
166 // Clean up | 175 // Clean up |
167 delete lsh; | 176 delete lsh; |
168 lsh = 0; | 177 lsh = 0; |
175 fflush(stdout); | 184 fflush(stdout); |
176 | 185 |
177 // Get the lsh header info and find how many tracks are inserted already | 186 // Get the lsh header info and find how many tracks are inserted already |
178 lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here | 187 lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here |
179 assert(lsh); | 188 assert(lsh); |
180 Uns32T maxs = index_to_trackID(lsh->get_maxp())+1; | 189 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1; |
181 delete lsh; | 190 delete lsh; |
182 lsh = 0; | 191 lsh = 0; |
183 | 192 |
184 // This allows for updating index after more tracks are inserted into audioDB | 193 // This allows for updating index after more tracks are inserted into audioDB |
185 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ | 194 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ |
219 delete[] sPower; | 228 delete[] sPower; |
220 | 229 |
221 | 230 |
222 } | 231 } |
223 | 232 |
233 | |
234 // initialize auxillary track data from filesystem | |
235 // pre-conditions: | |
236 // dbH->flags & O2_FLAG_LARGE_ADB | |
237 // feature data allocated and copied (fvp) | |
238 // | |
239 // post-conditions: | |
240 // allocated power data | |
241 // allocated l2norm data | |
242 // | |
243 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ | |
244 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) | |
245 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); | |
246 | |
247 // Allocate and read the power sequence | |
248 if(trackTable[trackID]>=sequenceLength){ | |
249 | |
250 // Open and check dimensions of power file | |
251 powerfd = open(powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O_RDONLY); | |
252 if (powerfd < 0) { | |
253 error("failed to open power file", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE); | |
254 } | |
255 if (fstat(powerfd, &statbuf) < 0) { | |
256 error("fstat error finding size of power file", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, "fstat"); | |
257 } | |
258 | |
259 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) | |
260 error("Dimension mismatch: numPowers != numVectors", powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE); | |
261 | |
262 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values | |
263 assert(*sPowerp); | |
264 *spPtrp = *sPowerp; | |
265 insertPowerData(trackTable[trackID], powerfd, *sPowerp); | |
266 if (0 < powerfd) { | |
267 close(powerfd); | |
268 } | |
269 | |
270 sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); | |
271 sequence_average(*sPowerp, trackTable[trackID], sequenceLength); | |
272 powerTable = 0; | |
273 | |
274 // Allocate and calculate the l2norm sequence | |
275 *sNormpp = new double[trackTable[trackID]]; | |
276 assert(*sNormpp); | |
277 *snPtrp = *sNormpp; | |
278 unitNorm(fvp, dbH->dim, trackTable[trackID], *sNormpp); | |
279 sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); | |
280 sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); | |
281 } | |
282 } | |
283 | |
224 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, | 284 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, |
225 double** fvpp, double** sNormpp,double** snPtrp, | 285 double** fvpp, double** sNormpp,double** snPtrp, |
226 double** sPowerp, double** spPtrp){ | 286 double** sPowerp, double** spPtrp){ |
227 size_t nfv = 0; | 287 size_t nfv = 0; |
228 double* fvp = 0; // Keep pointer for memory allocation and free() for track data | 288 double* fvp = 0; // Keep pointer for memory allocation and free() for track data |
229 Uns32T trackID = 0; | 289 Uns32T trackID = 0; |
230 | 290 |
231 VERB_LOG(1, "indexing tracks..."); | 291 VERB_LOG(1, "indexing tracks..."); |
232 | 292 |
233 | 293 int trackfd = dbfid; |
234 for(trackID = start_track ; trackID < end_track ; trackID++ ){ | 294 for(trackID = start_track ; trackID < end_track ; trackID++ ){ |
235 read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv | 295 if( dbH->flags & O2_FLAG_LARGE_ADB ){ |
296 // Open and check dimensions of feature file | |
297 initInputFile(featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, false); // nommap, file pointer at correct position | |
298 trackfd = infid; | |
299 } | |
300 read_data(trackfd, trackID, &fvp, &nfv); // over-writes fvp and nfv | |
236 *fvpp = fvp; // Protect memory allocation and free() for track data | 301 *fvpp = fvp; // Protect memory allocation and free() for track data |
302 | |
303 if( dbH->flags & O2_FLAG_LARGE_ADB ) | |
304 // Load power and calculate power and l2norm sequence sums | |
305 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); | |
306 | |
237 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) | 307 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) |
238 break; | 308 break; |
239 } | 309 if ( dbH->flags & O2_FLAG_LARGE_ADB ){ |
310 close(infid); | |
311 delete *sNormpp; | |
312 delete *sPowerp; | |
313 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; | |
314 } | |
315 } // end for(trackID = start_track ; ... ) | |
240 std::cout << "finished inserting." << endl; | 316 std::cout << "finished inserting." << endl; |
241 } | 317 } |
242 | 318 |
243 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ | 319 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ |
244 // Loop over the current input track's vectors | 320 // Loop over the current input track's vectors |
254 numVecs = 0; | 330 numVecs = 0; |
255 } else { | 331 } else { |
256 numVecs = trackTable[trackID] - sequenceLength + 1; | 332 numVecs = trackTable[trackID] - sequenceLength + 1; |
257 } | 333 } |
258 } | 334 } |
259 vv = index_initialize_shingles(numVecs); | 335 |
260 | 336 Uns32T numVecsAboveThreshold = 0, collisionCount = 0; |
261 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) | 337 if(numVecs){ |
262 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); | 338 vv = index_initialize_shingles(numVecs); |
263 | 339 |
264 Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); | 340 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) |
265 Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp); | 341 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); |
342 | |
343 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); | |
344 collisionCount = index_insert_shingles(vv, trackID, *sppp); | |
345 } | |
266 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; | 346 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; |
267 | 347 |
268 /* index_norm_shingles() only goes as far as the end of the | 348 /* index_norm_shingles() only goes as far as the end of the |
269 sequence, which is right, but the space allocated is for the | 349 sequence, which is right, but the space allocated is for the |
270 whole track. */ | 350 whole track. */ |
271 | 351 |
272 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN | 352 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN |
273 * So let's be certain the pointers are in the correct place | 353 * So let's be certain the pointers are in the correct place |
274 */ | 354 */ |
275 | 355 |
276 *snpp += trackTable[trackID]; | 356 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ |
277 *sppp += trackTable[trackID]; | 357 *snpp += trackTable[trackID]; |
278 *fvpp += trackTable[trackID] * dbH->dim; | 358 *sppp += trackTable[trackID]; |
359 *fvpp += trackTable[trackID] * dbH->dim; | |
360 } | |
279 | 361 |
280 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; | 362 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; |
281 std::cout.flush(); | 363 std::cout.flush(); |
282 return true; | 364 return true; |
283 } | 365 } |
285 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){ | 367 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){ |
286 Uns32T collisionCount = 0; | 368 Uns32T collisionCount = 0; |
287 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; | 369 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; |
288 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop) | 370 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop) |
289 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){ | 371 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))){ |
290 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID)); | 372 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits)); |
291 spp+=sequenceHop; | 373 spp+=sequenceHop; |
292 } | 374 } |
293 return collisionCount; | 375 return collisionCount; |
294 } | 376 } |
295 | 377 |
391 printf("INDEX: seqlen %d\n", sequenceLength); | 473 printf("INDEX: seqlen %d\n", sequenceLength); |
392 printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth()); | 474 printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth()); |
393 printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns()); | 475 printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns()); |
394 printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables()); | 476 printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables()); |
395 printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows()); | 477 printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows()); |
396 printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp())); | 478 printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)); |
397 printf("INDEX: Opened LSH index file %s\n", indexName); | 479 printf("INDEX: Opened LSH index file %s\n", indexName); |
398 fflush(stdout); | 480 fflush(stdout); |
399 } | 481 } |
400 | 482 |
401 // Check to see if we are loading hash tables into core, and do so if true | 483 // Check to see if we are loading hash tables into core, and do so if true |
413 | 495 |
414 // *Static* approximate NN point reporter callback method for lshlib | 496 // *Static* approximate NN point reporter callback method for lshlib |
415 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ | 497 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ |
416 assert(instancePtr); // We need an instance for this callback | 498 assert(instancePtr); // We need an instance for this callback |
417 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance | 499 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance |
418 Uns32T trackID = index_to_trackID(pointID); | 500 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits); |
419 Uns32T spos = index_to_trackPos(pointID); | 501 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits); |
420 // Skip identity in query_from_key | 502 // Skip identity in query_from_key |
421 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) | 503 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) |
422 myself->reporter->add_point(trackID, qpos, spos, dist); | 504 myself->reporter->add_point(trackID, qpos, spos, dist); |
423 } | 505 } |
424 | 506 |
425 // *Static* exact NN point reporter callback method for lshlib | 507 // *Static* exact NN point reporter callback method for lshlib |
426 // Maintain a queue of points to pass to query_points() for exact evaluation | 508 // Maintain a queue of points to pass to query_points() for exact evaluation |
427 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ | 509 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){ |
428 assert(instancePtr); // We need an instance for this callback | 510 assert(instancePtr); // We need an instance for this callback |
429 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance | 511 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance |
430 Uns32T trackID = index_to_trackID(pointID); | 512 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits); |
431 Uns32T spos = index_to_trackPos(pointID); | 513 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits); |
432 // Skip identity in query_from_key | 514 // Skip identity in query_from_key |
433 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) | 515 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) |
434 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos); | 516 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos); |
435 } | 517 } |
436 | 518 |