comparison index.cpp @ 292:d9a88cfd4ab6

Completed merge of lshlib back to current version of the trunk.
author mas01mc
date Tue, 29 Jul 2008 22:01:17 +0000
parents
children f922c234462f
comparison
equal deleted inserted replaced
291:63ae0dfc1767 292:d9a88cfd4ab6
1 // LSH indexing
2 //
3 // Construct a persistent LSH table structure
4 // Store at the same location as dbName
5 // Naming convention:
6 // dbName.lsh.${radius}.${sequenceLength}
7 //
8 //
9 // Author: Michael Casey
10 // Date: 23 June 2008
11
12 #include "audioDB.h"
13 #include "ReporterBase.h"
14
15
16 /************************* LSH point index to audioDB conversion *****************/
17 Uns32T audioDB::index_to_trackID(Uns32T lshID){
18 return lshID>>LSH_N_POINT_BITS;
19 }
20
21 Uns32T audioDB::index_to_trackPos(Uns32T lshID){
22 return lshID&LSH_POINT_MASK;
23 }
24
25 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos){
26 return (trackID << LSH_N_POINT_BITS) | spos;
27 }
28
29 /************************* LSH indexing and query initialization *****************/
30
31 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
32 char* indexName = new char[MAXSTR];
33 // Attempt to make new file
34 if(strlen(dbName) > (MAXSTR - 32))
35 error("dbName is too long for LSH index filename appendages");
36 strncpy(indexName, dbName, MAXSTR);
37 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
38 return indexName;
39 }
40
41 // return true if index exists else return false
42 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
43 // Test to see if file exists
44 char* indexName = index_get_name(dbName, radius, sequenceLength);
45 lshfid = open (indexName, O_RDONLY);
46 delete[] indexName;
47 close(lshfid);
48
49 if(lshfid<0)
50 return false;
51 else
52 return true;
53 }
54
55 vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){
56 if(vv)
57 delete vv;
58 vv = new vector<vector<float> >(sz);
59 for(Uns32T i=0 ; i < sz ; i++)
60 (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage
61 return vv;
62 }
63
64 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
65
66 // Prepare the AudioDB database for read access and allocate auxillary memory
67 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
68 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
69 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
70
71 double *snpp = *snp, *sppp = 0;
72 memcpy(*snp, l2normTable, *dvp * sizeof(double));
73
74 if (!(dbH->flags & O2_FLAG_POWER)) {
75 error("database not power-enabled", dbName);
76 }
77 *spp = new double[*dvp]; // song powertable pointer
78 sppp = *spp;
79 memcpy(*spp, powerTable, *dvp * sizeof(double));
80
81 for(Uns32T i = 0; i < dbH->numFiles; i++){
82 if(trackTable[i] >= sequenceLength) {
83 sequence_sum(snpp, trackTable[i], sequenceLength);
84 sequence_sqrt(snpp, trackTable[i], sequenceLength);
85
86 sequence_sum(sppp, trackTable[i], sequenceLength);
87 sequence_average(sppp, trackTable[i], sequenceLength);
88 }
89 snpp += trackTable[i];
90 sppp += trackTable[i];
91 }
92
93 *vsnp = *snp;
94 *vspp = *spp;
95
96 // Move the feature vector read pointer to start of fetures in database
97 lseek(dbfid, dbH->dataOffset, SEEK_SET);
98 }
99
100
101 /************************ LSH indexing ***********************************/
102 void audioDB::index_index_db(const char* dbName){
103
104 char* newIndexName;
105 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
106 Uns32T dbVectors = 0;
107
108 printf("INDEX: initializing header\n");
109 // Check if audioDB exists, initialize header and open database for read
110 forWrite = false;
111 initDBHeader(dbName);
112
113 newIndexName = index_get_name(dbName, radius, sequenceLength);
114
115 // Set unit norming flag override
116 audioDB::normalizedDistance = !audioDB::no_unit_norming;
117
118 printf("INDEX: dim %d\n", dbH->dim);
119 printf("INDEX: R %f\n", radius);
120 printf("INDEX: seqlen %d\n", sequenceLength);
121 printf("INDEX: lsh_w %f\n", lsh_param_w);
122 printf("INDEX: lsh_k %d\n", lsh_param_k);
123 printf("INDEX: lsh_m %d\n", lsh_param_m);
124 printf("INDEX: lsh_N %d\n", lsh_param_N);
125 printf("INDEX: lsh_b %d\n", lsh_param_b);
126 printf("INDEX: normalized? %s\n", normalizedDistance?"true":"false");
127 fflush(stdout);
128
129
130 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
131
132 if((lshfid = open(newIndexName,O_RDONLY))<0){
133 printf("INDEX: constructing new LSH index\n");
134 printf("INDEX: making index file %s\n", newIndexName);
135 fflush(stdout);
136 // Construct new LSH index
137 lsh = new LSH((float)lsh_param_w, lsh_param_k,
138 lsh_param_m,
139 (Uns32T)(sequenceLength*dbH->dim),
140 lsh_param_N,
141 lsh_param_ncols,
142 (float)radius);
143 assert(lsh);
144
145 Uns32T endTrack = lsh_param_b;
146 if( endTrack > dbH->numFiles)
147 endTrack = dbH->numFiles;
148 // Insert up to lsh_param_b tracks
149 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
150 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
151
152 // Clean up
153 delete lsh;
154 close(lshfid);
155 }
156
157 // Attempt to open LSH file
158 if((lshfid = open(newIndexName,O_RDONLY))>0){
159 printf("INDEX: merging with existing LSH index\n");
160 fflush(stdout);
161
162 // Get the lsh header info and find how many tracks are inserted already
163 lsh = new LSH(newIndexName, false); // lshInCore=false to avoid loading hashTables here
164 assert(lsh);
165 Uns32T maxs = index_to_trackID(lsh->get_maxp())+1;
166 delete lsh;
167
168 // This allows for updating index after more tracks are inserted into audioDB
169 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
170
171 Uns32T endTrack = startTrack + lsh_param_b;
172 if( endTrack > dbH->numFiles)
173 endTrack = dbH->numFiles;
174 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
175 fflush(stdout);
176 lsh = new LSH(newIndexName, lsh_in_core); // Initialize core memory for LSH tables
177 assert(lsh);
178
179 // Insert up to lsh_param_b database tracks
180 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
181
182 // Serialize to file
183 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
184 delete lsh;
185 }
186
187 close(lshfid);
188 printf("INDEX: done constructing LSH index.\n");
189 fflush(stdout);
190
191 }
192 else{
193 error("Something's wrong with LSH index file");
194 exit(1);
195 }
196
197
198 delete[] newIndexName;
199 if(sNorm)
200 delete[] sNorm;
201 if(sPower)
202 delete[] sPower;
203
204
205 }
206
207 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
208 double** fvpp, double** sNormpp,double** snPtrp,
209 double** sPowerp, double** spPtrp){
210 size_t nfv = 0;
211 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
212 Uns32T trackID = 0;
213
214 VERB_LOG(1, "indexing tracks...");
215
216
217 for(trackID = start_track ; trackID < end_track ; trackID++ ){
218 read_data(trackID, &fvp, &nfv); // over-writes fvp and nfv
219 *fvpp = fvp; // Protect memory allocation and free() for track data
220 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
221 break;
222 }
223 std::cout << "finished inserting." << endl;
224 }
225
226 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
227 // Loop over the current input track's vectors
228 Uns32T numVecs = (trackTable[trackID]>O2_MAXTRACKLEN?O2_MAXTRACKLEN:trackTable[trackID]) - sequenceLength + 1 ;
229 vv = index_initialize_shingles(numVecs);
230
231 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
232 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
233
234 Uns32T numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
235 Uns32T collisionCount = index_insert_shingles(vv, trackID, *sppp);
236 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
237
238 /* index_norm_shingles() only goes as far as the end of the
239 sequence, which is right, but the space allocated is for the
240 whole track. */
241
242 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
243 * So let's be certain the pointers are in the correct place
244 */
245
246 *snpp += trackTable[trackID];
247 *sppp += trackTable[trackID];
248 *fvpp += trackTable[trackID] * dbH->dim;
249
250 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
251 std::cout.flush();
252 return true;
253 }
254
255 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
256 Uns32T collisionCount = 0;
257 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
258 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID++)
259 if(!use_absolute_threshold || (use_absolute_threshold && (*spp++ >= absolute_threshold)))
260 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID));
261 return collisionCount;
262 }
263
264 /********************* LSH shingle construction ***************************/
265
266 // Construct shingles out of a feature matrix
267 // inputs:
268 // idx is vector index in feature matrix
269 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
270 //
271 // pre-conditions:
272 // dbH->dim
273 // sequenceLength
274 // idx < numVectors - sequenceLength + 1
275 //
276 // post-conditions:
277 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
278
279 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
280 assert(idx<(*vv).size());
281 vector<float>::iterator ve = (*vv)[idx].end();
282 vi=(*vv)[idx].begin(); // shingle iterator
283 // First feature vector in shingle
284 if(idx==0){
285 while(vi!=ve)
286 *vi++ = (float)(*fvp++);
287 }
288 // Not first feature vector in shingle
289 else{
290 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator
291 // Previous seqLen-1 dim-vectors
292 while(vi!=ve-dim)
293 *vi++=*ui++;
294 // Move data pointer to next feature vector
295 fvp += ( seqLen + idx - 1 ) * dim ;
296 // New d-vector
297 while(vi!=ve)
298 *vi++ = (float)(*fvp++);
299 }
300 }
301
302 // norm shingles
303 // in-place norming, no deletions
304 // If using power, return number of shingles above power threshold
305 int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){
306 int z = 0; // number of above-threshold shingles
307 float l2norm;
308 double power;
309 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
310 float oneOverSqrtl2NormDivRad = oneOverRadius;
311 if(!spp)
312 error("LSH indexing and query requires a power feature using -w or -W");
313 Uns32T shingleSize = sequenceLength*dbH->dim;
314 for(Uns32T a=0; a<(*vv).size(); a++){
315 l2norm = (float)(*snp++);
316 if(audioDB::normalizedDistance)
317 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
318
319 for(Uns32T b=0; b < shingleSize ; b++)
320 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
321
322 power = *spp++;
323 if(use_absolute_threshold){
324 if ( power >= absolute_threshold )
325 z++;
326 }
327 else
328 z++;
329 }
330 return z;
331 }
332
333
334 /*********************** LSH retrieval ****************************/
335
336
337 // return true if indexed query performed else return false
338 int audioDB::index_init_query(const char* dbName){
339
340 if(!(index_exists(dbName, radius, sequenceLength)))
341 return false;
342
343 char* indexName = index_get_name(dbName, radius, sequenceLength);
344
345 // Test to see if file exists
346 if((lshfid = open (indexName, O_RDONLY)) < 0){
347 delete[] indexName;
348 return false;
349 }
350
351 printf("INDEX: initializing header\n");
352
353 lsh = new LSH(indexName, false); // Get the header only here
354 assert(lsh);
355 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
356
357
358 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
359 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
360
361 printf("INDEX: dim %d\n", dbH->dim);
362 printf("INDEX: R %f\n", lsh->get_radius());
363 printf("INDEX: seqlen %d\n", sequenceLength);
364 printf("INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
365 printf("INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
366 printf("INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
367 printf("INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
368 printf("INDEX: s %d\n", index_to_trackID(lsh->get_maxp()));
369 printf("INDEX: Opened LSH index file %s\n", indexName);
370 fflush(stdout);
371
372 // Check to see if we are loading hash tables into core, and do so if true
373 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
374 printf("INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
375 delete lsh;
376 lsh = new LSH(indexName, true);
377 }
378
379 delete[] indexName;
380 return true;
381 }
382
383 // *Static* approximate NN point reporter callback method for lshlib
384 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
385 assert(instancePtr); // We need an instance for this callback
386 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
387 Uns32T trackID = index_to_trackID(pointID);
388 Uns32T spos = index_to_trackPos(pointID);
389 // Skip identity in query_from_key
390 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
391 myself->reporter->add_point(trackID, qpos, spos, dist);
392 }
393
394 // *Static* exact NN point reporter callback method for lshlib
395 // Maintain a queue of points to pass to query_points() for exact evaluation
396 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
397 assert(instancePtr); // We need an instance for this callback
398 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
399 Uns32T trackID = index_to_trackID(pointID);
400 Uns32T spos = index_to_trackPos(pointID);
401 // Skip identity in query_from_key
402 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
403 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
404 }
405
406 void audioDB::initialize_exact_evalutation_queue(){
407 if(exact_evaluation_queue)
408 delete exact_evaluation_queue;
409 exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >;
410 }
411
412 void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){
413 PointPair p(trackID, qpos, spos);
414 exact_evaluation_queue->push(p);
415 }
416
417 // return 0: if index does not exist
418 // return nqv: if index exists
419 int audioDB::index_query_loop(const char* dbName, Uns32T queryIndex) {
420
421 unsigned int numVectors;
422 double *query, *query_data;
423 double *qNorm, *qnPtr, *qPower = 0, *qpPtr = 0;
424 double meanQdur;
425 void (*add_point_func)(void*,Uns32T,Uns32T,float);
426
427 // Set the point-reporter callback based on the value of lsh_exact
428 if(lsh_exact){
429 initialize_exact_evalutation_queue();
430 add_point_func = &index_add_point_exact;
431 }
432 else
433 add_point_func = &index_add_point_approximate;
434
435 if(!index_init_query(dbName)) // sets-up LSH index structures for querying
436 return 0;
437
438 char* database = index_get_name(dbName, radius, sequenceLength);
439
440 if(query_from_key)
441 set_up_query_from_key(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors, queryIndex);
442 else
443 set_up_query(&query_data, &query, &qNorm, &qnPtr, &qPower, &qpPtr, &meanQdur, &numVectors); // get query vectors
444
445 VERB_LOG(1, "retrieving tracks...");
446
447 assert(pointNN>0 && pointNN<=O2_MAXNN);
448 assert(trackNN>0 && trackNN<=O2_MAXNN);
449
450 gettimeofday(&tv1, NULL);
451 // query vector index
452 Uns32T Nq = (numVectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:numVectors) - sequenceLength + 1;
453 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
454 cout << "Nq=" << Nq; cout.flush();
455 // Construct shingles from query features
456 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
457 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
458
459 // Normalize query vectors
460 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qnPtr, qpPtr );
461 cout << " Nq'=" << numVecsAboveThreshold << endl; cout.flush();
462
463 // Nq contains number of inspected points in query file,
464 // numVecsAboveThreshold is number of points with power >= absolute_threshold
465 double* qpp = qpPtr; // Keep original qpPtr for possible exact evaluation
466 if(usingQueryPoint && numVecsAboveThreshold){
467 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
468 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this);
469 else
470 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this);
471 }
472 else if(numVecsAboveThreshold)
473 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
474 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold)))
475 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
476 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this);
477 else
478 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this);
479
480 if(lsh_exact)
481 // Perform exact distance computation on point pairs in exact_evaluation_queue
482 query_loop_points(query, qnPtr, qpPtr, meanQdur, numVectors);
483
484 gettimeofday(&tv2,NULL);
485 VERB_LOG(1,"elapsed time: %ld msec\n",
486 (tv2.tv_sec*1000 + tv2.tv_usec/1000) -
487 (tv1.tv_sec*1000 + tv1.tv_usec/1000))
488
489 // Close the index file
490 close(lshfid);
491
492 // Clean up
493 if(query_data)
494 delete[] query_data;
495 if(qNorm)
496 delete[] qNorm;
497 if(qPower)
498 delete[] qPower;
499 if(database)
500 delete[] database;
501
502 return Nq;
503 }
504