comparison index.cpp @ 456:0ef029232213 api-inversion

Baby steps with index.cpp. audioDB::index_make_shingle uses almost no shared state. Make it use none at all, and then remove it from the audioDB class.
author mas01cr
date Wed, 24 Dec 2008 10:57:27 +0000
parents 93ce12fe2f76
children 913a95f06998
comparison
equal deleted inserted replaced
455:93ce12fe2f76 456:0ef029232213
120 // Move the feature vector read pointer to start of fetures in database 120 // Move the feature vector read pointer to start of fetures in database
121 lseek(dbfid, dbH->dataOffset, SEEK_SET); 121 lseek(dbfid, dbH->dataOffset, SEEK_SET);
122 } 122 }
123 123
124 124
125 /************************ LSH indexing ***********************************/
126 void audioDB::index_index_db(const char* dbName){
127 char* newIndexName;
128 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
129 Uns32T dbVectors = 0;
130
131
132 printf("INDEX: initializing header\n");
133 // Check if audioDB exists, initialize header and open database for read
134 forWrite = false;
135 initDBHeader(dbName);
136
137 if(dbH->flags & O2_FLAG_POWER)
138 usingPower = true;
139
140 if(dbH->flags & O2_FLAG_TIMES)
141 usingTimes = true;
142
143 newIndexName = index_get_name(dbName, radius, sequenceLength);
144
145 // Set unit norming flag override
146 audioDB::normalizedDistance = !audioDB::no_unit_norming;
147
148 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
149 VERB_LOG(1, "INDEX: R %f\n", radius);
150 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
151 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
152 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
153 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
154 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
155 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
156 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
157 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
158
159 if((lshfid = open(newIndexName,O_RDONLY))<0){
160 printf("INDEX: constructing new LSH index\n");
161 printf("INDEX: making index file %s\n", newIndexName);
162 fflush(stdout);
163 // Construct new LSH index
164 lsh = new LSH((float)lsh_param_w, lsh_param_k,
165 lsh_param_m,
166 (Uns32T)(sequenceLength*dbH->dim),
167 lsh_param_N,
168 lsh_param_ncols,
169 (float)radius);
170 assert(lsh);
171
172 Uns32T endTrack = lsh_param_b;
173 if( endTrack > dbH->numFiles)
174 endTrack = dbH->numFiles;
175 // Insert up to lsh_param_b tracks
176 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
177 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
178 }
179 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
180 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
181
182 // Clean up
183 delete lsh;
184 lsh = 0;
185 close(lshfid);
186 }
187
188 // Attempt to open LSH file
189 if((lshfid = open(newIndexName,O_RDONLY))>0){
190 printf("INDEX: merging with existing LSH index\n");
191 fflush(stdout);
192 char* mergeIndexName = newIndexName;
193
194 // Get the lsh header info and find how many tracks are inserted already
195 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
196 assert(lsh);
197 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
198 delete lsh;
199 lsh = 0;
200
201 // Insert up to lsh_param_b tracks
202 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
203 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
204 }
205 // This allows for updating index after more tracks are inserted into audioDB
206 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
207
208 Uns32T endTrack = startTrack + lsh_param_b;
209 if( endTrack > dbH->numFiles)
210 endTrack = dbH->numFiles;
211 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
212 fflush(stdout);
213 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
214 assert(lsh);
215
216 // Insert up to lsh_param_b database tracks
217 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
218
219 // Serialize to file (merging is performed here)
220 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
221 delete lsh;
222 lsh = 0;
223 }
224
225 close(lshfid);
226 printf("INDEX: done constructing LSH index.\n");
227 fflush(stdout);
228
229 }
230 else{
231 error("Something's wrong with LSH index file");
232 exit(1);
233 }
234
235 delete[] newIndexName;
236 delete[] sNorm;
237 delete[] sPower;
238 }
239
240
241 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
242 if(usingPower){
243 int one;
244 unsigned int count;
245
246 count = read(powerfd, &one, sizeof(unsigned int));
247 if (count != sizeof(unsigned int)) {
248 error("powerfd read failed", "int", "read");
249 }
250 if (one != 1) {
251 error("dimensionality of power file not 1", powerFileName);
252 }
253
254 // FIXME: should check that the powerfile is the right size for
255 // this. -- CSR, 2007-10-30
256 count = read(powerfd, powerdata, numVectors * sizeof(double));
257 if (count != numVectors * sizeof(double)) {
258 error("powerfd read failed", "double", "read");
259 }
260 }
261 }
262
263 // initialize auxillary track data from filesystem
264 // pre-conditions:
265 // dbH->flags & O2_FLAG_LARGE_ADB
266 // feature data allocated and copied (fvp)
267 //
268 // post-conditions:
269 // allocated power data
270 // allocated l2norm data
271 //
272 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
273 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
274 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
275
276 // Allocate and read the power sequence
277 if(trackTable[trackID]>=sequenceLength){
278
279 char* prefixedString = new char[O2_MAXFILESTR];
280 char* tmpStr = prefixedString;
281 // Open and check dimensions of power file
282 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
283 prefix_name((char ** const)&prefixedString, adb_feature_root);
284 if(prefixedString!=tmpStr)
285 delete[] tmpStr;
286 powerfd = open(prefixedString, O_RDONLY);
287 if (powerfd < 0) {
288 error("failed to open power file", prefixedString);
289 }
290 if (fstat(powerfd, &statbuf) < 0) {
291 error("fstat error finding size of power file", prefixedString, "fstat");
292 }
293
294 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
295 error("Dimension mismatch: numPowers != numVectors", prefixedString);
296
297 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
298 assert(*sPowerp);
299 *spPtrp = *sPowerp;
300 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
301 if (0 < powerfd) {
302 close(powerfd);
303 }
304
305 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
306 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
307 powerTable = 0;
308
309 // Allocate and calculate the l2norm sequence
310 *sNormpp = new double[trackTable[trackID]];
311 assert(*sNormpp);
312 *snPtrp = *sNormpp;
313 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
314 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
315 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
316 }
317 }
318
319 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
320 double** fvpp, double** sNormpp,double** snPtrp,
321 double** sPowerp, double** spPtrp){
322 size_t nfv = 0;
323 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
324 Uns32T trackID = 0;
325
326 VERB_LOG(1, "indexing tracks...");
327
328 int trackfd = dbfid;
329 for(trackID = start_track ; trackID < end_track ; trackID++ ){
330 if( dbH->flags & O2_FLAG_LARGE_ADB ){
331 char* prefixedString = new char[O2_MAXFILESTR];
332 char* tmpStr = prefixedString;
333 // Open and check dimensions of feature file
334 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
335 prefix_name((char ** const) &prefixedString, adb_feature_root);
336 if(prefixedString!=tmpStr)
337 delete[] tmpStr;
338 initInputFile(prefixedString);
339 trackfd = infid;
340 }
341 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
342 error("failed to read data");
343 *fvpp = fvp; // Protect memory allocation and free() for track data
344
345 if( dbH->flags & O2_FLAG_LARGE_ADB )
346 // Load power and calculate power and l2norm sequence sums
347 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
348
349 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
350 break;
351 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
352 close(infid);
353 delete[] *sNormpp;
354 delete[] *sPowerp;
355 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
356 }
357 } // end for(trackID = start_track ; ... )
358 std::cout << "finished inserting." << endl;
359 }
360
361 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
362 // Loop over the current input track's vectors
363 Uns32T numVecs = 0;
364 if (trackTable[trackID] > O2_MAXTRACKLEN) {
365 if (O2_MAXTRACKLEN < sequenceLength - 1) {
366 numVecs = 0;
367 } else {
368 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
369 }
370 } else {
371 if (trackTable[trackID] < sequenceLength - 1) {
372 numVecs = 0;
373 } else {
374 numVecs = trackTable[trackID] - sequenceLength + 1;
375 }
376 }
377
378 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
379 if(numVecs){
380 vv = index_initialize_shingles(numVecs);
381
382 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
383 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
384
385 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
386 collisionCount = index_insert_shingles(vv, trackID, *sppp);
387 }
388 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
389
390 /* index_norm_shingles() only goes as far as the end of the
391 sequence, which is right, but the space allocated is for the
392 whole track. */
393
394 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
395 * So let's be certain the pointers are in the correct place
396 */
397
398 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
399 *snpp += trackTable[trackID];
400 *sppp += trackTable[trackID];
401 *fvpp += trackTable[trackID] * dbH->dim;
402 }
403
404 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
405 std::cout.flush();
406 return true;
407 }
408
409 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
410 Uns32T collisionCount = 0;
411 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
412 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
413 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
414 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
415 spp+=sequenceHop;
416 }
417 return collisionCount;
418 }
419
420 /********************* LSH shingle construction ***************************/ 125 /********************* LSH shingle construction ***************************/
421 126
422 // Construct shingles out of a feature matrix 127 // Construct shingles out of a feature matrix
423 // inputs: 128 // inputs:
424 // idx is vector index in feature matrix 129 // idx is vector index in feature matrix
430 // idx < numVectors - sequenceLength + 1 135 // idx < numVectors - sequenceLength + 1
431 // 136 //
432 // post-conditions: 137 // post-conditions:
433 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values 138 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
434 139
435 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){ 140 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
436 assert(idx<(*vv).size()); 141 assert(idx<(*vv).size());
437 vector<float>::iterator ve = (*vv)[idx].end(); 142 vector<float>::iterator ve = (*vv)[idx].end();
438 vi=(*vv)[idx].begin(); // shingle iterator 143 vector<float>::iterator vi = (*vv)[idx].begin();
439 // First feature vector in shingle 144 // First feature vector in shingle
440 if(idx==0){ 145 if(idx == 0) {
441 while(vi!=ve) 146 while(vi!=ve) {
442 *vi++ = (float)(*fvp++); 147 *vi++ = (float)(*fvp++);
443 } 148 }
444 // Not first feature vector in shingle 149 } else {
445 else{ 150 // Not first feature vector in shingle
446 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator 151 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
447 // Previous seqLen-1 dim-vectors 152 // Previous seqLen-1 dim-vectors
448 while(vi!=ve-dim) 153 while(vi!=ve-dim) {
449 *vi++=*ui++; 154 *vi++ = *ui++;
155 }
450 // Move data pointer to next feature vector 156 // Move data pointer to next feature vector
451 fvp += ( seqLen + idx - 1 ) * dim ; 157 fvp += ( seqLen + idx - 1 ) * dim ;
452 // New d-vector 158 // New d-vector
453 while(vi!=ve) 159 while(vi!=ve) {
454 *vi++ = (float)(*fvp++); 160 *vi++ = (float)(*fvp++);
161 }
455 } 162 }
456 } 163 }
457 164
458 // norm shingles 165 // norm shingles
459 // in-place norming, no deletions 166 // in-place norming, no deletions
485 } 192 }
486 return z; 193 return z;
487 } 194 }
488 195
489 196
197 /************************ LSH indexing ***********************************/
198 void audioDB::index_index_db(const char* dbName){
199 char* newIndexName;
200 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
201 Uns32T dbVectors = 0;
202
203
204 printf("INDEX: initializing header\n");
205 // Check if audioDB exists, initialize header and open database for read
206 forWrite = false;
207 initDBHeader(dbName);
208
209 if(dbH->flags & O2_FLAG_POWER)
210 usingPower = true;
211
212 if(dbH->flags & O2_FLAG_TIMES)
213 usingTimes = true;
214
215 newIndexName = index_get_name(dbName, radius, sequenceLength);
216
217 // Set unit norming flag override
218 audioDB::normalizedDistance = !audioDB::no_unit_norming;
219
220 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
221 VERB_LOG(1, "INDEX: R %f\n", radius);
222 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
223 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
224 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
225 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
226 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
227 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
228 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
229 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
230
231 if((lshfid = open(newIndexName,O_RDONLY))<0){
232 printf("INDEX: constructing new LSH index\n");
233 printf("INDEX: making index file %s\n", newIndexName);
234 fflush(stdout);
235 // Construct new LSH index
236 lsh = new LSH((float)lsh_param_w, lsh_param_k,
237 lsh_param_m,
238 (Uns32T)(sequenceLength*dbH->dim),
239 lsh_param_N,
240 lsh_param_ncols,
241 (float)radius);
242 assert(lsh);
243
244 Uns32T endTrack = lsh_param_b;
245 if( endTrack > dbH->numFiles)
246 endTrack = dbH->numFiles;
247 // Insert up to lsh_param_b tracks
248 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
249 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
250 }
251 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
252 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
253
254 // Clean up
255 delete lsh;
256 lsh = 0;
257 close(lshfid);
258 }
259
260 // Attempt to open LSH file
261 if((lshfid = open(newIndexName,O_RDONLY))>0){
262 printf("INDEX: merging with existing LSH index\n");
263 fflush(stdout);
264 char* mergeIndexName = newIndexName;
265
266 // Get the lsh header info and find how many tracks are inserted already
267 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
268 assert(lsh);
269 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
270 delete lsh;
271 lsh = 0;
272
273 // Insert up to lsh_param_b tracks
274 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
275 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
276 }
277 // This allows for updating index after more tracks are inserted into audioDB
278 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
279
280 Uns32T endTrack = startTrack + lsh_param_b;
281 if( endTrack > dbH->numFiles)
282 endTrack = dbH->numFiles;
283 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
284 fflush(stdout);
285 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
286 assert(lsh);
287
288 // Insert up to lsh_param_b database tracks
289 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
290
291 // Serialize to file (merging is performed here)
292 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
293 delete lsh;
294 lsh = 0;
295 }
296
297 close(lshfid);
298 printf("INDEX: done constructing LSH index.\n");
299 fflush(stdout);
300
301 }
302 else{
303 error("Something's wrong with LSH index file");
304 exit(1);
305 }
306
307 delete[] newIndexName;
308 delete[] sNorm;
309 delete[] sPower;
310 }
311
312
313 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
314 if(usingPower){
315 int one;
316 unsigned int count;
317
318 count = read(powerfd, &one, sizeof(unsigned int));
319 if (count != sizeof(unsigned int)) {
320 error("powerfd read failed", "int", "read");
321 }
322 if (one != 1) {
323 error("dimensionality of power file not 1", powerFileName);
324 }
325
326 // FIXME: should check that the powerfile is the right size for
327 // this. -- CSR, 2007-10-30
328 count = read(powerfd, powerdata, numVectors * sizeof(double));
329 if (count != numVectors * sizeof(double)) {
330 error("powerfd read failed", "double", "read");
331 }
332 }
333 }
334
335 // initialize auxillary track data from filesystem
336 // pre-conditions:
337 // dbH->flags & O2_FLAG_LARGE_ADB
338 // feature data allocated and copied (fvp)
339 //
340 // post-conditions:
341 // allocated power data
342 // allocated l2norm data
343 //
344 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
345 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
346 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
347
348 // Allocate and read the power sequence
349 if(trackTable[trackID]>=sequenceLength){
350
351 char* prefixedString = new char[O2_MAXFILESTR];
352 char* tmpStr = prefixedString;
353 // Open and check dimensions of power file
354 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
355 prefix_name((char ** const)&prefixedString, adb_feature_root);
356 if(prefixedString!=tmpStr)
357 delete[] tmpStr;
358 powerfd = open(prefixedString, O_RDONLY);
359 if (powerfd < 0) {
360 error("failed to open power file", prefixedString);
361 }
362 if (fstat(powerfd, &statbuf) < 0) {
363 error("fstat error finding size of power file", prefixedString, "fstat");
364 }
365
366 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
367 error("Dimension mismatch: numPowers != numVectors", prefixedString);
368
369 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
370 assert(*sPowerp);
371 *spPtrp = *sPowerp;
372 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
373 if (0 < powerfd) {
374 close(powerfd);
375 }
376
377 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
378 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
379 powerTable = 0;
380
381 // Allocate and calculate the l2norm sequence
382 *sNormpp = new double[trackTable[trackID]];
383 assert(*sNormpp);
384 *snPtrp = *sNormpp;
385 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
386 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
387 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
388 }
389 }
390
391 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
392 double** fvpp, double** sNormpp,double** snPtrp,
393 double** sPowerp, double** spPtrp){
394 size_t nfv = 0;
395 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
396 Uns32T trackID = 0;
397
398 VERB_LOG(1, "indexing tracks...");
399
400 int trackfd = dbfid;
401 for(trackID = start_track ; trackID < end_track ; trackID++ ){
402 if( dbH->flags & O2_FLAG_LARGE_ADB ){
403 char* prefixedString = new char[O2_MAXFILESTR];
404 char* tmpStr = prefixedString;
405 // Open and check dimensions of feature file
406 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
407 prefix_name((char ** const) &prefixedString, adb_feature_root);
408 if(prefixedString!=tmpStr)
409 delete[] tmpStr;
410 initInputFile(prefixedString);
411 trackfd = infid;
412 }
413 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
414 error("failed to read data");
415 *fvpp = fvp; // Protect memory allocation and free() for track data
416
417 if( dbH->flags & O2_FLAG_LARGE_ADB )
418 // Load power and calculate power and l2norm sequence sums
419 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
420
421 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
422 break;
423 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
424 close(infid);
425 delete[] *sNormpp;
426 delete[] *sPowerp;
427 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
428 }
429 } // end for(trackID = start_track ; ... )
430 std::cout << "finished inserting." << endl;
431 }
432
433 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
434 // Loop over the current input track's vectors
435 Uns32T numVecs = 0;
436 if (trackTable[trackID] > O2_MAXTRACKLEN) {
437 if (O2_MAXTRACKLEN < sequenceLength - 1) {
438 numVecs = 0;
439 } else {
440 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
441 }
442 } else {
443 if (trackTable[trackID] < sequenceLength - 1) {
444 numVecs = 0;
445 } else {
446 numVecs = trackTable[trackID] - sequenceLength + 1;
447 }
448 }
449
450 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
451 if(numVecs){
452 vv = index_initialize_shingles(numVecs);
453
454 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
455 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
456
457 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
458 collisionCount = index_insert_shingles(vv, trackID, *sppp);
459 }
460 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
461
462 /* index_norm_shingles() only goes as far as the end of the
463 sequence, which is right, but the space allocated is for the
464 whole track. */
465
466 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
467 * So let's be certain the pointers are in the correct place
468 */
469
470 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
471 *snpp += trackTable[trackID];
472 *sppp += trackTable[trackID];
473 *fvpp += trackTable[trackID] * dbH->dim;
474 }
475
476 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
477 std::cout.flush();
478 return true;
479 }
480
481 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
482 Uns32T collisionCount = 0;
483 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
484 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
485 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
486 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
487 spp+=sequenceHop;
488 }
489 return collisionCount;
490 }
491
490 /*********************** LSH retrieval ****************************/ 492 /*********************** LSH retrieval ****************************/
491 493
492 494
493 // return true if indexed query performed else return false 495 // return true if indexed query performed else return false
494 int audioDB::index_init_query(const char* dbName){ 496 int audioDB::index_init_query(const char* dbName){
607 Uns32T Nq = (qpointers.nvectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:qpointers.nvectors) - sequenceLength + 1; 609 Uns32T Nq = (qpointers.nvectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:qpointers.nvectors) - sequenceLength + 1;
608 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles 610 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
609 611
610 // Construct shingles from query features 612 // Construct shingles from query features
611 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ ) 613 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
612 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength); 614 audiodb_index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
613 615
614 // Normalize query vectors 616 // Normalize query vectors
615 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qpointers.l2norm, qpointers.power ); 617 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qpointers.l2norm, qpointers.power );
616 618
617 // Nq contains number of inspected points in query file, 619 // Nq contains number of inspected points in query file,