Mercurial > hg > audiodb
comparison index.cpp @ 456:0ef029232213 api-inversion
Baby steps with index.cpp
audioDB::index_make_shingle uses almost no shared state. Make it use
none at all, and then remove it from the audioDB class.
author | mas01cr |
---|---|
date | Wed, 24 Dec 2008 10:57:27 +0000 |
parents | 93ce12fe2f76 |
children | 913a95f06998 |
comparison
equal
deleted
inserted
replaced
455:93ce12fe2f76 | 456:0ef029232213 |
---|---|
120 // Move the feature vector read pointer to start of features in database | 120 // Move the feature vector read pointer to start of features in database |
121 lseek(dbfid, dbH->dataOffset, SEEK_SET); | 121 lseek(dbfid, dbH->dataOffset, SEEK_SET); |
122 } | 122 } |
123 | 123 |
124 | 124 |
125 /************************ LSH indexing ***********************************/ | |
126 void audioDB::index_index_db(const char* dbName){ | |
127 char* newIndexName; | |
128 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; | |
129 Uns32T dbVectors = 0; | |
130 | |
131 | |
132 printf("INDEX: initializing header\n"); | |
133 // Check if audioDB exists, initialize header and open database for read | |
134 forWrite = false; | |
135 initDBHeader(dbName); | |
136 | |
137 if(dbH->flags & O2_FLAG_POWER) | |
138 usingPower = true; | |
139 | |
140 if(dbH->flags & O2_FLAG_TIMES) | |
141 usingTimes = true; | |
142 | |
143 newIndexName = index_get_name(dbName, radius, sequenceLength); | |
144 | |
145 // Set unit norming flag override | |
146 audioDB::normalizedDistance = !audioDB::no_unit_norming; | |
147 | |
148 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim); | |
149 VERB_LOG(1, "INDEX: R %f\n", radius); | |
150 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength); | |
151 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w); | |
152 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k); | |
153 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m); | |
154 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N); | |
155 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols); | |
156 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b); | |
157 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false"); | |
158 | |
159 if((lshfid = open(newIndexName,O_RDONLY))<0){ | |
160 printf("INDEX: constructing new LSH index\n"); | |
161 printf("INDEX: making index file %s\n", newIndexName); | |
162 fflush(stdout); | |
163 // Construct new LSH index | |
164 lsh = new LSH((float)lsh_param_w, lsh_param_k, | |
165 lsh_param_m, | |
166 (Uns32T)(sequenceLength*dbH->dim), | |
167 lsh_param_N, | |
168 lsh_param_ncols, | |
169 (float)radius); | |
170 assert(lsh); | |
171 | |
172 Uns32T endTrack = lsh_param_b; | |
173 if( endTrack > dbH->numFiles) | |
174 endTrack = dbH->numFiles; | |
175 // Insert up to lsh_param_b tracks | |
176 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ | |
177 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); | |
178 } | |
179 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); | |
180 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); | |
181 | |
182 // Clean up | |
183 delete lsh; | |
184 lsh = 0; | |
185 close(lshfid); | |
186 } | |
187 | |
188 // Attempt to open LSH file | |
189 if((lshfid = open(newIndexName,O_RDONLY))>0){ | |
190 printf("INDEX: merging with existing LSH index\n"); | |
191 fflush(stdout); | |
192 char* mergeIndexName = newIndexName; | |
193 | |
194 // Get the lsh header info and find how many tracks are inserted already | |
195 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here | |
196 assert(lsh); | |
197 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1; | |
198 delete lsh; | |
199 lsh = 0; | |
200 | |
201 // Insert up to lsh_param_b tracks | |
202 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){ | |
203 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); | |
204 } | |
205 // This allows for updating index after more tracks are inserted into audioDB | |
206 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ | |
207 | |
208 Uns32T endTrack = startTrack + lsh_param_b; | |
209 if( endTrack > dbH->numFiles) | |
210 endTrack = dbH->numFiles; | |
211 printf("Indexing track range: %d - %d\n", startTrack, endTrack); | |
212 fflush(stdout); | |
213 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables | |
214 assert(lsh); | |
215 | |
216 // Insert up to lsh_param_b database tracks | |
217 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); | |
218 | |
219 // Serialize to file (merging is performed here) | |
220 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk | |
221 delete lsh; | |
222 lsh = 0; | |
223 } | |
224 | |
225 close(lshfid); | |
226 printf("INDEX: done constructing LSH index.\n"); | |
227 fflush(stdout); | |
228 | |
229 } | |
230 else{ | |
231 error("Something's wrong with LSH index file"); | |
232 exit(1); | |
233 } | |
234 | |
235 delete[] newIndexName; | |
236 delete[] sNorm; | |
237 delete[] sPower; | |
238 } | |
239 | |
240 | |
241 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { | |
242 if(usingPower){ | |
243 int one; | |
244 unsigned int count; | |
245 | |
246 count = read(powerfd, &one, sizeof(unsigned int)); | |
247 if (count != sizeof(unsigned int)) { | |
248 error("powerfd read failed", "int", "read"); | |
249 } | |
250 if (one != 1) { | |
251 error("dimensionality of power file not 1", powerFileName); | |
252 } | |
253 | |
254 // FIXME: should check that the powerfile is the right size for | |
255 // this. -- CSR, 2007-10-30 | |
256 count = read(powerfd, powerdata, numVectors * sizeof(double)); | |
257 if (count != numVectors * sizeof(double)) { | |
258 error("powerfd read failed", "double", "read"); | |
259 } | |
260 } | |
261 } | |
262 | |
263 // initialize auxiliary track data from filesystem | |
264 // pre-conditions: | |
265 // dbH->flags & O2_FLAG_LARGE_ADB | |
266 // feature data allocated and copied (fvp) | |
267 // | |
268 // post-conditions: | |
269 // allocated power data | |
270 // allocated l2norm data | |
271 // | |
272 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ | |
273 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) | |
274 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); | |
275 | |
276 // Allocate and read the power sequence | |
277 if(trackTable[trackID]>=sequenceLength){ | |
278 | |
279 char* prefixedString = new char[O2_MAXFILESTR]; | |
280 char* tmpStr = prefixedString; | |
281 // Open and check dimensions of power file | |
282 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); | |
283 prefix_name((char ** const)&prefixedString, adb_feature_root); | |
284 if(prefixedString!=tmpStr) | |
285 delete[] tmpStr; | |
286 powerfd = open(prefixedString, O_RDONLY); | |
287 if (powerfd < 0) { | |
288 error("failed to open power file", prefixedString); | |
289 } | |
290 if (fstat(powerfd, &statbuf) < 0) { | |
291 error("fstat error finding size of power file", prefixedString, "fstat"); | |
292 } | |
293 | |
294 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) | |
295 error("Dimension mismatch: numPowers != numVectors", prefixedString); | |
296 | |
297 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values | |
298 assert(*sPowerp); | |
299 *spPtrp = *sPowerp; | |
300 insertPowerData(trackTable[trackID], powerfd, *sPowerp); | |
301 if (0 < powerfd) { | |
302 close(powerfd); | |
303 } | |
304 | |
305 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); | |
306 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength); | |
307 powerTable = 0; | |
308 | |
309 // Allocate and calculate the l2norm sequence | |
310 *sNormpp = new double[trackTable[trackID]]; | |
311 assert(*sNormpp); | |
312 *snPtrp = *sNormpp; | |
313 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp); | |
314 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); | |
315 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); | |
316 } | |
317 } | |
318 | |
319 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, | |
320 double** fvpp, double** sNormpp,double** snPtrp, | |
321 double** sPowerp, double** spPtrp){ | |
322 size_t nfv = 0; | |
323 double* fvp = 0; // Keep pointer for memory allocation and free() for track data | |
324 Uns32T trackID = 0; | |
325 | |
326 VERB_LOG(1, "indexing tracks..."); | |
327 | |
328 int trackfd = dbfid; | |
329 for(trackID = start_track ; trackID < end_track ; trackID++ ){ | |
330 if( dbH->flags & O2_FLAG_LARGE_ADB ){ | |
331 char* prefixedString = new char[O2_MAXFILESTR]; | |
332 char* tmpStr = prefixedString; | |
333 // Open and check dimensions of feature file | |
334 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); | |
335 prefix_name((char ** const) &prefixedString, adb_feature_root); | |
336 if(prefixedString!=tmpStr) | |
337 delete[] tmpStr; | |
338 initInputFile(prefixedString); | |
339 trackfd = infid; | |
340 } | |
341 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv)) | |
342 error("failed to read data"); | |
343 *fvpp = fvp; // Protect memory allocation and free() for track data | |
344 | |
345 if( dbH->flags & O2_FLAG_LARGE_ADB ) | |
346 // Load power and calculate power and l2norm sequence sums | |
347 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); | |
348 | |
349 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) | |
350 break; | |
351 if ( dbH->flags & O2_FLAG_LARGE_ADB ){ | |
352 close(infid); | |
353 delete[] *sNormpp; | |
354 delete[] *sPowerp; | |
355 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; | |
356 } | |
357 } // end for(trackID = start_track ; ... ) | |
358 std::cout << "finished inserting." << endl; | |
359 } | |
360 | |
361 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ | |
362 // Loop over the current input track's vectors | |
363 Uns32T numVecs = 0; | |
364 if (trackTable[trackID] > O2_MAXTRACKLEN) { | |
365 if (O2_MAXTRACKLEN < sequenceLength - 1) { | |
366 numVecs = 0; | |
367 } else { | |
368 numVecs = O2_MAXTRACKLEN - sequenceLength + 1; | |
369 } | |
370 } else { | |
371 if (trackTable[trackID] < sequenceLength - 1) { | |
372 numVecs = 0; | |
373 } else { | |
374 numVecs = trackTable[trackID] - sequenceLength + 1; | |
375 } | |
376 } | |
377 | |
378 Uns32T numVecsAboveThreshold = 0, collisionCount = 0; | |
379 if(numVecs){ | |
380 vv = index_initialize_shingles(numVecs); | |
381 | |
382 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) | |
383 index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); | |
384 | |
385 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); | |
386 collisionCount = index_insert_shingles(vv, trackID, *sppp); | |
387 } | |
388 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; | |
389 | |
390 /* index_norm_shingles() only goes as far as the end of the | |
391 sequence, which is right, but the space allocated is for the | |
392 whole track. */ | |
393 | |
394 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN | |
395 * So let's be certain the pointers are in the correct place | |
396 */ | |
397 | |
398 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ | |
399 *snpp += trackTable[trackID]; | |
400 *sppp += trackTable[trackID]; | |
401 *fvpp += trackTable[trackID] * dbH->dim; | |
402 } | |
403 | |
404 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; | |
405 std::cout.flush(); | |
406 return true; | |
407 } | |
408 | |
409 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){ | |
410 Uns32T collisionCount = 0; | |
411 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; | |
412 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ | |
413 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) | |
414 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits)); | |
415 spp+=sequenceHop; | |
416 } | |
417 return collisionCount; | |
418 } | |
419 | |
420 /********************* LSH shingle construction ***************************/ | 125 /********************* LSH shingle construction ***************************/ |
421 | 126 |
422 // Construct shingles out of a feature matrix | 127 // Construct shingles out of a feature matrix |
423 // inputs: | 128 // inputs: |
424 // idx is vector index in feature matrix | 129 // idx is vector index in feature matrix |
430 // idx < numVectors - sequenceLength + 1 | 135 // idx < numVectors - sequenceLength + 1 |
431 // | 136 // |
432 // post-conditions: | 137 // post-conditions: |
433 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values | 138 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values |
434 | 139 |
435 void audioDB::index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){ | 140 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){ |
436 assert(idx<(*vv).size()); | 141 assert(idx<(*vv).size()); |
437 vector<float>::iterator ve = (*vv)[idx].end(); | 142 vector<float>::iterator ve = (*vv)[idx].end(); |
438 vi=(*vv)[idx].begin(); // shingle iterator | 143 vector<float>::iterator vi = (*vv)[idx].begin(); |
439 // First feature vector in shingle | 144 // First feature vector in shingle |
440 if(idx==0){ | 145 if(idx == 0) { |
441 while(vi!=ve) | 146 while(vi!=ve) { |
442 *vi++ = (float)(*fvp++); | 147 *vi++ = (float)(*fvp++); |
443 } | 148 } |
444 // Not first feature vector in shingle | 149 } else { |
445 else{ | 150 // Not first feature vector in shingle |
446 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; // previous shingle iterator | 151 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim; |
447 // Previous seqLen-1 dim-vectors | 152 // Previous seqLen-1 dim-vectors |
448 while(vi!=ve-dim) | 153 while(vi!=ve-dim) { |
449 *vi++=*ui++; | 154 *vi++ = *ui++; |
155 } | |
450 // Move data pointer to next feature vector | 156 // Move data pointer to next feature vector |
451 fvp += ( seqLen + idx - 1 ) * dim ; | 157 fvp += ( seqLen + idx - 1 ) * dim ; |
452 // New d-vector | 158 // New d-vector |
453 while(vi!=ve) | 159 while(vi!=ve) { |
454 *vi++ = (float)(*fvp++); | 160 *vi++ = (float)(*fvp++); |
161 } | |
455 } | 162 } |
456 } | 163 } |
457 | 164 |
458 // norm shingles | 165 // norm shingles |
459 // in-place norming, no deletions | 166 // in-place norming, no deletions |
485 } | 192 } |
486 return z; | 193 return z; |
487 } | 194 } |
488 | 195 |
489 | 196 |
197 /************************ LSH indexing ***********************************/ | |
198 void audioDB::index_index_db(const char* dbName){ | |
199 char* newIndexName; | |
200 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0; | |
201 Uns32T dbVectors = 0; | |
202 | |
203 | |
204 printf("INDEX: initializing header\n"); | |
205 // Check if audioDB exists, initialize header and open database for read | |
206 forWrite = false; | |
207 initDBHeader(dbName); | |
208 | |
209 if(dbH->flags & O2_FLAG_POWER) | |
210 usingPower = true; | |
211 | |
212 if(dbH->flags & O2_FLAG_TIMES) | |
213 usingTimes = true; | |
214 | |
215 newIndexName = index_get_name(dbName, radius, sequenceLength); | |
216 | |
217 // Set unit norming flag override | |
218 audioDB::normalizedDistance = !audioDB::no_unit_norming; | |
219 | |
220 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim); | |
221 VERB_LOG(1, "INDEX: R %f\n", radius); | |
222 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength); | |
223 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w); | |
224 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k); | |
225 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m); | |
226 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N); | |
227 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols); | |
228 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b); | |
229 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false"); | |
230 | |
231 if((lshfid = open(newIndexName,O_RDONLY))<0){ | |
232 printf("INDEX: constructing new LSH index\n"); | |
233 printf("INDEX: making index file %s\n", newIndexName); | |
234 fflush(stdout); | |
235 // Construct new LSH index | |
236 lsh = new LSH((float)lsh_param_w, lsh_param_k, | |
237 lsh_param_m, | |
238 (Uns32T)(sequenceLength*dbH->dim), | |
239 lsh_param_N, | |
240 lsh_param_ncols, | |
241 (float)radius); | |
242 assert(lsh); | |
243 | |
244 Uns32T endTrack = lsh_param_b; | |
245 if( endTrack > dbH->numFiles) | |
246 endTrack = dbH->numFiles; | |
247 // Insert up to lsh_param_b tracks | |
248 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){ | |
249 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); | |
250 } | |
251 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); | |
252 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); | |
253 | |
254 // Clean up | |
255 delete lsh; | |
256 lsh = 0; | |
257 close(lshfid); | |
258 } | |
259 | |
260 // Attempt to open LSH file | |
261 if((lshfid = open(newIndexName,O_RDONLY))>0){ | |
262 printf("INDEX: merging with existing LSH index\n"); | |
263 fflush(stdout); | |
264 char* mergeIndexName = newIndexName; | |
265 | |
266 // Get the lsh header info and find how many tracks are inserted already | |
267 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here | |
268 assert(lsh); | |
269 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1; | |
270 delete lsh; | |
271 lsh = 0; | |
272 | |
273 // Insert up to lsh_param_b tracks | |
274 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){ | |
275 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors); | |
276 } | |
277 // This allows for updating index after more tracks are inserted into audioDB | |
278 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){ | |
279 | |
280 Uns32T endTrack = startTrack + lsh_param_b; | |
281 if( endTrack > dbH->numFiles) | |
282 endTrack = dbH->numFiles; | |
283 printf("Indexing track range: %d - %d\n", startTrack, endTrack); | |
284 fflush(stdout); | |
285 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables | |
286 assert(lsh); | |
287 | |
288 // Insert up to lsh_param_b database tracks | |
289 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr); | |
290 | |
291 // Serialize to file (merging is performed here) | |
292 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk | |
293 delete lsh; | |
294 lsh = 0; | |
295 } | |
296 | |
297 close(lshfid); | |
298 printf("INDEX: done constructing LSH index.\n"); | |
299 fflush(stdout); | |
300 | |
301 } | |
302 else{ | |
303 error("Something's wrong with LSH index file"); | |
304 exit(1); | |
305 } | |
306 | |
307 delete[] newIndexName; | |
308 delete[] sNorm; | |
309 delete[] sPower; | |
310 } | |
311 | |
312 | |
313 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) { | |
314 if(usingPower){ | |
315 int one; | |
316 unsigned int count; | |
317 | |
318 count = read(powerfd, &one, sizeof(unsigned int)); | |
319 if (count != sizeof(unsigned int)) { | |
320 error("powerfd read failed", "int", "read"); | |
321 } | |
322 if (one != 1) { | |
323 error("dimensionality of power file not 1", powerFileName); | |
324 } | |
325 | |
326 // FIXME: should check that the powerfile is the right size for | |
327 // this. -- CSR, 2007-10-30 | |
328 count = read(powerfd, powerdata, numVectors * sizeof(double)); | |
329 if (count != numVectors * sizeof(double)) { | |
330 error("powerfd read failed", "double", "read"); | |
331 } | |
332 } | |
333 } | |
334 | |
335 // initialize auxiliary track data from filesystem | |
336 // pre-conditions: | |
337 // dbH->flags & O2_FLAG_LARGE_ADB | |
338 // feature data allocated and copied (fvp) | |
339 // | |
340 // post-conditions: | |
341 // allocated power data | |
342 // allocated l2norm data | |
343 // | |
344 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){ | |
345 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ) | |
346 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB"); | |
347 | |
348 // Allocate and read the power sequence | |
349 if(trackTable[trackID]>=sequenceLength){ | |
350 | |
351 char* prefixedString = new char[O2_MAXFILESTR]; | |
352 char* tmpStr = prefixedString; | |
353 // Open and check dimensions of power file | |
354 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); | |
355 prefix_name((char ** const)&prefixedString, adb_feature_root); | |
356 if(prefixedString!=tmpStr) | |
357 delete[] tmpStr; | |
358 powerfd = open(prefixedString, O_RDONLY); | |
359 if (powerfd < 0) { | |
360 error("failed to open power file", prefixedString); | |
361 } | |
362 if (fstat(powerfd, &statbuf) < 0) { | |
363 error("fstat error finding size of power file", prefixedString, "fstat"); | |
364 } | |
365 | |
366 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] ) | |
367 error("Dimension mismatch: numPowers != numVectors", prefixedString); | |
368 | |
369 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values | |
370 assert(*sPowerp); | |
371 *spPtrp = *sPowerp; | |
372 insertPowerData(trackTable[trackID], powerfd, *sPowerp); | |
373 if (0 < powerfd) { | |
374 close(powerfd); | |
375 } | |
376 | |
377 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength); | |
378 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength); | |
379 powerTable = 0; | |
380 | |
381 // Allocate and calculate the l2norm sequence | |
382 *sNormpp = new double[trackTable[trackID]]; | |
383 assert(*sNormpp); | |
384 *snPtrp = *sNormpp; | |
385 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp); | |
386 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength); | |
387 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength); | |
388 } | |
389 } | |
390 | |
391 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track, | |
392 double** fvpp, double** sNormpp,double** snPtrp, | |
393 double** sPowerp, double** spPtrp){ | |
394 size_t nfv = 0; | |
395 double* fvp = 0; // Keep pointer for memory allocation and free() for track data | |
396 Uns32T trackID = 0; | |
397 | |
398 VERB_LOG(1, "indexing tracks..."); | |
399 | |
400 int trackfd = dbfid; | |
401 for(trackID = start_track ; trackID < end_track ; trackID++ ){ | |
402 if( dbH->flags & O2_FLAG_LARGE_ADB ){ | |
403 char* prefixedString = new char[O2_MAXFILESTR]; | |
404 char* tmpStr = prefixedString; | |
405 // Open and check dimensions of feature file | |
406 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR); | |
407 prefix_name((char ** const) &prefixedString, adb_feature_root); | |
408 if(prefixedString!=tmpStr) | |
409 delete[] tmpStr; | |
410 initInputFile(prefixedString); | |
411 trackfd = infid; | |
412 } | |
413 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv)) | |
414 error("failed to read data"); | |
415 *fvpp = fvp; // Protect memory allocation and free() for track data | |
416 | |
417 if( dbH->flags & O2_FLAG_LARGE_ADB ) | |
418 // Load power and calculate power and l2norm sequence sums | |
419 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp); | |
420 | |
421 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp)) | |
422 break; | |
423 if ( dbH->flags & O2_FLAG_LARGE_ADB ){ | |
424 close(infid); | |
425 delete[] *sNormpp; | |
426 delete[] *sPowerp; | |
427 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0; | |
428 } | |
429 } // end for(trackID = start_track ; ... ) | |
430 std::cout << "finished inserting." << endl; | |
431 } | |
432 | |
433 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){ | |
434 // Loop over the current input track's vectors | |
435 Uns32T numVecs = 0; | |
436 if (trackTable[trackID] > O2_MAXTRACKLEN) { | |
437 if (O2_MAXTRACKLEN < sequenceLength - 1) { | |
438 numVecs = 0; | |
439 } else { | |
440 numVecs = O2_MAXTRACKLEN - sequenceLength + 1; | |
441 } | |
442 } else { | |
443 if (trackTable[trackID] < sequenceLength - 1) { | |
444 numVecs = 0; | |
445 } else { | |
446 numVecs = trackTable[trackID] - sequenceLength + 1; | |
447 } | |
448 } | |
449 | |
450 Uns32T numVecsAboveThreshold = 0, collisionCount = 0; | |
451 if(numVecs){ | |
452 vv = index_initialize_shingles(numVecs); | |
453 | |
454 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ ) | |
455 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength); | |
456 | |
457 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp); | |
458 collisionCount = index_insert_shingles(vv, trackID, *sppp); | |
459 } | |
460 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0; | |
461 | |
462 /* index_norm_shingles() only goes as far as the end of the | |
463 sequence, which is right, but the space allocated is for the | |
464 whole track. */ | |
465 | |
466 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN | |
467 * So let's be certain the pointers are in the correct place | |
468 */ | |
469 | |
470 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){ | |
471 *snpp += trackTable[trackID]; | |
472 *sppp += trackTable[trackID]; | |
473 *fvpp += trackTable[trackID] * dbH->dim; | |
474 } | |
475 | |
476 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl; | |
477 std::cout.flush(); | |
478 return true; | |
479 } | |
480 | |
481 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){ | |
482 Uns32T collisionCount = 0; | |
483 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE; | |
484 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){ | |
485 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold))) | |
486 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits)); | |
487 spp+=sequenceHop; | |
488 } | |
489 return collisionCount; | |
490 } | |
491 | |
490 /*********************** LSH retrieval ****************************/ | 492 /*********************** LSH retrieval ****************************/ |
491 | 493 |
492 | 494 |
493 // return true if indexed query performed else return false | 495 // return true if indexed query performed else return false |
494 int audioDB::index_init_query(const char* dbName){ | 496 int audioDB::index_init_query(const char* dbName){ |
607 Uns32T Nq = (qpointers.nvectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:qpointers.nvectors) - sequenceLength + 1; | 609 Uns32T Nq = (qpointers.nvectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:qpointers.nvectors) - sequenceLength + 1; |
608 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles | 610 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles |
609 | 611 |
610 // Construct shingles from query features | 612 // Construct shingles from query features |
611 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ ) | 613 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ ) |
612 index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength); | 614 audiodb_index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength); |
613 | 615 |
614 // Normalize query vectors | 616 // Normalize query vectors |
615 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qpointers.l2norm, qpointers.power ); | 617 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qpointers.l2norm, qpointers.power ); |
616 | 618 |
617 // Nq contains number of inspected points in query file, | 619 // Nq contains number of inspected points in query file, |