Mercurial > hg > audiodb
comparison query.cpp @ 204:2ea1908707c7 refactoring
Filewise refactor.
Break apart huge monolithic audioDB.cpp file into seven broadly
independent portions:
* SOAP
* DB creation
* insertion
* query
* dump
* common functionality
* constructor functions
Remove the "using namespace std" from the header file, though that
wasn't actually a problem: the problem in question is solved by
including adb.nsmap in only soap.cpp.
Makefile improvements.
author | mas01cr |
---|---|
date | Wed, 28 Nov 2007 15:10:28 +0000 |
parents | |
children | 3c7c8b84e4f3 |
comparison
equal
deleted
inserted
replaced
203:4b05c5bbf06d | 204:2ea1908707c7 |
---|---|
1 #include "audioDB.h" | |
2 | |
3 bool audioDB::powers_acceptable(double p1, double p2) { | |
4 if (use_absolute_threshold) { | |
5 if ((p1 < absolute_threshold) || (p2 < absolute_threshold)) { | |
6 return false; | |
7 } | |
8 } | |
9 if (use_relative_threshold) { | |
10 if (fabs(p1-p2) > fabs(relative_threshold)) { | |
11 return false; | |
12 } | |
13 } | |
14 return true; | |
15 } | |
16 | |
17 void audioDB::query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ | |
18 switch(queryType){ | |
19 case O2_POINT_QUERY: | |
20 pointQuery(dbName, inFile, adbQueryResponse); | |
21 break; | |
22 case O2_SEQUENCE_QUERY: | |
23 if(radius==0) | |
24 trackSequenceQueryNN(dbName, inFile, adbQueryResponse); | |
25 else | |
26 trackSequenceQueryRad(dbName, inFile, adbQueryResponse); | |
27 break; | |
28 case O2_TRACK_QUERY: | |
29 trackPointQuery(dbName, inFile, adbQueryResponse); | |
30 break; | |
31 default: | |
32 error("unrecognized queryType in query()"); | |
33 | |
34 } | |
35 } | |
36 | |
37 //return ordinal position of key in keyTable | |
38 unsigned audioDB::getKeyPos(char* key){ | |
39 for(unsigned k=0; k<dbH->numFiles; k++) | |
40 if(strncmp(fileTable + k*O2_FILETABLESIZE, key, strlen(key))==0) | |
41 return k; | |
42 error("Key not found",key); | |
43 return O2_ERR_KEYNOTFOUND; | |
44 } | |
45 | |
46 // Basic point query engine | |
47 void audioDB::pointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { | |
48 | |
49 initTables(dbName, inFile); | |
50 | |
51 // For each input vector, find the closest pointNN matching output vectors and report | |
52 // we use stdout in this stub version | |
53 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | |
54 | |
55 double* query = (double*)(indata+sizeof(int)); | |
56 CHECKED_MMAP(double *, dataBuf, dbH->dataOffset, dataBufLength); | |
57 double* data = dataBuf; | |
58 double* queryCopy = 0; | |
59 | |
60 if( dbH->flags & O2_FLAG_L2NORM ){ | |
61 // Make a copy of the query | |
62 queryCopy = new double[numVectors*dbH->dim]; | |
63 qNorm = new double[numVectors]; | |
64 assert(queryCopy&&qNorm); | |
65 memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); | |
66 unitNorm(queryCopy, dbH->dim, numVectors, qNorm); | |
67 query = queryCopy; | |
68 } | |
69 | |
70 // Make temporary dynamic memory for results | |
71 assert(pointNN>0 && pointNN<=O2_MAXNN); | |
72 double distances[pointNN]; | |
73 unsigned qIndexes[pointNN]; | |
74 unsigned sIndexes[pointNN]; | |
75 for(unsigned k=0; k<pointNN; k++){ | |
76 distances[k]=-DBL_MAX; | |
77 qIndexes[k]=~0; | |
78 sIndexes[k]=~0; | |
79 } | |
80 | |
81 unsigned j=numVectors; | |
82 unsigned k,l,n; | |
83 double thisDist; | |
84 | |
85 unsigned totalVecs=dbH->length/(dbH->dim*sizeof(double)); | |
86 double meanQdur = 0; | |
87 double *timesdata = 0; | |
88 double *querydurs = 0; | |
89 double *dbdurs = 0; | |
90 | |
91 if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ | |
92 std::cerr << "warning: ignoring query timestamps for non-timestamped database" << std::endl; | |
93 usingTimes=0; | |
94 } | |
95 | |
96 else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) | |
97 std::cerr << "warning: no timestamps given for query. Ignoring database timestamps." << std::endl; | |
98 | |
99 else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ | |
100 timesdata = new double[2*numVectors]; | |
101 querydurs = new double[numVectors]; | |
102 insertTimeStamps(numVectors, timesFile, timesdata); | |
103 // Calculate durations of points | |
104 for(k=0; k<numVectors-1; k++){ | |
105 querydurs[k]=timesdata[2*k+1]-timesdata[2*k]; | |
106 meanQdur+=querydurs[k]; | |
107 } | |
108 meanQdur/=k; | |
109 // Individual exhaustive timepoint durations | |
110 dbdurs = new double[totalVecs]; | |
111 for(k=0; k<totalVecs-1; k++) { | |
112 dbdurs[k]=timesTable[2*k+1]-timesTable[2*k]; | |
113 } | |
114 } | |
115 | |
116 if(usingQueryPoint) | |
117 if(queryPoint>numVectors-1) | |
118 error("queryPoint > numVectors in query"); | |
119 else{ | |
120 if(verbosity>1) { | |
121 std::cerr << "query point: " << queryPoint << std::endl; std::cerr.flush(); | |
122 } | |
123 query=query+queryPoint*dbH->dim; | |
124 numVectors=queryPoint+1; | |
125 j=1; | |
126 } | |
127 | |
128 gettimeofday(&tv1, NULL); | |
129 while(j--){ // query | |
130 data=dataBuf; | |
131 k=totalVecs; // number of database vectors | |
132 while(k--){ // database | |
133 thisDist=0; | |
134 l=dbH->dim; | |
135 double* q=query; | |
136 while(l--) | |
137 thisDist+=*q++**data++; | |
138 if(!usingTimes || | |
139 (usingTimes | |
140 && fabs(dbdurs[totalVecs-k-1]-querydurs[numVectors-j-1])<querydurs[numVectors-j-1]*timesTol)){ | |
141 n=pointNN; | |
142 while(n--){ | |
143 if(thisDist>=distances[n]){ | |
144 if((n==0 || thisDist<=distances[n-1])){ | |
145 // Copy all values above up the queue | |
146 for( l=pointNN-1 ; l >= n+1 ; l--){ | |
147 distances[l]=distances[l-1]; | |
148 qIndexes[l]=qIndexes[l-1]; | |
149 sIndexes[l]=sIndexes[l-1]; | |
150 } | |
151 distances[n]=thisDist; | |
152 qIndexes[n]=numVectors-j-1; | |
153 sIndexes[n]=dbH->length/(sizeof(double)*dbH->dim)-k-1; | |
154 break; | |
155 } | |
156 } | |
157 else | |
158 break; | |
159 } | |
160 } | |
161 } | |
162 // Move query pointer to next query point | |
163 query+=dbH->dim; | |
164 } | |
165 | |
166 gettimeofday(&tv2, NULL); | |
167 if(verbosity>1) { | |
168 std::cerr << std::endl << " elapsed time:" << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << std::endl; | |
169 } | |
170 | |
171 if(adbQueryResponse==0){ | |
172 // Output answer | |
173 // Loop over nearest neighbours | |
174 for(k=0; k < pointNN; k++){ | |
175 // Scan for key | |
176 unsigned cumTrack=0; | |
177 for(l=0 ; l<dbH->numFiles; l++){ | |
178 cumTrack+=trackTable[l]; | |
179 if(sIndexes[k]<cumTrack){ | |
180 std::cout << fileTable+l*O2_FILETABLESIZE << " " << distances[k] << " " << qIndexes[k] << " " | |
181 << sIndexes[k]+trackTable[l]-cumTrack << std::endl; | |
182 break; | |
183 } | |
184 } | |
185 } | |
186 } | |
187 else{ // Process Web Services Query | |
188 int listLen; | |
189 for(k = 0; k < pointNN; k++) { | |
190 if(distances[k] == -DBL_MAX) | |
191 break; | |
192 } | |
193 listLen = k; | |
194 | |
195 adbQueryResponse->result.__sizeRlist=listLen; | |
196 adbQueryResponse->result.__sizeDist=listLen; | |
197 adbQueryResponse->result.__sizeQpos=listLen; | |
198 adbQueryResponse->result.__sizeSpos=listLen; | |
199 adbQueryResponse->result.Rlist= new char*[listLen]; | |
200 adbQueryResponse->result.Dist = new double[listLen]; | |
201 adbQueryResponse->result.Qpos = new unsigned int[listLen]; | |
202 adbQueryResponse->result.Spos = new unsigned int[listLen]; | |
203 for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ | |
204 adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; | |
205 adbQueryResponse->result.Dist[k]=distances[k]; | |
206 adbQueryResponse->result.Qpos[k]=qIndexes[k]; | |
207 unsigned cumTrack=0; | |
208 for(l=0 ; l<dbH->numFiles; l++){ | |
209 cumTrack+=trackTable[l]; | |
210 if(sIndexes[k]<cumTrack){ | |
211 sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+l*O2_FILETABLESIZE); | |
212 break; | |
213 } | |
214 } | |
215 adbQueryResponse->result.Spos[k]=sIndexes[k]+trackTable[l]-cumTrack; | |
216 } | |
217 } | |
218 | |
219 // Clean up | |
220 if(queryCopy) | |
221 delete queryCopy; | |
222 if(qNorm) | |
223 delete qNorm; | |
224 if(timesdata) | |
225 delete[] timesdata; | |
226 if(querydurs) | |
227 delete[] querydurs; | |
228 if(dbdurs) | |
229 delete dbdurs; | |
230 } | |
231 | |
232 // trackPointQuery | |
233 // return the trackNN closest tracks to the query track | |
234 // uses average of pointNN points per track | |
235 void audioDB::trackPointQuery(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse) { | |
236 initTables(dbName, inFile); | |
237 | |
238 // For each input vector, find the closest pointNN matching output vectors and report | |
239 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | |
240 double* query = (double*)(indata+sizeof(int)); | |
241 double* data; | |
242 double* queryCopy = 0; | |
243 | |
244 if( dbH->flags & O2_FLAG_L2NORM ){ | |
245 // Make a copy of the query | |
246 queryCopy = new double[numVectors*dbH->dim]; | |
247 qNorm = new double[numVectors]; | |
248 assert(queryCopy&&qNorm); | |
249 memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); | |
250 unitNorm(queryCopy, dbH->dim, numVectors, qNorm); | |
251 query = queryCopy; | |
252 } | |
253 | |
254 assert(pointNN>0 && pointNN<=O2_MAXNN); | |
255 assert(trackNN>0 && trackNN<=O2_MAXNN); | |
256 | |
257 // Make temporary dynamic memory for results | |
258 double trackDistances[trackNN]; | |
259 unsigned trackIDs[trackNN]; | |
260 unsigned trackQIndexes[trackNN]; | |
261 unsigned trackSIndexes[trackNN]; | |
262 | |
263 double distances[pointNN]; | |
264 unsigned qIndexes[pointNN]; | |
265 unsigned sIndexes[pointNN]; | |
266 | |
267 unsigned j=numVectors; // number of query points | |
268 unsigned k,l,n, track, trackOffset=0, processedTracks=0; | |
269 double thisDist; | |
270 | |
271 for(k=0; k<pointNN; k++){ | |
272 distances[k]=-DBL_MAX; | |
273 qIndexes[k]=~0; | |
274 sIndexes[k]=~0; | |
275 } | |
276 | |
277 for(k=0; k<trackNN; k++){ | |
278 trackDistances[k]=-DBL_MAX; | |
279 trackQIndexes[k]=~0; | |
280 trackSIndexes[k]=~0; | |
281 trackIDs[k]=~0; | |
282 } | |
283 | |
284 double meanQdur = 0; | |
285 double *timesdata = 0; | |
286 double *querydurs = 0; | |
287 double *meanDBdur = 0; | |
288 | |
289 if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ | |
290 std::cerr << "warning: ignoring query timestamps for non-timestamped database" << std::endl; | |
291 usingTimes=0; | |
292 } | |
293 | |
294 else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) | |
295 std::cerr << "warning: no timestamps given for query. Ignoring database timestamps." << std::endl; | |
296 | |
297 else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ | |
298 timesdata = new double[2*numVectors]; | |
299 querydurs = new double[numVectors]; | |
300 insertTimeStamps(numVectors, timesFile, timesdata); | |
301 // Calculate durations of points | |
302 for(k=0; k<numVectors-1; k++) { | |
303 querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; | |
304 meanQdur += querydurs[k]; | |
305 } | |
306 meanQdur/=k; | |
307 meanDBdur = new double[dbH->numFiles]; | |
308 for(k=0; k<dbH->numFiles; k++){ | |
309 meanDBdur[k]=0.0; | |
310 for(j=0; j<trackTable[k]-1 ; j++) { | |
311 meanDBdur[k]+=timesTable[2*j+1]-timesTable[2*j]; | |
312 } | |
313 meanDBdur[k]/=j; | |
314 } | |
315 } | |
316 | |
317 if(usingQueryPoint) | |
318 if(queryPoint>numVectors-1) | |
319 error("queryPoint > numVectors in query"); | |
320 else{ | |
321 if(verbosity>1) { | |
322 std::cerr << "query point: " << queryPoint << std::endl; std::cerr.flush(); | |
323 } | |
324 query=query+queryPoint*dbH->dim; | |
325 numVectors=queryPoint+1; | |
326 } | |
327 | |
328 // build track offset table | |
329 off_t *trackOffsetTable = new off_t[dbH->numFiles]; | |
330 unsigned cumTrack=0; | |
331 off_t trackIndexOffset; | |
332 for(k=0; k<dbH->numFiles;k++){ | |
333 trackOffsetTable[k]=cumTrack; | |
334 cumTrack+=trackTable[k]*dbH->dim; | |
335 } | |
336 | |
337 char nextKey[MAXSTR]; | |
338 | |
339 gettimeofday(&tv1, NULL); | |
340 | |
341 size_t data_buffer_size = 0; | |
342 double *data_buffer = 0; | |
343 lseek(dbfid, dbH->dataOffset, SEEK_SET); | |
344 | |
345 for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ | |
346 | |
347 trackOffset = trackOffsetTable[track]; // numDoubles offset | |
348 | |
349 // get trackID from file if using a control file | |
350 if(trackFile) { | |
351 trackFile->getline(nextKey,MAXSTR); | |
352 if(!trackFile->eof()) { | |
353 track = getKeyPos(nextKey); | |
354 trackOffset = trackOffsetTable[track]; | |
355 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); | |
356 } else { | |
357 break; | |
358 } | |
359 } | |
360 | |
361 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset | |
362 | |
363 if(verbosity>7) { | |
364 std::cerr << track << "." << trackOffset/(dbH->dim) << "." << trackTable[track] << " | ";std::cerr.flush(); | |
365 } | |
366 | |
367 if(dbH->flags & O2_FLAG_L2NORM) | |
368 usingQueryPoint?query=queryCopy+queryPoint*dbH->dim:query=queryCopy; | |
369 else | |
370 usingQueryPoint?query=(double*)(indata+sizeof(int))+queryPoint*dbH->dim:query=(double*)(indata+sizeof(int)); | |
371 if(usingQueryPoint) | |
372 j=1; | |
373 else | |
374 j=numVectors; | |
375 | |
376 if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { | |
377 if(data_buffer) { | |
378 free(data_buffer); | |
379 } | |
380 { | |
381 data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; | |
382 void *tmp = malloc(data_buffer_size); | |
383 if (tmp == NULL) { | |
384 error("error allocating data buffer"); | |
385 } | |
386 data_buffer = (double *) tmp; | |
387 } | |
388 } | |
389 | |
390 read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); | |
391 | |
392 while(j--){ | |
393 k=trackTable[track]; // number of vectors in track | |
394 data=data_buffer; // data for track | |
395 while(k--){ | |
396 thisDist=0; | |
397 l=dbH->dim; | |
398 double* q=query; | |
399 while(l--) | |
400 thisDist+=*q++**data++; | |
401 if(!usingTimes || | |
402 (usingTimes | |
403 && fabs(meanDBdur[track]-meanQdur)<meanQdur*timesTol)){ | |
404 n=pointNN; | |
405 while(n--){ | |
406 if(thisDist>=distances[n]){ | |
407 if((n==0 || thisDist<=distances[n-1])){ | |
408 // Copy all values above up the queue | |
409 for( l=pointNN-1 ; l > n ; l--){ | |
410 distances[l]=distances[l-1]; | |
411 qIndexes[l]=qIndexes[l-1]; | |
412 sIndexes[l]=sIndexes[l-1]; | |
413 } | |
414 distances[n]=thisDist; | |
415 qIndexes[n]=numVectors-j-1; | |
416 sIndexes[n]=trackTable[track]-k-1; | |
417 break; | |
418 } | |
419 } | |
420 else | |
421 break; | |
422 } | |
423 } | |
424 } // track | |
425 // Move query pointer to next query point | |
426 query+=dbH->dim; | |
427 } // query | |
428 // Take the average of this track's distance | |
429 // Test the track distances | |
430 thisDist=0; | |
431 for (n = 0; n < pointNN; n++) { | |
432 if (distances[n] == -DBL_MAX) break; | |
433 thisDist += distances[n]; | |
434 } | |
435 thisDist /= n; | |
436 | |
437 n=trackNN; | |
438 while(n--){ | |
439 if(thisDist>=trackDistances[n]){ | |
440 if((n==0 || thisDist<=trackDistances[n-1])){ | |
441 // Copy all values above up the queue | |
442 for( l=trackNN-1 ; l > n ; l--){ | |
443 trackDistances[l]=trackDistances[l-1]; | |
444 trackQIndexes[l]=trackQIndexes[l-1]; | |
445 trackSIndexes[l]=trackSIndexes[l-1]; | |
446 trackIDs[l]=trackIDs[l-1]; | |
447 } | |
448 trackDistances[n]=thisDist; | |
449 trackQIndexes[n]=qIndexes[0]; | |
450 trackSIndexes[n]=sIndexes[0]; | |
451 trackIDs[n]=track; | |
452 break; | |
453 } | |
454 } | |
455 else | |
456 break; | |
457 } | |
458 for(unsigned k=0; k<pointNN; k++){ | |
459 distances[k]=-DBL_MAX; | |
460 qIndexes[k]=~0; | |
461 sIndexes[k]=~0; | |
462 } | |
463 } // tracks | |
464 | |
465 free(data_buffer); | |
466 | |
467 gettimeofday(&tv2, NULL); | |
468 | |
469 if(verbosity>1) { | |
470 std::cerr << std::endl << "processed tracks :" << processedTracks | |
471 << " elapsed time:" << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << std::endl; | |
472 } | |
473 | |
474 if(adbQueryResponse==0){ | |
475 if(verbosity>1) { | |
476 std::cerr<<std::endl; | |
477 } | |
478 // Output answer | |
479 // Loop over nearest neighbours | |
480 for(k=0; k < std::min(trackNN,processedTracks); k++) | |
481 std::cout << fileTable+trackIDs[k]*O2_FILETABLESIZE | |
482 << " " << trackDistances[k] << " " << trackQIndexes[k] << " " << trackSIndexes[k] << std::endl; | |
483 } | |
484 else{ // Process Web Services Query | |
485 int listLen = std::min(trackNN, processedTracks); | |
486 adbQueryResponse->result.__sizeRlist=listLen; | |
487 adbQueryResponse->result.__sizeDist=listLen; | |
488 adbQueryResponse->result.__sizeQpos=listLen; | |
489 adbQueryResponse->result.__sizeSpos=listLen; | |
490 adbQueryResponse->result.Rlist= new char*[listLen]; | |
491 adbQueryResponse->result.Dist = new double[listLen]; | |
492 adbQueryResponse->result.Qpos = new unsigned int[listLen]; | |
493 adbQueryResponse->result.Spos = new unsigned int[listLen]; | |
494 for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ | |
495 adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; | |
496 adbQueryResponse->result.Dist[k]=trackDistances[k]; | |
497 adbQueryResponse->result.Qpos[k]=trackQIndexes[k]; | |
498 adbQueryResponse->result.Spos[k]=trackSIndexes[k]; | |
499 sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+trackIDs[k]*O2_FILETABLESIZE); | |
500 } | |
501 } | |
502 | |
503 // Clean up | |
504 if(trackOffsetTable) | |
505 delete trackOffsetTable; | |
506 if(queryCopy) | |
507 delete queryCopy; | |
508 if(qNorm) | |
509 delete qNorm; | |
510 if(timesdata) | |
511 delete[] timesdata; | |
512 if(querydurs) | |
513 delete[] querydurs; | |
514 if(meanDBdur) | |
515 delete meanDBdur; | |
516 } | |
517 | |
518 // This is a common pattern in sequence queries: what we are doing is | |
519 // taking a window of length seqlen over a buffer of length length, | |
520 // and placing the sum of the elements in that window in the first | |
521 // element of the window: thus replacing all but the last seqlen | |
522 // elements in the buffer the corresponding windowed sum. | |
523 void audioDB::sequence_sum(double *buffer, int length, int seqlen) { | |
524 double tmp1, tmp2, *ps; | |
525 int j, w; | |
526 | |
527 tmp1 = *buffer; | |
528 j = 1; | |
529 w = seqlen - 1; | |
530 while(w--) { | |
531 *buffer += buffer[j++]; | |
532 } | |
533 ps = buffer + 1; | |
534 w = length - seqlen; // +1 - 1 | |
535 while(w--) { | |
536 tmp2 = *ps; | |
537 *ps = *(ps - 1) - tmp1 + *(ps + seqlen - 1); | |
538 tmp1 = tmp2; | |
539 ps++; | |
540 } | |
541 } | |
542 | |
543 void audioDB::sequence_sqrt(double *buffer, int length, int seqlen) { | |
544 int w = length - seqlen + 1; | |
545 while(w--) { | |
546 *buffer = sqrt(*buffer); | |
547 buffer++; | |
548 } | |
549 } | |
550 | |
551 void audioDB::sequence_average(double *buffer, int length, int seqlen) { | |
552 int w = length - seqlen + 1; | |
553 while(w--) { | |
554 *buffer /= seqlen; | |
555 buffer++; | |
556 } | |
557 } | |
558 | |
559 // k nearest-neighbor (k-NN) search between query and target tracks | |
560 // efficient implementation based on matched filter | |
561 // assumes normed shingles | |
562 // outputs distances of retrieved shingles, max retreived = pointNN shingles per per track | |
563 void audioDB::trackSequenceQueryNN(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ | |
564 | |
565 initTables(dbName, inFile); | |
566 | |
567 // For each input vector, find the closest pointNN matching output vectors and report | |
568 // we use stdout in this stub version | |
569 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | |
570 double* query = (double*)(indata+sizeof(int)); | |
571 double* queryCopy = 0; | |
572 | |
573 if(!(dbH->flags & O2_FLAG_L2NORM) ) | |
574 error("Database must be L2 normed for sequence query","use -L2NORM"); | |
575 | |
576 if(numVectors<sequenceLength) | |
577 error("Query shorter than requested sequence length", "maybe use -l"); | |
578 | |
579 if(verbosity>1) { | |
580 std::cerr << "performing norms ... "; std::cerr.flush(); | |
581 } | |
582 unsigned dbVectors = dbH->length/(sizeof(double)*dbH->dim); | |
583 | |
584 // Make a copy of the query | |
585 queryCopy = new double[numVectors*dbH->dim]; | |
586 memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); | |
587 qNorm = new double[numVectors]; | |
588 sNorm = new double[dbVectors]; | |
589 assert(qNorm&&sNorm&&queryCopy&&sequenceLength); | |
590 unitNorm(queryCopy, dbH->dim, numVectors, qNorm); | |
591 query = queryCopy; | |
592 | |
593 // Make norm measurements relative to sequenceLength | |
594 unsigned w = sequenceLength-1; | |
595 unsigned i,j; | |
596 | |
597 // Copy the L2 norm values to core to avoid disk random access later on | |
598 memcpy(sNorm, l2normTable, dbVectors*sizeof(double)); | |
599 double* qnPtr = qNorm; | |
600 double* snPtr = sNorm; | |
601 | |
602 double *sPower = 0, *qPower = 0; | |
603 double *spPtr = 0, *qpPtr = 0; | |
604 | |
605 if (usingPower) { | |
606 if (!(dbH->flags & O2_FLAG_POWER)) { | |
607 error("database not power-enabled", dbName); | |
608 } | |
609 sPower = new double[dbVectors]; | |
610 spPtr = sPower; | |
611 memcpy(sPower, powerTable, dbVectors * sizeof(double)); | |
612 } | |
613 | |
614 for(i=0; i<dbH->numFiles; i++){ | |
615 if(trackTable[i]>=sequenceLength) { | |
616 sequence_sum(snPtr, trackTable[i], sequenceLength); | |
617 sequence_sqrt(snPtr, trackTable[i], sequenceLength); | |
618 | |
619 if (usingPower) { | |
620 sequence_sum(spPtr, trackTable[i], sequenceLength); | |
621 sequence_average(spPtr, trackTable[i], sequenceLength); | |
622 } | |
623 } | |
624 snPtr += trackTable[i]; | |
625 if (usingPower) { | |
626 spPtr += trackTable[i]; | |
627 } | |
628 } | |
629 | |
630 sequence_sum(qnPtr, numVectors, sequenceLength); | |
631 sequence_sqrt(qnPtr, numVectors, sequenceLength); | |
632 | |
633 if (usingPower) { | |
634 qPower = new double[numVectors]; | |
635 qpPtr = qPower; | |
636 if (lseek(powerfd, sizeof(int), SEEK_SET) == (off_t) -1) { | |
637 error("error seeking to data", powerFileName, "lseek"); | |
638 } | |
639 int count = read(powerfd, qPower, numVectors * sizeof(double)); | |
640 if (count == -1) { | |
641 error("error reading data", powerFileName, "read"); | |
642 } | |
643 if ((unsigned) count != numVectors * sizeof(double)) { | |
644 error("short read", powerFileName); | |
645 } | |
646 | |
647 sequence_sum(qpPtr, numVectors, sequenceLength); | |
648 sequence_average(qpPtr, numVectors, sequenceLength); | |
649 } | |
650 | |
651 if(verbosity>1) { | |
652 std::cerr << "done." << std::endl; | |
653 } | |
654 | |
655 if(verbosity>1) { | |
656 std::cerr << "matching tracks..." << std::endl; | |
657 } | |
658 | |
659 assert(pointNN>0 && pointNN<=O2_MAXNN); | |
660 assert(trackNN>0 && trackNN<=O2_MAXNN); | |
661 | |
662 // Make temporary dynamic memory for results | |
663 double trackDistances[trackNN]; | |
664 unsigned trackIDs[trackNN]; | |
665 unsigned trackQIndexes[trackNN]; | |
666 unsigned trackSIndexes[trackNN]; | |
667 | |
668 double distances[pointNN]; | |
669 unsigned qIndexes[pointNN]; | |
670 unsigned sIndexes[pointNN]; | |
671 | |
672 | |
673 unsigned k,l,m,n,track,trackOffset=0, HOP_SIZE=sequenceHop, wL=sequenceLength; | |
674 double thisDist; | |
675 | |
676 for(k=0; k<pointNN; k++){ | |
677 distances[k]=1.0e6; | |
678 qIndexes[k]=~0; | |
679 sIndexes[k]=~0; | |
680 } | |
681 | |
682 for(k=0; k<trackNN; k++){ | |
683 trackDistances[k]=1.0e6; | |
684 trackQIndexes[k]=~0; | |
685 trackSIndexes[k]=~0; | |
686 trackIDs[k]=~0; | |
687 } | |
688 | |
689 // Timestamp and durations processing | |
690 double meanQdur = 0; | |
691 double *timesdata = 0; | |
692 double *querydurs = 0; | |
693 double *meanDBdur = 0; | |
694 | |
695 if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ | |
696 std::cerr << "warning: ignoring query timestamps for non-timestamped database" << std::endl; | |
697 usingTimes=0; | |
698 } | |
699 | |
700 else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) | |
701 std::cerr << "warning: no timestamps given for query. Ignoring database timestamps." << std::endl; | |
702 | |
703 else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ | |
704 timesdata = new double[2*numVectors]; | |
705 querydurs = new double[numVectors]; | |
706 | |
707 insertTimeStamps(numVectors, timesFile, timesdata); | |
708 // Calculate durations of points | |
709 for(k=0; k<numVectors-1; k++) { | |
710 querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; | |
711 meanQdur += querydurs[k]; | |
712 } | |
713 meanQdur/=k; | |
714 if(verbosity>1) { | |
715 std::cerr << "mean query file duration: " << meanQdur << std::endl; | |
716 } | |
717 meanDBdur = new double[dbH->numFiles]; | |
718 assert(meanDBdur); | |
719 for(k=0; k<dbH->numFiles; k++){ | |
720 meanDBdur[k]=0.0; | |
721 for(j=0; j<trackTable[k]-1 ; j++) { | |
722 meanDBdur[k]+=timesTable[2*j+1]-timesTable[2*j]; | |
723 } | |
724 meanDBdur[k]/=j; | |
725 } | |
726 } | |
727 | |
728 if(usingQueryPoint) | |
729 if(queryPoint>numVectors || queryPoint>numVectors-wL+1) | |
730 error("queryPoint > numVectors-wL+1 in query"); | |
731 else{ | |
732 if(verbosity>1) { | |
733 std::cerr << "query point: " << queryPoint << std::endl; std::cerr.flush(); | |
734 } | |
735 query = query + queryPoint * dbH->dim; | |
736 qnPtr = qnPtr + queryPoint; | |
737 if (usingPower) { | |
738 qpPtr = qpPtr + queryPoint; | |
739 } | |
740 numVectors=wL; | |
741 } | |
742 | |
743 double ** D = 0; // Differences query and target | |
744 double ** DD = 0; // Matched filter distance | |
745 | |
746 D = new double*[numVectors]; | |
747 assert(D); | |
748 DD = new double*[numVectors]; | |
749 assert(DD); | |
750 | |
751 gettimeofday(&tv1, NULL); | |
752 unsigned processedTracks = 0; | |
753 unsigned successfulTracks=0; | |
754 | |
755 double* qp; | |
756 double* sp; | |
757 double* dp; | |
758 | |
759 // build track offset table | |
760 off_t *trackOffsetTable = new off_t[dbH->numFiles]; | |
761 unsigned cumTrack=0; | |
762 off_t trackIndexOffset; | |
763 for(k=0; k<dbH->numFiles;k++){ | |
764 trackOffsetTable[k]=cumTrack; | |
765 cumTrack+=trackTable[k]*dbH->dim; | |
766 } | |
767 | |
768 char nextKey [MAXSTR]; | |
769 | |
770 // chi^2 statistics | |
771 double sampleCount = 0; | |
772 double sampleSum = 0; | |
773 double logSampleSum = 0; | |
774 double minSample = 1e9; | |
775 double maxSample = 0; | |
776 | |
777 // Track loop | |
778 size_t data_buffer_size = 0; | |
779 double *data_buffer = 0; | |
780 lseek(dbfid, dbH->dataOffset, SEEK_SET); | |
781 | |
782 for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++) { | |
783 | |
784 trackOffset = trackOffsetTable[track]; // numDoubles offset | |
785 | |
786 // get trackID from file if using a control file | |
787 if(trackFile) { | |
788 trackFile->getline(nextKey,MAXSTR); | |
789 if(!trackFile->eof()) { | |
790 track = getKeyPos(nextKey); | |
791 trackOffset = trackOffsetTable[track]; | |
792 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); | |
793 } else { | |
794 break; | |
795 } | |
796 } | |
797 | |
798 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset | |
799 | |
800 if(sequenceLength<=trackTable[track]){ // test for short sequences | |
801 | |
802 if(verbosity>7) { | |
803 std::cerr << track << "." << trackIndexOffset << "." << trackTable[track] << " | ";std::cerr.flush(); | |
804 } | |
805 | |
806 // Sum products matrix | |
807 for(j=0; j<numVectors;j++){ | |
808 D[j]=new double[trackTable[track]]; | |
809 assert(D[j]); | |
810 | |
811 } | |
812 | |
813 // Matched filter matrix | |
814 for(j=0; j<numVectors;j++){ | |
815 DD[j]=new double[trackTable[track]]; | |
816 assert(DD[j]); | |
817 } | |
818 | |
819 if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { | |
820 if(data_buffer) { | |
821 free(data_buffer); | |
822 } | |
823 { | |
824 data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; | |
825 void *tmp = malloc(data_buffer_size); | |
826 if (tmp == NULL) { | |
827 error("error allocating data buffer"); | |
828 } | |
829 data_buffer = (double *) tmp; | |
830 } | |
831 } | |
832 | |
833 read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); | |
834 | |
835 // Dot product | |
836 for(j=0; j<numVectors; j++) | |
837 for(k=0; k<trackTable[track]; k++){ | |
838 qp=query+j*dbH->dim; | |
839 sp=data_buffer+k*dbH->dim; | |
840 DD[j][k]=0.0; // Initialize matched filter array | |
841 dp=&D[j][k]; // point to correlation cell j,k | |
842 *dp=0.0; // initialize correlation cell | |
843 l=dbH->dim; // size of vectors | |
844 while(l--) | |
845 *dp+=*qp++**sp++; | |
846 } | |
847 | |
848 // Matched Filter | |
849 // HOP SIZE == 1 | |
850 double* spd; | |
851 if(HOP_SIZE==1){ // HOP_SIZE = shingleHop | |
852 for(w=0; w<wL; w++) | |
853 for(j=0; j<numVectors-w; j++){ | |
854 sp=DD[j]; | |
855 spd=D[j+w]+w; | |
856 k=trackTable[track]-w; | |
857 while(k--) | |
858 *sp+++=*spd++; | |
859 } | |
860 } | |
861 | |
862 else{ // HOP_SIZE != 1 | |
863 for(w=0; w<wL; w++) | |
864 for(j=0; j<numVectors-w; j+=HOP_SIZE){ | |
865 sp=DD[j]; | |
866 spd=D[j+w]+w; | |
867 for(k=0; k<trackTable[track]-w; k+=HOP_SIZE){ | |
868 *sp+=*spd; | |
869 sp+=HOP_SIZE; | |
870 spd+=HOP_SIZE; | |
871 } | |
872 } | |
873 } | |
874 | |
875 if(verbosity>3 && usingTimes) { | |
876 std::cerr << "meanQdur=" << meanQdur << " meanDBdur=" << meanDBdur[track] << std::endl; | |
877 std::cerr.flush(); | |
878 } | |
879 | |
880 if(!usingTimes || | |
881 (usingTimes | |
882 && fabs(meanDBdur[track]-meanQdur)<meanQdur*timesTol)){ | |
883 | |
884 if(verbosity>3 && usingTimes) { | |
885 std::cerr << "within duration tolerance." << std::endl; | |
886 std::cerr.flush(); | |
887 } | |
888 | |
889 // Search for minimum distance by shingles (concatenated vectors) | |
890 for(j=0;j<=numVectors-wL;j+=HOP_SIZE) | |
891 for(k=0;k<=trackTable[track]-wL;k+=HOP_SIZE){ | |
892 thisDist=2-(2/(qnPtr[j]*sNorm[trackIndexOffset+k]))*DD[j][k]; | |
893 if(verbosity>9) { | |
894 std::cerr << thisDist << " " << qnPtr[j] << " " << sNorm[trackIndexOffset+k] << std::endl; | |
895 } | |
896 // Gather chi^2 statistics | |
897 if(thisDist<minSample) | |
898 minSample=thisDist; | |
899 else if(thisDist>maxSample) | |
900 maxSample=thisDist; | |
901 if(thisDist>1e-9){ | |
902 sampleCount++; | |
903 sampleSum+=thisDist; | |
904 logSampleSum+=log(thisDist); | |
905 } | |
906 | |
907 // diffL2 = fabs(qnPtr[j] - sNorm[trackIndexOffset+k]); | |
908 // Power test | |
909 if (usingPower) { | |
910 if (!(powers_acceptable(qpPtr[j], sPower[trackIndexOffset + k]))) { | |
911 thisDist = 1000000.0; | |
912 } | |
913 } | |
914 | |
915 // k-NN match algorithm | |
916 m=pointNN; | |
917 while(m--){ | |
918 if(thisDist<=distances[m]) | |
919 if(m==0 || thisDist>=distances[m-1]){ | |
920 // Shuffle distances up the list | |
921 for(l=pointNN-1; l>m; l--){ | |
922 distances[l]=distances[l-1]; | |
923 qIndexes[l]=qIndexes[l-1]; | |
924 sIndexes[l]=sIndexes[l-1]; | |
925 } | |
926 distances[m]=thisDist; | |
927 if(usingQueryPoint) | |
928 qIndexes[m]=queryPoint; | |
929 else | |
930 qIndexes[m]=j; | |
931 sIndexes[m]=k; | |
932 break; | |
933 } | |
934 } | |
935 } | |
936 // Calculate the mean of the N-Best matches | |
937 thisDist=0.0; | |
938 for(m=0; m<pointNN; m++) { | |
939 if (distances[m] == 1000000.0) break; | |
940 thisDist+=distances[m]; | |
941 } | |
942 thisDist/=m; | |
943 | |
944 // Let's see the distances then... | |
945 if(verbosity>3) { | |
946 std::cerr << fileTable+track*O2_FILETABLESIZE << " " << thisDist << std::endl; | |
947 } | |
948 | |
949 | |
950 // All the track stuff goes here | |
951 n=trackNN; | |
952 while(n--){ | |
953 if(thisDist<=trackDistances[n]){ | |
954 if((n==0 || thisDist>=trackDistances[n-1])){ | |
955 // Copy all values above up the queue | |
956 for( l=trackNN-1 ; l > n ; l--){ | |
957 trackDistances[l]=trackDistances[l-1]; | |
958 trackQIndexes[l]=trackQIndexes[l-1]; | |
959 trackSIndexes[l]=trackSIndexes[l-1]; | |
960 trackIDs[l]=trackIDs[l-1]; | |
961 } | |
962 trackDistances[n]=thisDist; | |
963 trackQIndexes[n]=qIndexes[0]; | |
964 trackSIndexes[n]=sIndexes[0]; | |
965 successfulTracks++; | |
966 trackIDs[n]=track; | |
967 break; | |
968 } | |
969 } | |
970 else | |
971 break; | |
972 } | |
973 } // Duration match | |
974 | |
975 // Clean up current track | |
976 if(D!=NULL){ | |
977 for(j=0; j<numVectors; j++) | |
978 delete[] D[j]; | |
979 } | |
980 | |
981 if(DD!=NULL){ | |
982 for(j=0; j<numVectors; j++) | |
983 delete[] DD[j]; | |
984 } | |
985 } | |
986 // per-track reset array values | |
987 for(unsigned k=0; k<pointNN; k++){ | |
988 distances[k]=1.0e6; | |
989 qIndexes[k]=~0; | |
990 sIndexes[k]=~0; | |
991 } | |
992 } | |
993 | |
994 free(data_buffer); | |
995 | |
996 gettimeofday(&tv2,NULL); | |
997 if(verbosity>1) { | |
998 std::cerr << std::endl << "processed tracks :" << processedTracks << " matched tracks: " << successfulTracks << " elapsed time:" | |
999 << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << std::endl; | |
1000 std::cerr << "sampleCount: " << sampleCount << " sampleSum: " << sampleSum << " logSampleSum: " << logSampleSum | |
1001 << " minSample: " << minSample << " maxSample: " << maxSample << std::endl; | |
1002 } | |
1003 if(adbQueryResponse==0){ | |
1004 if(verbosity>1) { | |
1005 std::cerr<<std::endl; | |
1006 } | |
1007 // Output answer | |
1008 // Loop over nearest neighbours | |
1009 for(k=0; k < std::min(trackNN,successfulTracks); k++) | |
1010 std::cout << fileTable+trackIDs[k]*O2_FILETABLESIZE << " " << trackDistances[k] << " " | |
1011 << trackQIndexes[k] << " " << trackSIndexes[k] << std::endl; | |
1012 } | |
1013 else{ // Process Web Services Query | |
1014 int listLen = std::min(trackNN, processedTracks); | |
1015 adbQueryResponse->result.__sizeRlist=listLen; | |
1016 adbQueryResponse->result.__sizeDist=listLen; | |
1017 adbQueryResponse->result.__sizeQpos=listLen; | |
1018 adbQueryResponse->result.__sizeSpos=listLen; | |
1019 adbQueryResponse->result.Rlist= new char*[listLen]; | |
1020 adbQueryResponse->result.Dist = new double[listLen]; | |
1021 adbQueryResponse->result.Qpos = new unsigned int[listLen]; | |
1022 adbQueryResponse->result.Spos = new unsigned int[listLen]; | |
1023 for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ | |
1024 adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; | |
1025 adbQueryResponse->result.Dist[k]=trackDistances[k]; | |
1026 adbQueryResponse->result.Qpos[k]=trackQIndexes[k]; | |
1027 adbQueryResponse->result.Spos[k]=trackSIndexes[k]; | |
1028 sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+trackIDs[k]*O2_FILETABLESIZE); | |
1029 } | |
1030 } | |
1031 | |
1032 // Clean up | |
1033 if(trackOffsetTable) | |
1034 delete[] trackOffsetTable; | |
1035 if(queryCopy) | |
1036 delete[] queryCopy; | |
1037 if(qNorm) | |
1038 delete[] qNorm; | |
1039 if(sNorm) | |
1040 delete[] sNorm; | |
1041 if(qPower) | |
1042 delete[] qPower; | |
1043 if(sPower) | |
1044 delete[] sPower; | |
1045 if(D) | |
1046 delete[] D; | |
1047 if(DD) | |
1048 delete[] DD; | |
1049 if(timesdata) | |
1050 delete[] timesdata; | |
1051 if(querydurs) | |
1052 delete[] querydurs; | |
1053 if(meanDBdur) | |
1054 delete[] meanDBdur; | |
1055 } | |
1056 | |
1057 // Radius search between query and target tracks | |
1058 // efficient implementation based on matched filter | |
1059 // assumes normed shingles | |
1060 // outputs count of retrieved shingles, max retreived = one shingle per query shingle per track | |
1061 void audioDB::trackSequenceQueryRad(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse){ | |
1062 | |
1063 initTables(dbName, inFile); | |
1064 | |
1065 // For each input vector, find the closest pointNN matching output vectors and report | |
1066 // we use stdout in this stub version | |
1067 unsigned numVectors = (statbuf.st_size-sizeof(int))/(sizeof(double)*dbH->dim); | |
1068 double* query = (double*)(indata+sizeof(int)); | |
1069 double* queryCopy = 0; | |
1070 | |
1071 if(!(dbH->flags & O2_FLAG_L2NORM) ) | |
1072 error("Database must be L2 normed for sequence query","use -l2norm"); | |
1073 | |
1074 if(verbosity>1) { | |
1075 std::cerr << "performing norms ... "; std::cerr.flush(); | |
1076 } | |
1077 unsigned dbVectors = dbH->length/(sizeof(double)*dbH->dim); | |
1078 | |
1079 // Make a copy of the query | |
1080 queryCopy = new double[numVectors*dbH->dim]; | |
1081 memcpy(queryCopy, query, numVectors*dbH->dim*sizeof(double)); | |
1082 qNorm = new double[numVectors]; | |
1083 sNorm = new double[dbVectors]; | |
1084 assert(qNorm&&sNorm&&queryCopy&&sequenceLength); | |
1085 unitNorm(queryCopy, dbH->dim, numVectors, qNorm); | |
1086 query = queryCopy; | |
1087 | |
1088 // Make norm measurements relative to sequenceLength | |
1089 unsigned w = sequenceLength-1; | |
1090 unsigned i,j; | |
1091 | |
1092 // Copy the L2 norm values to core to avoid disk random access later on | |
1093 memcpy(sNorm, l2normTable, dbVectors*sizeof(double)); | |
1094 double* snPtr = sNorm; | |
1095 double* qnPtr = qNorm; | |
1096 | |
1097 double *sPower = 0, *qPower = 0; | |
1098 double *spPtr = 0, *qpPtr = 0; | |
1099 | |
1100 if (usingPower) { | |
1101 if(!(dbH->flags & O2_FLAG_POWER)) { | |
1102 error("database not power-enabled", dbName); | |
1103 } | |
1104 sPower = new double[dbVectors]; | |
1105 spPtr = sPower; | |
1106 memcpy(sPower, powerTable, dbVectors * sizeof(double)); | |
1107 } | |
1108 | |
1109 for(i=0; i<dbH->numFiles; i++){ | |
1110 if(trackTable[i]>=sequenceLength) { | |
1111 sequence_sum(snPtr, trackTable[i], sequenceLength); | |
1112 sequence_sqrt(snPtr, trackTable[i], sequenceLength); | |
1113 if (usingPower) { | |
1114 sequence_sum(spPtr, trackTable[i], sequenceLength); | |
1115 sequence_average(spPtr, trackTable[i], sequenceLength); | |
1116 } | |
1117 } | |
1118 snPtr += trackTable[i]; | |
1119 if (usingPower) { | |
1120 spPtr += trackTable[i]; | |
1121 } | |
1122 } | |
1123 | |
1124 sequence_sum(qnPtr, numVectors, sequenceLength); | |
1125 sequence_sqrt(qnPtr, numVectors, sequenceLength); | |
1126 | |
1127 if (usingPower) { | |
1128 qPower = new double[numVectors]; | |
1129 qpPtr = qPower; | |
1130 if (lseek(powerfd, sizeof(int), SEEK_SET) == (off_t) -1) { | |
1131 error("error seeking to data", powerFileName, "lseek"); | |
1132 } | |
1133 int count = read(powerfd, qPower, numVectors * sizeof(double)); | |
1134 if (count == -1) { | |
1135 error("error reading data", powerFileName, "read"); | |
1136 } | |
1137 if ((unsigned) count != numVectors * sizeof(double)) { | |
1138 error("short read", powerFileName); | |
1139 } | |
1140 | |
1141 sequence_sum(qpPtr, numVectors, sequenceLength); | |
1142 sequence_average(qpPtr, numVectors, sequenceLength); | |
1143 } | |
1144 | |
1145 if(verbosity>1) { | |
1146 std::cerr << "done." << std::endl; | |
1147 } | |
1148 | |
1149 if(verbosity>1) { | |
1150 std::cerr << "matching tracks..." << std::endl; | |
1151 } | |
1152 | |
1153 assert(pointNN>0 && pointNN<=O2_MAXNN); | |
1154 assert(trackNN>0 && trackNN<=O2_MAXNN); | |
1155 | |
1156 // Make temporary dynamic memory for results | |
1157 double trackDistances[trackNN]; | |
1158 unsigned trackIDs[trackNN]; | |
1159 unsigned trackQIndexes[trackNN]; | |
1160 unsigned trackSIndexes[trackNN]; | |
1161 | |
1162 double distances[pointNN]; | |
1163 unsigned qIndexes[pointNN]; | |
1164 unsigned sIndexes[pointNN]; | |
1165 | |
1166 | |
1167 unsigned k,l,n,track,trackOffset=0, HOP_SIZE=sequenceHop, wL=sequenceLength; | |
1168 double thisDist; | |
1169 | |
1170 for(k=0; k<pointNN; k++){ | |
1171 distances[k]=0.0; | |
1172 qIndexes[k]=~0; | |
1173 sIndexes[k]=~0; | |
1174 } | |
1175 | |
1176 for(k=0; k<trackNN; k++){ | |
1177 trackDistances[k]=0.0; | |
1178 trackQIndexes[k]=~0; | |
1179 trackSIndexes[k]=~0; | |
1180 trackIDs[k]=~0; | |
1181 } | |
1182 | |
1183 // Timestamp and durations processing | |
1184 double meanQdur = 0; | |
1185 double *timesdata = 0; | |
1186 double *querydurs = 0; | |
1187 double *meanDBdur = 0; | |
1188 | |
1189 if(usingTimes && !(dbH->flags & O2_FLAG_TIMES)){ | |
1190 std::cerr << "warning: ignoring query timestamps for non-timestamped database" << std::endl; | |
1191 usingTimes=0; | |
1192 } | |
1193 | |
1194 else if(!usingTimes && (dbH->flags & O2_FLAG_TIMES)) | |
1195 std::cerr << "warning: no timestamps given for query. Ignoring database timestamps." << std::endl; | |
1196 | |
1197 else if(usingTimes && (dbH->flags & O2_FLAG_TIMES)){ | |
1198 timesdata = new double[2*numVectors]; | |
1199 querydurs = new double[numVectors]; | |
1200 | |
1201 insertTimeStamps(numVectors, timesFile, timesdata); | |
1202 // Calculate durations of points | |
1203 for(k=0; k<numVectors-1; k++){ | |
1204 querydurs[k] = timesdata[2*k+1] - timesdata[2*k]; | |
1205 meanQdur += querydurs[k]; | |
1206 } | |
1207 meanQdur/=k; | |
1208 if(verbosity>1) { | |
1209 std::cerr << "mean query file duration: " << meanQdur << std::endl; | |
1210 } | |
1211 meanDBdur = new double[dbH->numFiles]; | |
1212 assert(meanDBdur); | |
1213 for(k=0; k<dbH->numFiles; k++){ | |
1214 meanDBdur[k]=0.0; | |
1215 for(j=0; j<trackTable[k]-1 ; j++) { | |
1216 meanDBdur[k]+=timesTable[2*j+1]-timesTable[2*j]; | |
1217 } | |
1218 meanDBdur[k]/=j; | |
1219 } | |
1220 } | |
1221 | |
1222 if(usingQueryPoint) | |
1223 if(queryPoint>numVectors || queryPoint>numVectors-wL+1) | |
1224 error("queryPoint > numVectors-wL+1 in query"); | |
1225 else{ | |
1226 if(verbosity>1) { | |
1227 std::cerr << "query point: " << queryPoint << std::endl; std::cerr.flush(); | |
1228 } | |
1229 query = query + queryPoint*dbH->dim; | |
1230 qnPtr = qnPtr + queryPoint; | |
1231 if (usingPower) { | |
1232 qpPtr = qpPtr + queryPoint; | |
1233 } | |
1234 numVectors=wL; | |
1235 } | |
1236 | |
1237 double ** D = 0; // Differences query and target | |
1238 double ** DD = 0; // Matched filter distance | |
1239 | |
1240 D = new double*[numVectors]; | |
1241 assert(D); | |
1242 DD = new double*[numVectors]; | |
1243 assert(DD); | |
1244 | |
1245 gettimeofday(&tv1, NULL); | |
1246 unsigned processedTracks = 0; | |
1247 unsigned successfulTracks=0; | |
1248 | |
1249 double* qp; | |
1250 double* sp; | |
1251 double* dp; | |
1252 | |
1253 // build track offset table | |
1254 off_t *trackOffsetTable = new off_t[dbH->numFiles]; | |
1255 unsigned cumTrack=0; | |
1256 off_t trackIndexOffset; | |
1257 for(k=0; k<dbH->numFiles;k++){ | |
1258 trackOffsetTable[k]=cumTrack; | |
1259 cumTrack+=trackTable[k]*dbH->dim; | |
1260 } | |
1261 | |
1262 char nextKey [MAXSTR]; | |
1263 | |
1264 // chi^2 statistics | |
1265 double sampleCount = 0; | |
1266 double sampleSum = 0; | |
1267 double logSampleSum = 0; | |
1268 double minSample = 1e9; | |
1269 double maxSample = 0; | |
1270 | |
1271 // Track loop | |
1272 size_t data_buffer_size = 0; | |
1273 double *data_buffer = 0; | |
1274 lseek(dbfid, dbH->dataOffset, SEEK_SET); | |
1275 | |
1276 for(processedTracks=0, track=0 ; processedTracks < dbH->numFiles ; track++, processedTracks++){ | |
1277 | |
1278 trackOffset = trackOffsetTable[track]; // numDoubles offset | |
1279 | |
1280 // get trackID from file if using a control file | |
1281 if(trackFile) { | |
1282 trackFile->getline(nextKey,MAXSTR); | |
1283 if(!trackFile->eof()) { | |
1284 track = getKeyPos(nextKey); | |
1285 trackOffset = trackOffsetTable[track]; | |
1286 lseek(dbfid, dbH->dataOffset + trackOffset * sizeof(double), SEEK_SET); | |
1287 } else { | |
1288 break; | |
1289 } | |
1290 } | |
1291 | |
1292 trackIndexOffset=trackOffset/dbH->dim; // numVectors offset | |
1293 | |
1294 if(sequenceLength<=trackTable[track]){ // test for short sequences | |
1295 | |
1296 if(verbosity>7) { | |
1297 std::cerr << track << "." << trackIndexOffset << "." << trackTable[track] << " | ";std::cerr.flush(); | |
1298 } | |
1299 | |
1300 // Sum products matrix | |
1301 for(j=0; j<numVectors;j++){ | |
1302 D[j]=new double[trackTable[track]]; | |
1303 assert(D[j]); | |
1304 | |
1305 } | |
1306 | |
1307 // Matched filter matrix | |
1308 for(j=0; j<numVectors;j++){ | |
1309 DD[j]=new double[trackTable[track]]; | |
1310 assert(DD[j]); | |
1311 } | |
1312 | |
1313 if (trackTable[track] * sizeof(double) * dbH->dim > data_buffer_size) { | |
1314 if(data_buffer) { | |
1315 free(data_buffer); | |
1316 } | |
1317 { | |
1318 data_buffer_size = trackTable[track] * sizeof(double) * dbH->dim; | |
1319 void *tmp = malloc(data_buffer_size); | |
1320 if (tmp == NULL) { | |
1321 error("error allocating data buffer"); | |
1322 } | |
1323 data_buffer = (double *) tmp; | |
1324 } | |
1325 } | |
1326 | |
1327 read(dbfid, data_buffer, trackTable[track] * sizeof(double) * dbH->dim); | |
1328 | |
1329 // Dot product | |
1330 for(j=0; j<numVectors; j++) | |
1331 for(k=0; k<trackTable[track]; k++){ | |
1332 qp=query+j*dbH->dim; | |
1333 sp=data_buffer+k*dbH->dim; | |
1334 DD[j][k]=0.0; // Initialize matched filter array | |
1335 dp=&D[j][k]; // point to correlation cell j,k | |
1336 *dp=0.0; // initialize correlation cell | |
1337 l=dbH->dim; // size of vectors | |
1338 while(l--) | |
1339 *dp+=*qp++**sp++; | |
1340 } | |
1341 | |
1342 // Matched Filter | |
1343 // HOP SIZE == 1 | |
1344 double* spd; | |
1345 if(HOP_SIZE==1){ // HOP_SIZE = shingleHop | |
1346 for(w=0; w<wL; w++) | |
1347 for(j=0; j<numVectors-w; j++){ | |
1348 sp=DD[j]; | |
1349 spd=D[j+w]+w; | |
1350 k=trackTable[track]-w; | |
1351 while(k--) | |
1352 *sp+++=*spd++; | |
1353 } | |
1354 } | |
1355 | |
1356 else{ // HOP_SIZE != 1 | |
1357 for(w=0; w<wL; w++) | |
1358 for(j=0; j<numVectors-w; j+=HOP_SIZE){ | |
1359 sp=DD[j]; | |
1360 spd=D[j+w]+w; | |
1361 for(k=0; k<trackTable[track]-w; k+=HOP_SIZE){ | |
1362 *sp+=*spd; | |
1363 sp+=HOP_SIZE; | |
1364 spd+=HOP_SIZE; | |
1365 } | |
1366 } | |
1367 } | |
1368 | |
1369 if(verbosity>3 && usingTimes) { | |
1370 std::cerr << "meanQdur=" << meanQdur << " meanDBdur=" << meanDBdur[track] << std::endl; | |
1371 std::cerr.flush(); | |
1372 } | |
1373 | |
1374 if(!usingTimes || | |
1375 (usingTimes | |
1376 && fabs(meanDBdur[track]-meanQdur)<meanQdur*timesTol)){ | |
1377 | |
1378 if(verbosity>3 && usingTimes) { | |
1379 std::cerr << "within duration tolerance." << std::endl; | |
1380 std::cerr.flush(); | |
1381 } | |
1382 | |
1383 // Search for minimum distance by shingles (concatenated vectors) | |
1384 for(j=0;j<=numVectors-wL;j+=HOP_SIZE) | |
1385 for(k=0;k<=trackTable[track]-wL;k+=HOP_SIZE){ | |
1386 thisDist=2-(2/(qnPtr[j]*sNorm[trackIndexOffset+k]))*DD[j][k]; | |
1387 if(verbosity>9) { | |
1388 std::cerr << thisDist << " " << qnPtr[j] << " " << sNorm[trackIndexOffset+k] << std::endl; | |
1389 } | |
1390 // Gather chi^2 statistics | |
1391 if(thisDist<minSample) | |
1392 minSample=thisDist; | |
1393 else if(thisDist>maxSample) | |
1394 maxSample=thisDist; | |
1395 if(thisDist>1e-9){ | |
1396 sampleCount++; | |
1397 sampleSum+=thisDist; | |
1398 logSampleSum+=log(thisDist); | |
1399 } | |
1400 | |
1401 // diffL2 = fabs(qnPtr[j] - sNorm[trackIndexOffset+k]); | |
1402 // Power test | |
1403 if (usingPower) { | |
1404 if (!(powers_acceptable(qpPtr[j], sPower[trackIndexOffset + k]))) { | |
1405 thisDist = 1000000.0; | |
1406 } | |
1407 } | |
1408 | |
1409 if(thisDist>=0 && thisDist<=radius){ | |
1410 distances[0]++; // increment count | |
1411 break; // only need one track point per query point | |
1412 } | |
1413 } | |
1414 // How many points were below threshold ? | |
1415 thisDist=distances[0]; | |
1416 | |
1417 // Let's see the distances then... | |
1418 if(verbosity>3) { | |
1419 std::cerr << fileTable+track*O2_FILETABLESIZE << " " << thisDist << std::endl; | |
1420 } | |
1421 | |
1422 // All the track stuff goes here | |
1423 n=trackNN; | |
1424 while(n--){ | |
1425 if(thisDist>trackDistances[n]){ | |
1426 if((n==0 || thisDist<=trackDistances[n-1])){ | |
1427 // Copy all values above up the queue | |
1428 for( l=trackNN-1 ; l > n ; l--){ | |
1429 trackDistances[l]=trackDistances[l-1]; | |
1430 trackQIndexes[l]=trackQIndexes[l-1]; | |
1431 trackSIndexes[l]=trackSIndexes[l-1]; | |
1432 trackIDs[l]=trackIDs[l-1]; | |
1433 } | |
1434 trackDistances[n]=thisDist; | |
1435 trackQIndexes[n]=qIndexes[0]; | |
1436 trackSIndexes[n]=sIndexes[0]; | |
1437 successfulTracks++; | |
1438 trackIDs[n]=track; | |
1439 break; | |
1440 } | |
1441 } | |
1442 else | |
1443 break; | |
1444 } | |
1445 } // Duration match | |
1446 | |
1447 // Clean up current track | |
1448 if(D!=NULL){ | |
1449 for(j=0; j<numVectors; j++) | |
1450 delete[] D[j]; | |
1451 } | |
1452 | |
1453 if(DD!=NULL){ | |
1454 for(j=0; j<numVectors; j++) | |
1455 delete[] DD[j]; | |
1456 } | |
1457 } | |
1458 // per-track reset array values | |
1459 for(unsigned k=0; k<pointNN; k++){ | |
1460 distances[k]=0.0; | |
1461 qIndexes[k]=~0; | |
1462 sIndexes[k]=~0; | |
1463 } | |
1464 } | |
1465 | |
1466 free(data_buffer); | |
1467 | |
1468 gettimeofday(&tv2,NULL); | |
1469 if(verbosity>1) { | |
1470 std::cerr << std::endl << "processed tracks :" << processedTracks << " matched tracks: " << successfulTracks << " elapsed time:" | |
1471 << ( tv2.tv_sec*1000 + tv2.tv_usec/1000 ) - ( tv1.tv_sec*1000+tv1.tv_usec/1000 ) << " msec" << std::endl; | |
1472 std::cerr << "sampleCount: " << sampleCount << " sampleSum: " << sampleSum << " logSampleSum: " << logSampleSum | |
1473 << " minSample: " << minSample << " maxSample: " << maxSample << std::endl; | |
1474 } | |
1475 | |
1476 if(adbQueryResponse==0){ | |
1477 if(verbosity>1) { | |
1478 std::cerr<<std::endl; | |
1479 } | |
1480 // Output answer | |
1481 // Loop over nearest neighbours | |
1482 for(k=0; k < std::min(trackNN,successfulTracks); k++) | |
1483 std::cout << fileTable+trackIDs[k]*O2_FILETABLESIZE << " " << trackDistances[k] << std::endl; | |
1484 } | |
1485 else{ // Process Web Services Query | |
1486 int listLen = std::min(trackNN, processedTracks); | |
1487 adbQueryResponse->result.__sizeRlist=listLen; | |
1488 adbQueryResponse->result.__sizeDist=listLen; | |
1489 adbQueryResponse->result.__sizeQpos=listLen; | |
1490 adbQueryResponse->result.__sizeSpos=listLen; | |
1491 adbQueryResponse->result.Rlist= new char*[listLen]; | |
1492 adbQueryResponse->result.Dist = new double[listLen]; | |
1493 adbQueryResponse->result.Qpos = new unsigned int[listLen]; | |
1494 adbQueryResponse->result.Spos = new unsigned int[listLen]; | |
1495 for(k=0; k<(unsigned)adbQueryResponse->result.__sizeRlist; k++){ | |
1496 adbQueryResponse->result.Rlist[k]=new char[O2_MAXFILESTR]; | |
1497 adbQueryResponse->result.Dist[k]=trackDistances[k]; | |
1498 adbQueryResponse->result.Qpos[k]=trackQIndexes[k]; | |
1499 adbQueryResponse->result.Spos[k]=trackSIndexes[k]; | |
1500 sprintf(adbQueryResponse->result.Rlist[k], "%s", fileTable+trackIDs[k]*O2_FILETABLESIZE); | |
1501 } | |
1502 } | |
1503 | |
1504 // Clean up | |
1505 if(trackOffsetTable) | |
1506 delete[] trackOffsetTable; | |
1507 if(queryCopy) | |
1508 delete[] queryCopy; | |
1509 if(qNorm) | |
1510 delete[] qNorm; | |
1511 if(sNorm) | |
1512 delete[] sNorm; | |
1513 if(qPower) | |
1514 delete[] qPower; | |
1515 if(sPower) | |
1516 delete[] sPower; | |
1517 if(D) | |
1518 delete[] D; | |
1519 if(DD) | |
1520 delete[] DD; | |
1521 if(timesdata) | |
1522 delete[] timesdata; | |
1523 if(querydurs) | |
1524 delete[] querydurs; | |
1525 if(meanDBdur) | |
1526 delete[] meanDBdur; | |
1527 } | |
1528 | |
1529 // Unit norm block of features | |
1530 void audioDB::unitNorm(double* X, unsigned dim, unsigned n, double* qNorm){ | |
1531 unsigned d; | |
1532 double L2, *p; | |
1533 if(verbosity>2) { | |
1534 std::cerr << "norming " << n << " vectors...";std::cerr.flush(); | |
1535 } | |
1536 while(n--){ | |
1537 p=X; | |
1538 L2=0.0; | |
1539 d=dim; | |
1540 while(d--){ | |
1541 L2+=*p**p; | |
1542 p++; | |
1543 } | |
1544 /* L2=sqrt(L2);*/ | |
1545 if(qNorm) | |
1546 *qNorm++=L2; | |
1547 /* | |
1548 oneOverL2 = 1.0/L2; | |
1549 d=dim; | |
1550 while(d--){ | |
1551 *X*=oneOverL2; | |
1552 X++; | |
1553 */ | |
1554 X+=dim; | |
1555 } | |
1556 if(verbosity>2) { | |
1557 std::cerr << "done..." << std::endl; | |
1558 } | |
1559 } |