mas01mc@292
|
1 // LSH indexing
|
mas01mc@292
|
2 //
|
mas01mc@292
|
3 // Construct a persistent LSH table structure
|
mas01mc@292
|
4 // Store at the same location as dbName
|
mas01mc@292
|
5 // Naming convention:
|
mas01mc@292
|
6 // dbName.lsh.${radius}.${sequenceLength}
|
mas01mc@292
|
7 //
|
mas01mc@292
|
8 //
|
mas01mc@292
|
9 // Author: Michael Casey
|
mas01mc@292
|
10 // Date: 23 June 2008
|
mas01mc@324
|
11 //
|
mas01mc@324
|
12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
|
mas01mc@292
|
13
|
mas01mc@292
|
14 #include "audioDB.h"
|
mas01cr@426
|
15 #include "audioDB-internals.h"
|
mas01mc@292
|
16
|
mas01cr@458
|
17 typedef struct adb_qcallback {
|
mas01cr@458
|
18 adb_t *adb;
|
mas01cr@458
|
19 adb_qstate_internal_t *qstate;
|
mas01cr@458
|
20 } adb_qcallback_t;
|
mas01mc@292
|
21
|
mas01mc@292
|
22 /************************* LSH indexing and query initialization *****************/
|
mas01mc@292
|
23
|
mas01cr@460
|
24 /* FIXME: there are several things wrong with this: the memory
|
mas01cr@460
|
25 * discipline isn't ideal, the radius printing is a bit lame, the name
|
mas01cr@460
|
26 * getting will succeed or fail depending on whether the path was
|
mas01cr@460
|
27 * relative or absolute -- but most importantly encoding all that
|
mas01cr@460
|
28 * information in a filename is going to lose: it's impossible to
|
mas01cr@460
|
29 * maintain backwards-compatibility. Instead we should probably store
|
mas01cr@460
|
30 * the index metadata inside the audiodb instance. */
|
mas01cr@460
|
31 char *audiodb_index_get_name(const char *dbName, double radius, Uns32T sequenceLength) {
|
mas01cr@460
|
32 char *indexName;
|
mas01cr@460
|
33 if(strlen(dbName) > (MAXSTR - 32)) {
|
mas01cr@460
|
34 return NULL;
|
mas01cr@460
|
35 }
|
mas01cr@460
|
36 indexName = new char[MAXSTR];
|
mas01mc@292
|
37 strncpy(indexName, dbName, MAXSTR);
|
mas01mc@292
|
38 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
|
mas01mc@292
|
39 return indexName;
|
mas01mc@292
|
40 }
|
mas01mc@292
|
41
|
mas01cr@460
|
42 bool audiodb_index_exists(const char *dbName, double radius, Uns32T sequenceLength) {
|
mas01cr@460
|
43 char *indexName = audiodb_index_get_name(dbName, radius, sequenceLength);
|
mas01cr@460
|
44 if(!indexName) {
|
mas01cr@460
|
45 return false;
|
mas01cr@460
|
46 }
|
mas01cr@460
|
47 struct stat st;
|
mas01cr@460
|
48 if(stat(indexName, &st)) {
|
mas01cr@460
|
49 delete [] indexName;
|
mas01cr@460
|
50 return false;
|
mas01cr@460
|
51 }
|
mas01cr@460
|
52 /* FIXME: other stat checks here? */
|
mas01cr@460
|
53 /* FIXME: is there any better way to check whether we can open a
|
mas01cr@460
|
54 * file for reading than by opening a file for reading? */
|
mas01cr@460
|
55 int fd = open(indexName, O_RDONLY);
|
mas01cr@460
|
56 delete [] indexName;
|
mas01cr@460
|
57 if(fd < 0) {
|
mas01cr@460
|
58 return false;
|
mas01cr@460
|
59 } else {
|
mas01cr@460
|
60 close(fd);
|
mas01mc@292
|
61 return true;
|
mas01cr@460
|
62 }
|
mas01mc@292
|
63 }
|
mas01mc@292
|
64
|
mas01cr@465
|
65 /* FIXME: the indexName arg should be "const char *", but the LSH
|
mas01cr@465
|
66 * library doesn't like that.
|
mas01cr@465
|
67 */
|
mas01cr@465
|
68 LSH *audiodb_index_allocate(adb_t *adb, char *indexName, bool load_tables) {
|
mas01cr@465
|
69 LSH *lsh;
|
mas01cr@465
|
70 if(adb->cached_lsh) {
|
mas01cr@465
|
71 if(!strncmp(adb->cached_lsh->get_indexName(), indexName, MAXSTR)) {
|
mas01cr@465
|
72 return adb->cached_lsh;
|
mas01cr@465
|
73 } else {
|
mas01cr@465
|
74 delete adb->cached_lsh;
|
mas01cr@465
|
75 }
|
mas01mc@308
|
76 }
|
mas01cr@465
|
77 lsh = new LSH(indexName, load_tables);
|
mas01cr@465
|
78 if(load_tables) {
|
mas01cr@465
|
79 adb->cached_lsh = lsh;
|
mas01cr@465
|
80 }
|
mas01cr@465
|
81 return lsh;
|
mas01mc@308
|
82 }
|
mas01mc@308
|
83
|
mas01cr@459
|
84 vector<vector<float> > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) {
|
mas01cr@459
|
85 std::vector<std::vector<float> > *vv = new vector<vector<float> >(sz);
|
mas01cr@459
|
86 for(Uns32T i=0 ; i < sz ; i++) {
|
mas01cr@459
|
87 (*vv)[i]=vector<float>(dim * seqLen);
|
mas01cr@459
|
88 }
|
mas01mc@292
|
89 return vv;
|
mas01mc@292
|
90 }
|
mas01mc@292
|
91
|
mas01cr@459
|
92 void audiodb_index_delete_shingles(vector<vector<float> > *vv) {
|
mas01cr@459
|
93 delete vv;
|
mas01cr@459
|
94 }
|
mas01cr@459
|
95
|
mas01mc@292
|
96 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
|
mas01mc@292
|
97
|
mas01mc@292
|
98 // Prepare the AudioDB database for read access and allocate auxillary memory
|
mas01mc@292
|
99 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
|
mas01mc@324
|
100 if (!(dbH->flags & O2_FLAG_POWER)) {
|
mas01mc@324
|
101 error("INDEXed database must be power-enabled", dbName);
|
mas01mc@324
|
102 }
|
mas01mc@324
|
103
|
mas01mc@325
|
104 double *snpp = 0, *sppp = 0;
|
mas01mc@324
|
105
|
mas01mc@292
|
106 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
|
mas01mc@292
|
107 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
|
mas01mc@325
|
108 snpp = *snp;
|
mas01mc@292
|
109 *spp = new double[*dvp]; // song powertable pointer
|
mas01mc@292
|
110 sppp = *spp;
|
mas01mc@325
|
111
|
mas01mc@324
|
112 memcpy(*snp, l2normTable, *dvp * sizeof(double));
|
mas01mc@292
|
113 memcpy(*spp, powerTable, *dvp * sizeof(double));
|
mas01mc@324
|
114
|
mas01mc@324
|
115
|
mas01mc@292
|
116 for(Uns32T i = 0; i < dbH->numFiles; i++){
|
mas01mc@292
|
117 if(trackTable[i] >= sequenceLength) {
|
mas01cr@427
|
118 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
|
mas01cr@427
|
119 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
|
mas01mc@292
|
120
|
mas01cr@427
|
121 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
|
mas01cr@427
|
122 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
|
mas01mc@292
|
123 }
|
mas01mc@292
|
124 snpp += trackTable[i];
|
mas01mc@292
|
125 sppp += trackTable[i];
|
mas01mc@292
|
126 }
|
mas01mc@324
|
127
|
mas01mc@292
|
128 *vsnp = *snp;
|
mas01mc@292
|
129 *vspp = *spp;
|
mas01mc@324
|
130
|
mas01mc@292
|
131 // Move the feature vector read pointer to start of fetures in database
|
mas01mc@292
|
132 lseek(dbfid, dbH->dataOffset, SEEK_SET);
|
mas01mc@292
|
133 }
|
mas01mc@292
|
134
|
mas01mc@292
|
135
|
mas01cr@456
|
136 /********************* LSH shingle construction ***************************/
|
mas01cr@456
|
137
|
mas01cr@456
|
138 // Construct shingles out of a feature matrix
|
mas01cr@456
|
139 // inputs:
|
mas01cr@456
|
140 // idx is vector index in feature matrix
|
mas01cr@456
|
141 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
|
mas01cr@456
|
142 //
|
mas01cr@456
|
143 // pre-conditions:
|
mas01cr@456
|
144 // dbH->dim
|
mas01cr@456
|
145 // sequenceLength
|
mas01cr@456
|
146 // idx < numVectors - sequenceLength + 1
|
mas01cr@456
|
147 //
|
mas01cr@456
|
148 // post-conditions:
|
mas01cr@456
|
149 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
|
mas01cr@456
|
150
|
mas01cr@456
|
151 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
|
mas01cr@456
|
152 assert(idx<(*vv).size());
|
mas01cr@456
|
153 vector<float>::iterator ve = (*vv)[idx].end();
|
mas01cr@456
|
154 vector<float>::iterator vi = (*vv)[idx].begin();
|
mas01cr@456
|
155 // First feature vector in shingle
|
mas01cr@456
|
156 if(idx == 0) {
|
mas01cr@456
|
157 while(vi!=ve) {
|
mas01cr@456
|
158 *vi++ = (float)(*fvp++);
|
mas01cr@456
|
159 }
|
mas01cr@456
|
160 } else {
|
mas01cr@456
|
161 // Not first feature vector in shingle
|
mas01cr@456
|
162 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
|
mas01cr@456
|
163 // Previous seqLen-1 dim-vectors
|
mas01cr@456
|
164 while(vi!=ve-dim) {
|
mas01cr@456
|
165 *vi++ = *ui++;
|
mas01cr@456
|
166 }
|
mas01cr@456
|
167 // Move data pointer to next feature vector
|
mas01cr@456
|
168 fvp += ( seqLen + idx - 1 ) * dim ;
|
mas01cr@456
|
169 // New d-vector
|
mas01cr@456
|
170 while(vi!=ve) {
|
mas01cr@456
|
171 *vi++ = (float)(*fvp++);
|
mas01cr@456
|
172 }
|
mas01cr@456
|
173 }
|
mas01cr@456
|
174 }
|
mas01cr@456
|
175
|
mas01cr@456
|
176 // norm shingles
|
mas01cr@456
|
177 // in-place norming, no deletions
|
mas01cr@456
|
178 // If using power, return number of shingles above power threshold
|
mas01cr@459
|
179 int audiodb_index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) {
|
mas01cr@456
|
180 int z = 0; // number of above-threshold shingles
|
mas01cr@456
|
181 float l2norm;
|
mas01cr@456
|
182 double power;
|
mas01cr@456
|
183 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
|
mas01cr@456
|
184 float oneOverSqrtl2NormDivRad = oneOverRadius;
|
mas01cr@459
|
185 Uns32T shingleSize = seqLen * dim;
|
mas01cr@459
|
186
|
mas01cr@459
|
187 if(!spp) {
|
mas01cr@459
|
188 return -1;
|
mas01cr@459
|
189 }
|
mas01cr@456
|
190 for(Uns32T a=0; a<(*vv).size(); a++){
|
mas01cr@456
|
191 l2norm = (float)(*snp++);
|
mas01cr@459
|
192 if(normed_vectors)
|
mas01cr@456
|
193 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
|
mas01cr@456
|
194
|
mas01cr@456
|
195 for(Uns32T b=0; b < shingleSize ; b++)
|
mas01cr@456
|
196 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
|
mas01cr@456
|
197
|
mas01cr@456
|
198 power = *spp++;
|
mas01cr@459
|
199 if(use_pthreshold){
|
mas01cr@459
|
200 if (power >= pthreshold)
|
mas01cr@456
|
201 z++;
|
mas01cr@456
|
202 }
|
mas01cr@456
|
203 else
|
mas01cr@456
|
204 z++;
|
mas01cr@456
|
205 }
|
mas01cr@456
|
206 return z;
|
mas01cr@456
|
207 }
|
mas01cr@456
|
208
|
mas01cr@456
|
209
|
mas01mc@292
|
210 /************************ LSH indexing ***********************************/
|
mas01mc@292
|
211 void audioDB::index_index_db(const char* dbName){
|
mas01mc@292
|
212 char* newIndexName;
|
mas01mc@292
|
213 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
|
mas01mc@292
|
214 Uns32T dbVectors = 0;
|
mas01mc@292
|
215
|
mas01mc@324
|
216
|
mas01mc@292
|
217 printf("INDEX: initializing header\n");
|
mas01mc@292
|
218 // Check if audioDB exists, initialize header and open database for read
|
mas01mc@292
|
219 forWrite = false;
|
mas01mc@292
|
220 initDBHeader(dbName);
|
mas01mc@292
|
221
|
mas01mc@324
|
222 if(dbH->flags & O2_FLAG_POWER)
|
mas01mc@324
|
223 usingPower = true;
|
mas01mc@324
|
224
|
mas01mc@324
|
225 if(dbH->flags & O2_FLAG_TIMES)
|
mas01mc@324
|
226 usingTimes = true;
|
mas01mc@324
|
227
|
mas01cr@460
|
228 newIndexName = audiodb_index_get_name(dbName, radius, sequenceLength);
|
mas01cr@460
|
229 if(!newIndexName) {
|
mas01cr@460
|
230 error("failed to get index name", dbName);
|
mas01cr@460
|
231 }
|
mas01mc@292
|
232
|
mas01mc@292
|
233 // Set unit norming flag override
|
mas01mc@292
|
234 audioDB::normalizedDistance = !audioDB::no_unit_norming;
|
mas01mc@292
|
235
|
mas01mc@327
|
236 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
|
mas01mc@327
|
237 VERB_LOG(1, "INDEX: R %f\n", radius);
|
mas01mc@327
|
238 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
|
mas01mc@327
|
239 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
|
mas01mc@327
|
240 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
|
mas01mc@327
|
241 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
|
mas01mc@327
|
242 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
|
mas01mc@327
|
243 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
|
mas01mc@327
|
244 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
|
mas01mc@327
|
245 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
|
mas01mc@292
|
246
|
mas01mc@292
|
247 if((lshfid = open(newIndexName,O_RDONLY))<0){
|
mas01mc@292
|
248 printf("INDEX: constructing new LSH index\n");
|
mas01mc@292
|
249 printf("INDEX: making index file %s\n", newIndexName);
|
mas01mc@292
|
250 fflush(stdout);
|
mas01mc@292
|
251 // Construct new LSH index
|
mas01mc@292
|
252 lsh = new LSH((float)lsh_param_w, lsh_param_k,
|
mas01mc@292
|
253 lsh_param_m,
|
mas01mc@292
|
254 (Uns32T)(sequenceLength*dbH->dim),
|
mas01mc@292
|
255 lsh_param_N,
|
mas01mc@292
|
256 lsh_param_ncols,
|
mas01mc@292
|
257 (float)radius);
|
mas01mc@292
|
258 assert(lsh);
|
mas01mc@292
|
259
|
mas01mc@292
|
260 Uns32T endTrack = lsh_param_b;
|
mas01mc@292
|
261 if( endTrack > dbH->numFiles)
|
mas01mc@292
|
262 endTrack = dbH->numFiles;
|
mas01mc@292
|
263 // Insert up to lsh_param_b tracks
|
mas01mc@324
|
264 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
|
mas01mc@324
|
265 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
|
mas01mc@324
|
266 }
|
mas01mc@324
|
267 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
|
mas01mc@292
|
268 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
|
mas01mc@292
|
269
|
mas01mc@292
|
270 // Clean up
|
mas01mc@292
|
271 delete lsh;
|
mas01mc@308
|
272 lsh = 0;
|
mas01cr@465
|
273 } else {
|
mas01mc@292
|
274 close(lshfid);
|
mas01mc@292
|
275 }
|
mas01mc@292
|
276
|
mas01mc@292
|
277 // Attempt to open LSH file
|
mas01mc@292
|
278 if((lshfid = open(newIndexName,O_RDONLY))>0){
|
mas01mc@292
|
279 printf("INDEX: merging with existing LSH index\n");
|
mas01mc@292
|
280 fflush(stdout);
|
mas01mc@340
|
281 char* mergeIndexName = newIndexName;
|
mas01mc@292
|
282
|
mas01mc@292
|
283 // Get the lsh header info and find how many tracks are inserted already
|
mas01mc@340
|
284 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
|
mas01mc@292
|
285 assert(lsh);
|
mas01cr@458
|
286 Uns32T maxs = audiodb_index_to_track_id(lsh->get_maxp(), audiodb_lsh_n_point_bits(adb))+1;
|
mas01mc@292
|
287 delete lsh;
|
mas01mc@308
|
288 lsh = 0;
|
mas01mc@292
|
289
|
mas01mc@340
|
290 // Insert up to lsh_param_b tracks
|
mas01mc@340
|
291 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
|
mas01mc@340
|
292 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
|
mas01mc@340
|
293 }
|
mas01mc@292
|
294 // This allows for updating index after more tracks are inserted into audioDB
|
mas01mc@292
|
295 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
|
mas01mc@292
|
296
|
mas01mc@292
|
297 Uns32T endTrack = startTrack + lsh_param_b;
|
mas01mc@292
|
298 if( endTrack > dbH->numFiles)
|
mas01mc@292
|
299 endTrack = dbH->numFiles;
|
mas01mc@292
|
300 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
|
mas01mc@292
|
301 fflush(stdout);
|
mas01mc@340
|
302 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
|
mas01mc@292
|
303 assert(lsh);
|
mas01mc@292
|
304
|
mas01mc@292
|
305 // Insert up to lsh_param_b database tracks
|
mas01mc@292
|
306 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
|
mas01mc@292
|
307
|
mas01mc@340
|
308 // Serialize to file (merging is performed here)
|
mas01mc@340
|
309 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
|
mas01mc@292
|
310 delete lsh;
|
mas01mc@308
|
311 lsh = 0;
|
mas01mc@340
|
312 }
|
mas01mc@292
|
313
|
mas01mc@292
|
314 close(lshfid);
|
mas01mc@292
|
315 printf("INDEX: done constructing LSH index.\n");
|
mas01mc@292
|
316 fflush(stdout);
|
mas01mc@292
|
317
|
mas01mc@292
|
318 }
|
mas01mc@292
|
319 else{
|
mas01mc@292
|
320 error("Something's wrong with LSH index file");
|
mas01mc@292
|
321 exit(1);
|
mas01mc@292
|
322 }
|
mas01mc@292
|
323
|
mas01mc@324
|
324 delete[] newIndexName;
|
mas01mc@324
|
325 delete[] sNorm;
|
mas01mc@324
|
326 delete[] sPower;
|
mas01mc@324
|
327 }
|
mas01mc@292
|
328
|
mas01mc@292
|
329
|
mas01cr@405
|
330 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
|
mas01cr@405
|
331 if(usingPower){
|
mas01cr@405
|
332 int one;
|
mas01cr@405
|
333 unsigned int count;
|
mas01cr@405
|
334
|
mas01cr@405
|
335 count = read(powerfd, &one, sizeof(unsigned int));
|
mas01cr@405
|
336 if (count != sizeof(unsigned int)) {
|
mas01cr@405
|
337 error("powerfd read failed", "int", "read");
|
mas01cr@405
|
338 }
|
mas01cr@405
|
339 if (one != 1) {
|
mas01cr@405
|
340 error("dimensionality of power file not 1", powerFileName);
|
mas01cr@405
|
341 }
|
mas01cr@405
|
342
|
mas01cr@405
|
343 // FIXME: should check that the powerfile is the right size for
|
mas01cr@405
|
344 // this. -- CSR, 2007-10-30
|
mas01cr@405
|
345 count = read(powerfd, powerdata, numVectors * sizeof(double));
|
mas01cr@405
|
346 if (count != numVectors * sizeof(double)) {
|
mas01cr@405
|
347 error("powerfd read failed", "double", "read");
|
mas01cr@405
|
348 }
|
mas01cr@405
|
349 }
|
mas01cr@405
|
350 }
|
mas01cr@405
|
351
|
mas01mc@324
|
352 // initialize auxillary track data from filesystem
|
mas01mc@324
|
353 // pre-conditions:
|
mas01mc@324
|
354 // dbH->flags & O2_FLAG_LARGE_ADB
|
mas01mc@324
|
355 // feature data allocated and copied (fvp)
|
mas01mc@324
|
356 //
|
mas01mc@324
|
357 // post-conditions:
|
mas01mc@324
|
358 // allocated power data
|
mas01mc@324
|
359 // allocated l2norm data
|
mas01mc@324
|
360 //
|
mas01mc@324
|
361 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
|
mas01mc@324
|
362 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
|
mas01mc@324
|
363 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
|
mas01mc@292
|
364
|
mas01mc@324
|
365 // Allocate and read the power sequence
|
mas01mc@324
|
366 if(trackTable[trackID]>=sequenceLength){
|
mas01mc@324
|
367
|
mas01mc@324
|
368 char* prefixedString = new char[O2_MAXFILESTR];
|
mas01mc@324
|
369 char* tmpStr = prefixedString;
|
mas01mc@324
|
370 // Open and check dimensions of power file
|
mas01mc@324
|
371 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
|
mas01mc@324
|
372 prefix_name((char ** const)&prefixedString, adb_feature_root);
|
mas01mc@324
|
373 if(prefixedString!=tmpStr)
|
mas01mc@324
|
374 delete[] tmpStr;
|
mas01mc@324
|
375 powerfd = open(prefixedString, O_RDONLY);
|
mas01mc@324
|
376 if (powerfd < 0) {
|
mas01mc@324
|
377 error("failed to open power file", prefixedString);
|
mas01mc@324
|
378 }
|
mas01mc@324
|
379 if (fstat(powerfd, &statbuf) < 0) {
|
mas01mc@324
|
380 error("fstat error finding size of power file", prefixedString, "fstat");
|
mas01mc@324
|
381 }
|
mas01mc@324
|
382
|
mas01mc@324
|
383 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
|
mas01mc@324
|
384 error("Dimension mismatch: numPowers != numVectors", prefixedString);
|
mas01mc@324
|
385
|
mas01mc@324
|
386 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
|
mas01mc@324
|
387 assert(*sPowerp);
|
mas01mc@324
|
388 *spPtrp = *sPowerp;
|
mas01mc@324
|
389 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
|
mas01mc@324
|
390 if (0 < powerfd) {
|
mas01mc@324
|
391 close(powerfd);
|
mas01mc@324
|
392 }
|
mas01mc@324
|
393
|
mas01cr@427
|
394 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
|
mas01cr@427
|
395 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
|
mas01mc@324
|
396 powerTable = 0;
|
mas01mc@324
|
397
|
mas01mc@324
|
398 // Allocate and calculate the l2norm sequence
|
mas01mc@324
|
399 *sNormpp = new double[trackTable[trackID]];
|
mas01mc@324
|
400 assert(*sNormpp);
|
mas01mc@324
|
401 *snPtrp = *sNormpp;
|
mas01cr@426
|
402 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
|
mas01cr@427
|
403 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
|
mas01cr@427
|
404 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
|
mas01mc@324
|
405 }
|
mas01mc@292
|
406 }
|
mas01mc@292
|
407
|
mas01mc@292
|
408 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
|
mas01mc@292
|
409 double** fvpp, double** sNormpp,double** snPtrp,
|
mas01mc@292
|
410 double** sPowerp, double** spPtrp){
|
mas01mc@292
|
411 size_t nfv = 0;
|
mas01mc@292
|
412 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
|
mas01mc@292
|
413 Uns32T trackID = 0;
|
mas01mc@292
|
414
|
mas01mc@292
|
415 VERB_LOG(1, "indexing tracks...");
|
mas01mc@292
|
416
|
mas01mc@324
|
417 int trackfd = dbfid;
|
mas01mc@292
|
418 for(trackID = start_track ; trackID < end_track ; trackID++ ){
|
mas01mc@324
|
419 if( dbH->flags & O2_FLAG_LARGE_ADB ){
|
mas01mc@324
|
420 char* prefixedString = new char[O2_MAXFILESTR];
|
mas01mc@324
|
421 char* tmpStr = prefixedString;
|
mas01mc@324
|
422 // Open and check dimensions of feature file
|
mas01mc@324
|
423 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
|
mas01mc@324
|
424 prefix_name((char ** const) &prefixedString, adb_feature_root);
|
mas01mc@324
|
425 if(prefixedString!=tmpStr)
|
mas01mc@324
|
426 delete[] tmpStr;
|
mas01cr@454
|
427 initInputFile(prefixedString);
|
mas01mc@324
|
428 trackfd = infid;
|
mas01mc@324
|
429 }
|
mas01cr@433
|
430 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
|
mas01cr@433
|
431 error("failed to read data");
|
mas01mc@292
|
432 *fvpp = fvp; // Protect memory allocation and free() for track data
|
mas01mc@324
|
433
|
mas01mc@324
|
434 if( dbH->flags & O2_FLAG_LARGE_ADB )
|
mas01mc@324
|
435 // Load power and calculate power and l2norm sequence sums
|
mas01mc@324
|
436 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
|
mas01mc@324
|
437
|
mas01mc@292
|
438 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
|
mas01mc@292
|
439 break;
|
mas01mc@324
|
440 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
|
mas01mc@324
|
441 close(infid);
|
mas01mc@324
|
442 delete[] *sNormpp;
|
mas01mc@324
|
443 delete[] *sPowerp;
|
mas01mc@324
|
444 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
|
mas01mc@324
|
445 }
|
mas01mc@324
|
446 } // end for(trackID = start_track ; ... )
|
mas01mc@292
|
447 std::cout << "finished inserting." << endl;
|
mas01mc@292
|
448 }
|
mas01mc@292
|
449
|
mas01mc@292
|
450 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
|
mas01mc@292
|
451 // Loop over the current input track's vectors
|
mas01cr@305
|
452 Uns32T numVecs = 0;
|
mas01cr@305
|
453 if (trackTable[trackID] > O2_MAXTRACKLEN) {
|
mas01cr@305
|
454 if (O2_MAXTRACKLEN < sequenceLength - 1) {
|
mas01cr@305
|
455 numVecs = 0;
|
mas01cr@305
|
456 } else {
|
mas01cr@305
|
457 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
|
mas01cr@305
|
458 }
|
mas01cr@305
|
459 } else {
|
mas01cr@305
|
460 if (trackTable[trackID] < sequenceLength - 1) {
|
mas01cr@305
|
461 numVecs = 0;
|
mas01cr@305
|
462 } else {
|
mas01cr@305
|
463 numVecs = trackTable[trackID] - sequenceLength + 1;
|
mas01cr@305
|
464 }
|
mas01cr@305
|
465 }
|
mas01mc@292
|
466
|
mas01mc@324
|
467 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
|
mas01mc@324
|
468 if(numVecs){
|
mas01cr@459
|
469 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(numVecs, dbH->dim, sequenceLength);
|
mas01mc@324
|
470
|
mas01mc@324
|
471 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
|
mas01cr@456
|
472 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
|
mas01cr@459
|
473 int vcount = audiodb_index_norm_shingles(vv, *snpp, *sppp, dbH->dim, sequenceLength, radius, normalizedDistance, use_absolute_threshold, absolute_threshold);
|
mas01cr@459
|
474 if(vcount == -1) {
|
mas01cr@459
|
475 audiodb_index_delete_shingles(vv);
|
mas01cr@459
|
476 error("failed to norm shingles");
|
mas01cr@459
|
477 }
|
mas01cr@459
|
478 numVecsAboveThreshold = vcount;
|
mas01mc@324
|
479 collisionCount = index_insert_shingles(vv, trackID, *sppp);
|
mas01cr@459
|
480 audiodb_index_delete_shingles(vv);
|
mas01mc@324
|
481 }
|
mas01cr@459
|
482
|
mas01mc@292
|
483 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
|
mas01mc@292
|
484
|
mas01cr@459
|
485 /* audiodb_index_norm_shingles() only goes as far as the end of the
|
mas01mc@292
|
486 sequence, which is right, but the space allocated is for the
|
mas01mc@292
|
487 whole track. */
|
mas01mc@292
|
488
|
mas01mc@292
|
489 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
|
mas01mc@292
|
490 * So let's be certain the pointers are in the correct place
|
mas01mc@292
|
491 */
|
mas01mc@292
|
492
|
mas01mc@324
|
493 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
|
mas01mc@324
|
494 *snpp += trackTable[trackID];
|
mas01mc@324
|
495 *sppp += trackTable[trackID];
|
mas01mc@324
|
496 *fvpp += trackTable[trackID] * dbH->dim;
|
mas01mc@324
|
497 }
|
mas01mc@292
|
498
|
mas01mc@292
|
499 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
|
mas01mc@292
|
500 std::cout.flush();
|
mas01mc@292
|
501 return true;
|
mas01mc@292
|
502 }
|
mas01mc@292
|
503
|
mas01mc@292
|
504 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
|
mas01mc@292
|
505 Uns32T collisionCount = 0;
|
mas01mc@292
|
506 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
|
mas01mc@324
|
507 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
|
mas01mc@324
|
508 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
|
mas01cr@458
|
509 collisionCount += lsh->insert_point((*vv)[pointID], audiodb_index_from_trackinfo(trackID, pointID, audiodb_lsh_n_point_bits(adb)));
|
mas01mc@324
|
510 spp+=sequenceHop;
|
mas01mc@311
|
511 }
|
mas01mc@292
|
512 return collisionCount;
|
mas01mc@292
|
513 }
|
mas01mc@292
|
514
|
mas01mc@292
|
515 /*********************** LSH retrieval ****************************/
|
mas01mc@292
|
516
|
mas01mc@292
|
517
|
mas01mc@292
|
518 // return true if indexed query performed else return false
|
mas01cr@473
|
519 int audiodb_index_init_query(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate, bool corep) {
|
mas01mc@292
|
520
|
mas01cr@465
|
521 uint32_t sequence_length = spec->qid.sequence_length;
|
mas01cr@465
|
522 double radius = spec->refine.radius;
|
mas01cr@465
|
523 if(!(audiodb_index_exists(adb->path, radius, sequence_length)))
|
mas01mc@292
|
524 return false;
|
mas01mc@292
|
525
|
mas01cr@465
|
526 char *indexName = audiodb_index_get_name(adb->path, radius, sequence_length);
|
mas01cr@460
|
527 if(!indexName) {
|
mas01cr@465
|
528 return false;
|
mas01cr@460
|
529 }
|
mas01mc@292
|
530
|
mas01cr@465
|
531 qstate->lsh = audiodb_index_allocate(adb, indexName, corep);
|
mas01cr@465
|
532
|
mas01cr@465
|
533 /* FIXME: it would be nice if the LSH library didn't make me do
|
mas01cr@465
|
534 * this. */
|
mas01cr@465
|
535 if((!corep) && (qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2)) {
|
mas01cr@465
|
536 delete qstate->lsh;
|
mas01cr@465
|
537 qstate->lsh = audiodb_index_allocate(adb, indexName, true);
|
mas01mc@292
|
538 }
|
mas01mc@292
|
539
|
mas01mc@292
|
540 delete[] indexName;
|
mas01mc@292
|
541 return true;
|
mas01mc@292
|
542 }
|
mas01mc@292
|
543
|
mas01cr@458
|
544 void audiodb_index_add_point_approximate(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
|
mas01cr@458
|
545 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
|
mas01cr@458
|
546 adb_t *adb = data->adb;
|
mas01cr@458
|
547 adb_qstate_internal_t *qstate = data->qstate;
|
mas01cr@458
|
548 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
|
mas01cr@458
|
549 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
|
mas01cr@458
|
550 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
|
mas01cr@458
|
551 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
|
mas01cr@458
|
552 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
|
mas01cr@424
|
553 adb_result_t r;
|
mas01cr@458
|
554 r.key = (*adb->keys)[trackID].c_str();
|
mas01cr@424
|
555 r.dist = dist;
|
mas01cr@424
|
556 r.qpos = qpos;
|
mas01cr@424
|
557 r.ipos = spos;
|
mas01cr@458
|
558 qstate->accumulator->add_point(&r);
|
mas01cr@424
|
559 }
|
mas01mc@292
|
560 }
|
mas01mc@292
|
561
|
mas01cr@463
|
562 // Maintain a queue of points to pass to audiodb_query_queue_loop()
|
mas01cr@463
|
563 // for exact evaluation
|
mas01cr@458
|
564 void audiodb_index_add_point_exact(void *user_data, Uns32T pointID, Uns32T qpos, float dist) {
|
mas01cr@458
|
565 adb_qcallback_t *data = (adb_qcallback_t *) user_data;
|
mas01cr@458
|
566 adb_t *adb = data->adb;
|
mas01cr@458
|
567 adb_qstate_internal_t *qstate = data->qstate;
|
mas01cr@458
|
568 uint32_t nbits = audiodb_lsh_n_point_bits(adb);
|
mas01cr@458
|
569 uint32_t trackID = audiodb_index_to_track_id(pointID, nbits);
|
mas01cr@458
|
570 uint32_t spos = audiodb_index_to_track_pos(pointID, nbits);
|
mas01cr@458
|
571 std::set<std::string>::iterator keys_end = qstate->allowed_keys->end();
|
mas01cr@458
|
572 if(qstate->allowed_keys->find((*adb->keys)[trackID]) != keys_end) {
|
mas01cr@458
|
573 PointPair p(trackID, qpos, spos);
|
mas01cr@458
|
574 qstate->exact_evaluation_queue->push(p);
|
mas01cr@458
|
575 }
|
mas01mc@292
|
576 }
|
mas01mc@292
|
577
|
mas01cr@466
|
578 // return -1 on error
|
mas01mc@292
|
579 // return 0: if index does not exist
|
mas01mc@292
|
580 // return nqv: if index exists
|
mas01cr@473
|
581 int audiodb_index_query_loop(adb_t *adb, const adb_query_spec_t *spec, adb_qstate_internal_t *qstate) {
|
mas01mc@292
|
582
|
mas01mc@324
|
583 double *query = 0, *query_data = 0;
|
mas01cr@437
|
584 adb_qpointers_internal_t qpointers = {0};
|
mas01cr@437
|
585
|
mas01cr@458
|
586 adb_qcallback_t callback_data;
|
mas01cr@458
|
587 callback_data.adb = adb;
|
mas01cr@458
|
588 callback_data.qstate = qstate;
|
mas01cr@458
|
589
|
mas01cr@466
|
590 void (*add_point_func)(void *, uint32_t, uint32_t, float);
|
mas01mc@292
|
591
|
mas01cr@466
|
592 uint32_t sequence_length = spec->qid.sequence_length;
|
mas01cr@466
|
593 bool normalized = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED);
|
mas01cr@466
|
594 double radius = spec->refine.radius;
|
mas01cr@466
|
595 bool use_absolute_threshold = spec->refine.flags & ADB_REFINE_ABSOLUTE_THRESHOLD;
|
mas01cr@466
|
596 double absolute_threshold = spec->refine.absolute_threshold;
|
mas01cr@431
|
597
|
mas01cr@468
|
598 if(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES) {
|
mas01cr@468
|
599 add_point_func = &audiodb_index_add_point_approximate;
|
mas01cr@468
|
600 } else {
|
mas01cr@458
|
601 qstate->exact_evaluation_queue = new std::priority_queue<PointPair>;
|
mas01cr@458
|
602 add_point_func = &audiodb_index_add_point_exact;
|
mas01mc@292
|
603 }
|
mas01mc@292
|
604
|
mas01cr@468
|
605 /* FIXME: this hardwired lsh_in_core is here to allow for a
|
mas01cr@468
|
606 * transition period while the need for the argument is worked
|
mas01cr@468
|
607 * through. Hopefully it will disappear again eventually. */
|
mas01cr@468
|
608 bool lsh_in_core = true;
|
mas01cr@468
|
609
|
mas01cr@465
|
610 if(!audiodb_index_init_query(adb, spec, qstate, lsh_in_core)) {
|
mas01mc@292
|
611 return 0;
|
mas01cr@465
|
612 }
|
mas01mc@292
|
613
|
mas01cr@466
|
614 char *database = audiodb_index_get_name(adb->path, radius, sequence_length);
|
mas01cr@460
|
615 if(!database) {
|
mas01cr@466
|
616 return -1;
|
mas01cr@460
|
617 }
|
mas01mc@292
|
618
|
mas01cr@444
|
619 if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
|
mas01cr@466
|
620 delete [] database;
|
mas01cr@466
|
621 return -1;
|
mas01cr@444
|
622 }
|
mas01mc@292
|
623
|
mas01cr@466
|
624 uint32_t Nq = (qpointers.nvectors > O2_MAXTRACKLEN ? O2_MAXTRACKLEN : qpointers.nvectors) - sequence_length + 1;
|
mas01cr@466
|
625 std::vector<std::vector<float> > *vv = audiodb_index_initialize_shingles(Nq, adb->header->dim, sequence_length);
|
mas01cr@447
|
626
|
mas01mc@292
|
627 // Construct shingles from query features
|
mas01cr@468
|
628 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
|
mas01cr@466
|
629 audiodb_index_make_shingle(vv, pointID, query, adb->header->dim, sequence_length);
|
mas01cr@468
|
630 }
|
mas01mc@292
|
631
|
mas01mc@292
|
632 // Normalize query vectors
|
mas01cr@466
|
633 int vcount = audiodb_index_norm_shingles(vv, qpointers.l2norm, qpointers.power, adb->header->dim, sequence_length, radius, normalized, use_absolute_threshold, absolute_threshold);
|
mas01cr@459
|
634 if(vcount == -1) {
|
mas01cr@459
|
635 audiodb_index_delete_shingles(vv);
|
mas01cr@466
|
636 delete [] database;
|
mas01cr@466
|
637 return -1;
|
mas01cr@459
|
638 }
|
mas01cr@466
|
639 uint32_t numVecsAboveThreshold = vcount;
|
mas01mc@292
|
640
|
mas01mc@292
|
641 // Nq contains number of inspected points in query file,
|
mas01mc@292
|
642 // numVecsAboveThreshold is number of points with power >= absolute_threshold
|
mas01cr@466
|
643 double *qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation
|
mas01cr@468
|
644 if(!(spec->qid.flags & ADB_QID_FLAG_EXHAUSTIVE) && numVecsAboveThreshold) {
|
mas01cr@466
|
645 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
|
mas01cr@466
|
646 qstate->lsh->retrieve_point((*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
|
mas01cr@466
|
647 } else {
|
mas01cr@466
|
648 qstate->lsh->serial_retrieve_point(database, (*vv)[0], spec->qid.sequence_start, add_point_func, &callback_data);
|
mas01cr@466
|
649 }
|
mas01cr@466
|
650 } else if(numVecsAboveThreshold) {
|
mas01cr@466
|
651 for(uint32_t pointID = 0; pointID < Nq; pointID++) {
|
mas01cr@370
|
652 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
|
mas01cr@466
|
653 if((qstate->lsh->get_lshHeader()->flags & O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
|
mas01cr@465
|
654 qstate->lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, &callback_data);
|
mas01cr@370
|
655 } else {
|
mas01cr@465
|
656 qstate->lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, &callback_data);
|
mas01cr@370
|
657 }
|
mas01cr@370
|
658 }
|
mas01cr@466
|
659 }
|
mas01cr@466
|
660 }
|
mas01cr@459
|
661 audiodb_index_delete_shingles(vv);
|
mas01mc@292
|
662
|
mas01cr@468
|
663 if(!(spec->qid.flags & ADB_QID_FLAG_ALLOW_FALSE_POSITIVES)) {
|
mas01cr@463
|
664 audiodb_query_queue_loop(adb, spec, qstate, query, &qpointers);
|
mas01cr@466
|
665 }
|
mas01mc@292
|
666
|
mas01mc@292
|
667 // Clean up
|
mas01mc@292
|
668 if(query_data)
|
mas01mc@292
|
669 delete[] query_data;
|
mas01cr@437
|
670 if(qpointers.l2norm_data)
|
mas01cr@437
|
671 delete[] qpointers.l2norm_data;
|
mas01cr@437
|
672 if(qpointers.power_data)
|
mas01cr@437
|
673 delete[] qpointers.power_data;
|
mas01cr@437
|
674 if(qpointers.mean_duration)
|
mas01cr@437
|
675 delete[] qpointers.mean_duration;
|
mas01mc@292
|
676 if(database)
|
mas01mc@292
|
677 delete[] database;
|
mas01cr@465
|
678 if(qstate->lsh != adb->cached_lsh)
|
mas01cr@465
|
679 delete qstate->lsh;
|
mas01mc@292
|
680
|
mas01mc@292
|
681 return Nq;
|
mas01mc@292
|
682 }
|
mas01mc@292
|
683
|