mas01mc@292
|
1 // LSH indexing
|
mas01mc@292
|
2 //
|
mas01mc@292
|
3 // Construct a persistent LSH table structure
|
mas01mc@292
|
4 // Store at the same location as dbName
|
mas01mc@292
|
5 // Naming convention:
|
mas01mc@292
|
6 // dbName.lsh.${radius}.${sequenceLength}
|
mas01mc@292
|
7 //
|
mas01mc@292
|
8 //
|
mas01mc@292
|
9 // Author: Michael Casey
|
mas01mc@292
|
10 // Date: 23 June 2008
|
mas01mc@324
|
11 //
|
mas01mc@324
|
12 // 19th August 2008 - added O2_FLAG_LARGE_ADB support
|
mas01mc@292
|
13
|
mas01mc@292
|
14 #include "audioDB.h"
|
mas01cr@426
|
15 #include "audioDB-internals.h"
|
mas01mc@292
|
16
|
mas01mc@292
|
17 /************************* LSH point index to audioDB conversion *****************/
|
mas01mc@324
|
18 Uns32T audioDB::index_to_trackID(Uns32T lshID, Uns32T nPntBits){
|
mas01mc@324
|
19 assert(nPntBits);
|
mas01mc@324
|
20 return lshID>>nPntBits;
|
mas01mc@292
|
21 }
|
mas01mc@292
|
22
|
mas01mc@324
|
23 Uns32T audioDB::index_to_trackPos(Uns32T lshID, Uns32T nPntBits){
|
mas01mc@324
|
24 assert(nPntBits);
|
mas01mc@324
|
25 return lshID&((1<<nPntBits)-1);
|
mas01mc@292
|
26 }
|
mas01mc@292
|
27
|
mas01mc@324
|
28 Uns32T audioDB::index_from_trackInfo(Uns32T trackID, Uns32T spos, Uns32T nPntBits){
|
mas01mc@324
|
29 assert(nPntBits);
|
mas01mc@324
|
30 return (trackID << nPntBits) | spos;
|
mas01mc@292
|
31 }
|
mas01mc@292
|
32
|
mas01mc@292
|
33 /************************* LSH indexing and query initialization *****************/
|
mas01mc@292
|
34
|
mas01mc@292
|
35 char* audioDB::index_get_name(const char*dbName, double radius, Uns32T sequenceLength){
|
mas01mc@292
|
36 char* indexName = new char[MAXSTR];
|
mas01mc@292
|
37 // Attempt to make new file
|
mas01mc@292
|
38 if(strlen(dbName) > (MAXSTR - 32))
|
mas01mc@292
|
39 error("dbName is too long for LSH index filename appendages");
|
mas01mc@292
|
40 strncpy(indexName, dbName, MAXSTR);
|
mas01mc@292
|
41 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
|
mas01mc@292
|
42 return indexName;
|
mas01mc@292
|
43 }
|
mas01mc@292
|
44
|
mas01mc@292
|
45 // return true if index exists else return false
|
mas01mc@292
|
46 int audioDB::index_exists(const char* dbName, double radius, Uns32T sequenceLength){
|
mas01mc@292
|
47 // Test to see if file exists
|
mas01mc@292
|
48 char* indexName = index_get_name(dbName, radius, sequenceLength);
|
mas01mc@292
|
49 lshfid = open (indexName, O_RDONLY);
|
mas01mc@292
|
50 delete[] indexName;
|
mas01mc@292
|
51 close(lshfid);
|
mas01mc@292
|
52
|
mas01mc@292
|
53 if(lshfid<0)
|
mas01mc@292
|
54 return false;
|
mas01mc@292
|
55 else
|
mas01mc@292
|
56 return true;
|
mas01mc@292
|
57 }
|
mas01mc@292
|
58
|
mas01mc@324
|
59 // If we are a server and have a memory-resident index, check the indexName against the resident index (using get_indexName())
|
mas01mc@324
|
60 // If they match, i.e. path+dbName_resident == path+dbName_requested, use
|
mas01mc@324
|
61 // the memory-resident index.
|
mas01mc@324
|
62 // Else allocate a new LSH instance and load the index from disk
|
mas01mc@308
|
63 LSH* audioDB::index_allocate(char* indexName, bool load_hashTables){
|
mas01mc@308
|
64 LSH* gIndx=SERVER_LSH_INDEX_SINGLETON;
|
mas01mc@308
|
65 if(isServer && gIndx && (strncmp(gIndx->get_indexName(), indexName, MAXSTR)==0) )
|
mas01mc@308
|
66 audioDB::lsh = gIndx; // Use the global SERVER resident index
|
mas01mc@308
|
67 else{
|
mas01mc@308
|
68 if(audioDB::lsh)
|
mas01mc@308
|
69 delete audioDB::lsh;
|
mas01mc@308
|
70 audioDB::lsh = new LSH(indexName, load_hashTables);
|
mas01mc@308
|
71 }
|
mas01mc@308
|
72 assert(audioDB::lsh);
|
mas01mc@308
|
73 return audioDB::lsh;
|
mas01mc@308
|
74 }
|
mas01mc@308
|
75
|
mas01mc@292
|
76 vector<vector<float> >* audioDB::index_initialize_shingles(Uns32T sz){
|
mas01mc@292
|
77 if(vv)
|
mas01mc@292
|
78 delete vv;
|
mas01mc@292
|
79 vv = new vector<vector<float> >(sz);
|
mas01mc@292
|
80 for(Uns32T i=0 ; i < sz ; i++)
|
mas01mc@292
|
81 (*vv)[i]=vector<float>(dbH->dim*sequenceLength); // allocate shingle storage
|
mas01mc@292
|
82 return vv;
|
mas01mc@292
|
83 }
|
mas01mc@292
|
84
|
mas01mc@292
|
85 /******************** LSH indexing audioDB database access forall s \in {S} ***********************/
|
mas01mc@292
|
86
|
mas01mc@292
|
87 // Prepare the AudioDB database for read access and allocate auxillary memory
|
mas01mc@292
|
88 void audioDB::index_initialize(double **snp, double **vsnp, double **spp, double **vspp, Uns32T *dvp) {
|
mas01mc@324
|
89 if (!(dbH->flags & O2_FLAG_POWER)) {
|
mas01mc@324
|
90 error("INDEXed database must be power-enabled", dbName);
|
mas01mc@324
|
91 }
|
mas01mc@324
|
92
|
mas01mc@325
|
93 double *snpp = 0, *sppp = 0;
|
mas01mc@324
|
94
|
mas01mc@292
|
95 *dvp = dbH->length / (dbH->dim * sizeof(double)); // number of database vectors
|
mas01mc@292
|
96 *snp = new double[*dvp]; // songs norm pointer: L2 norm table for each vector
|
mas01mc@325
|
97 snpp = *snp;
|
mas01mc@292
|
98 *spp = new double[*dvp]; // song powertable pointer
|
mas01mc@292
|
99 sppp = *spp;
|
mas01mc@325
|
100
|
mas01mc@324
|
101 memcpy(*snp, l2normTable, *dvp * sizeof(double));
|
mas01mc@292
|
102 memcpy(*spp, powerTable, *dvp * sizeof(double));
|
mas01mc@324
|
103
|
mas01mc@324
|
104
|
mas01mc@292
|
105 for(Uns32T i = 0; i < dbH->numFiles; i++){
|
mas01mc@292
|
106 if(trackTable[i] >= sequenceLength) {
|
mas01cr@427
|
107 audiodb_sequence_sum(snpp, trackTable[i], sequenceLength);
|
mas01cr@427
|
108 audiodb_sequence_sqrt(snpp, trackTable[i], sequenceLength);
|
mas01mc@292
|
109
|
mas01cr@427
|
110 audiodb_sequence_sum(sppp, trackTable[i], sequenceLength);
|
mas01cr@427
|
111 audiodb_sequence_average(sppp, trackTable[i], sequenceLength);
|
mas01mc@292
|
112 }
|
mas01mc@292
|
113 snpp += trackTable[i];
|
mas01mc@292
|
114 sppp += trackTable[i];
|
mas01mc@292
|
115 }
|
mas01mc@324
|
116
|
mas01mc@292
|
117 *vsnp = *snp;
|
mas01mc@292
|
118 *vspp = *spp;
|
mas01mc@324
|
119
|
mas01mc@292
|
120 // Move the feature vector read pointer to start of fetures in database
|
mas01mc@292
|
121 lseek(dbfid, dbH->dataOffset, SEEK_SET);
|
mas01mc@292
|
122 }
|
mas01mc@292
|
123
|
mas01mc@292
|
124
|
mas01cr@456
|
125 /********************* LSH shingle construction ***************************/
|
mas01cr@456
|
126
|
mas01cr@456
|
127 // Construct shingles out of a feature matrix
|
mas01cr@456
|
128 // inputs:
|
mas01cr@456
|
129 // idx is vector index in feature matrix
|
mas01cr@456
|
130 // fvp is base feature matrix pointer double* [numVecs x dbH->dim]
|
mas01cr@456
|
131 //
|
mas01cr@456
|
132 // pre-conditions:
|
mas01cr@456
|
133 // dbH->dim
|
mas01cr@456
|
134 // sequenceLength
|
mas01cr@456
|
135 // idx < numVectors - sequenceLength + 1
|
mas01cr@456
|
136 //
|
mas01cr@456
|
137 // post-conditions:
|
mas01cr@456
|
138 // (*vv)[idx] contains a shingle with dbH->dim*sequenceLength float values
|
mas01cr@456
|
139
|
mas01cr@456
|
140 static void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
|
mas01cr@456
|
141 assert(idx<(*vv).size());
|
mas01cr@456
|
142 vector<float>::iterator ve = (*vv)[idx].end();
|
mas01cr@456
|
143 vector<float>::iterator vi = (*vv)[idx].begin();
|
mas01cr@456
|
144 // First feature vector in shingle
|
mas01cr@456
|
145 if(idx == 0) {
|
mas01cr@456
|
146 while(vi!=ve) {
|
mas01cr@456
|
147 *vi++ = (float)(*fvp++);
|
mas01cr@456
|
148 }
|
mas01cr@456
|
149 } else {
|
mas01cr@456
|
150 // Not first feature vector in shingle
|
mas01cr@456
|
151 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
|
mas01cr@456
|
152 // Previous seqLen-1 dim-vectors
|
mas01cr@456
|
153 while(vi!=ve-dim) {
|
mas01cr@456
|
154 *vi++ = *ui++;
|
mas01cr@456
|
155 }
|
mas01cr@456
|
156 // Move data pointer to next feature vector
|
mas01cr@456
|
157 fvp += ( seqLen + idx - 1 ) * dim ;
|
mas01cr@456
|
158 // New d-vector
|
mas01cr@456
|
159 while(vi!=ve) {
|
mas01cr@456
|
160 *vi++ = (float)(*fvp++);
|
mas01cr@456
|
161 }
|
mas01cr@456
|
162 }
|
mas01cr@456
|
163 }
|
mas01cr@456
|
164
|
mas01cr@456
|
165 // norm shingles
|
mas01cr@456
|
166 // in-place norming, no deletions
|
mas01cr@456
|
167 // If using power, return number of shingles above power threshold
|
mas01cr@456
|
168 int audioDB::index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp){
|
mas01cr@456
|
169 int z = 0; // number of above-threshold shingles
|
mas01cr@456
|
170 float l2norm;
|
mas01cr@456
|
171 double power;
|
mas01cr@456
|
172 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
|
mas01cr@456
|
173 float oneOverSqrtl2NormDivRad = oneOverRadius;
|
mas01cr@456
|
174 if(!spp)
|
mas01cr@456
|
175 error("LSH indexing and query requires a power feature using -w or -W");
|
mas01cr@456
|
176 Uns32T shingleSize = sequenceLength*dbH->dim;
|
mas01cr@456
|
177 for(Uns32T a=0; a<(*vv).size(); a++){
|
mas01cr@456
|
178 l2norm = (float)(*snp++);
|
mas01cr@456
|
179 if(audioDB::normalizedDistance)
|
mas01cr@456
|
180 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
|
mas01cr@456
|
181
|
mas01cr@456
|
182 for(Uns32T b=0; b < shingleSize ; b++)
|
mas01cr@456
|
183 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
|
mas01cr@456
|
184
|
mas01cr@456
|
185 power = *spp++;
|
mas01cr@456
|
186 if(use_absolute_threshold){
|
mas01cr@456
|
187 if ( power >= absolute_threshold )
|
mas01cr@456
|
188 z++;
|
mas01cr@456
|
189 }
|
mas01cr@456
|
190 else
|
mas01cr@456
|
191 z++;
|
mas01cr@456
|
192 }
|
mas01cr@456
|
193 return z;
|
mas01cr@456
|
194 }
|
mas01cr@456
|
195
|
mas01cr@456
|
196
|
mas01mc@292
|
197 /************************ LSH indexing ***********************************/
|
mas01mc@292
|
198 void audioDB::index_index_db(const char* dbName){
|
mas01mc@292
|
199 char* newIndexName;
|
mas01mc@292
|
200 double *fvp = 0, *sNorm = 0, *snPtr = 0, *sPower = 0, *spPtr = 0;
|
mas01mc@292
|
201 Uns32T dbVectors = 0;
|
mas01mc@292
|
202
|
mas01mc@324
|
203
|
mas01mc@292
|
204 printf("INDEX: initializing header\n");
|
mas01mc@292
|
205 // Check if audioDB exists, initialize header and open database for read
|
mas01mc@292
|
206 forWrite = false;
|
mas01mc@292
|
207 initDBHeader(dbName);
|
mas01mc@292
|
208
|
mas01mc@324
|
209 if(dbH->flags & O2_FLAG_POWER)
|
mas01mc@324
|
210 usingPower = true;
|
mas01mc@324
|
211
|
mas01mc@324
|
212 if(dbH->flags & O2_FLAG_TIMES)
|
mas01mc@324
|
213 usingTimes = true;
|
mas01mc@324
|
214
|
mas01mc@292
|
215 newIndexName = index_get_name(dbName, radius, sequenceLength);
|
mas01mc@292
|
216
|
mas01mc@292
|
217 // Set unit norming flag override
|
mas01mc@292
|
218 audioDB::normalizedDistance = !audioDB::no_unit_norming;
|
mas01mc@292
|
219
|
mas01mc@327
|
220 VERB_LOG(1, "INDEX: dim %d\n", (int)dbH->dim);
|
mas01mc@327
|
221 VERB_LOG(1, "INDEX: R %f\n", radius);
|
mas01mc@327
|
222 VERB_LOG(1, "INDEX: seqlen %d\n", sequenceLength);
|
mas01mc@327
|
223 VERB_LOG(1, "INDEX: lsh_w %f\n", lsh_param_w);
|
mas01mc@327
|
224 VERB_LOG(1, "INDEX: lsh_k %d\n", lsh_param_k);
|
mas01mc@327
|
225 VERB_LOG(1, "INDEX: lsh_m %d\n", lsh_param_m);
|
mas01mc@327
|
226 VERB_LOG(1, "INDEX: lsh_N %d\n", lsh_param_N);
|
mas01mc@327
|
227 VERB_LOG(1, "INDEX: lsh_C %d\n", lsh_param_ncols);
|
mas01mc@327
|
228 VERB_LOG(1, "INDEX: lsh_b %d\n", lsh_param_b);
|
mas01mc@327
|
229 VERB_LOG(1, "INDEX: normalized? %s\n", normalizedDistance?"true":"false");
|
mas01mc@292
|
230
|
mas01mc@292
|
231 if((lshfid = open(newIndexName,O_RDONLY))<0){
|
mas01mc@292
|
232 printf("INDEX: constructing new LSH index\n");
|
mas01mc@292
|
233 printf("INDEX: making index file %s\n", newIndexName);
|
mas01mc@292
|
234 fflush(stdout);
|
mas01mc@292
|
235 // Construct new LSH index
|
mas01mc@292
|
236 lsh = new LSH((float)lsh_param_w, lsh_param_k,
|
mas01mc@292
|
237 lsh_param_m,
|
mas01mc@292
|
238 (Uns32T)(sequenceLength*dbH->dim),
|
mas01mc@292
|
239 lsh_param_N,
|
mas01mc@292
|
240 lsh_param_ncols,
|
mas01mc@292
|
241 (float)radius);
|
mas01mc@292
|
242 assert(lsh);
|
mas01mc@292
|
243
|
mas01mc@292
|
244 Uns32T endTrack = lsh_param_b;
|
mas01mc@292
|
245 if( endTrack > dbH->numFiles)
|
mas01mc@292
|
246 endTrack = dbH->numFiles;
|
mas01mc@292
|
247 // Insert up to lsh_param_b tracks
|
mas01mc@324
|
248 if( ! (dbH->flags & O2_FLAG_LARGE_ADB) ){
|
mas01mc@324
|
249 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
|
mas01mc@324
|
250 }
|
mas01mc@324
|
251 index_insert_tracks(0, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
|
mas01mc@292
|
252 lsh->serialize(newIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1);
|
mas01mc@292
|
253
|
mas01mc@292
|
254 // Clean up
|
mas01mc@292
|
255 delete lsh;
|
mas01mc@308
|
256 lsh = 0;
|
mas01mc@292
|
257 close(lshfid);
|
mas01mc@292
|
258 }
|
mas01mc@292
|
259
|
mas01mc@292
|
260 // Attempt to open LSH file
|
mas01mc@292
|
261 if((lshfid = open(newIndexName,O_RDONLY))>0){
|
mas01mc@292
|
262 printf("INDEX: merging with existing LSH index\n");
|
mas01mc@292
|
263 fflush(stdout);
|
mas01mc@340
|
264 char* mergeIndexName = newIndexName;
|
mas01mc@292
|
265
|
mas01mc@292
|
266 // Get the lsh header info and find how many tracks are inserted already
|
mas01mc@340
|
267 lsh = new LSH(mergeIndexName, false); // lshInCore=false to avoid loading hashTables here
|
mas01mc@292
|
268 assert(lsh);
|
mas01mc@324
|
269 Uns32T maxs = index_to_trackID(lsh->get_maxp(), lsh_n_point_bits)+1;
|
mas01mc@292
|
270 delete lsh;
|
mas01mc@308
|
271 lsh = 0;
|
mas01mc@292
|
272
|
mas01mc@340
|
273 // Insert up to lsh_param_b tracks
|
mas01mc@340
|
274 if( !sNorm && !(dbH->flags & O2_FLAG_LARGE_ADB) ){
|
mas01mc@340
|
275 index_initialize(&sNorm, &snPtr, &sPower, &spPtr, &dbVectors);
|
mas01mc@340
|
276 }
|
mas01mc@292
|
277 // This allows for updating index after more tracks are inserted into audioDB
|
mas01mc@292
|
278 for(Uns32T startTrack = maxs; startTrack < dbH->numFiles; startTrack+=lsh_param_b){
|
mas01mc@292
|
279
|
mas01mc@292
|
280 Uns32T endTrack = startTrack + lsh_param_b;
|
mas01mc@292
|
281 if( endTrack > dbH->numFiles)
|
mas01mc@292
|
282 endTrack = dbH->numFiles;
|
mas01mc@292
|
283 printf("Indexing track range: %d - %d\n", startTrack, endTrack);
|
mas01mc@292
|
284 fflush(stdout);
|
mas01mc@340
|
285 lsh = new LSH(mergeIndexName, false); // Initialize empty LSH tables
|
mas01mc@292
|
286 assert(lsh);
|
mas01mc@292
|
287
|
mas01mc@292
|
288 // Insert up to lsh_param_b database tracks
|
mas01mc@292
|
289 index_insert_tracks(startTrack, endTrack, &fvp, &sNorm, &snPtr, &sPower, &spPtr);
|
mas01mc@292
|
290
|
mas01mc@340
|
291 // Serialize to file (merging is performed here)
|
mas01mc@340
|
292 lsh->serialize(mergeIndexName, lsh_in_core?O2_SERIAL_FILEFORMAT2:O2_SERIAL_FILEFORMAT1); // Serialize core LSH heap to disk
|
mas01mc@292
|
293 delete lsh;
|
mas01mc@308
|
294 lsh = 0;
|
mas01mc@340
|
295 }
|
mas01mc@292
|
296
|
mas01mc@292
|
297 close(lshfid);
|
mas01mc@292
|
298 printf("INDEX: done constructing LSH index.\n");
|
mas01mc@292
|
299 fflush(stdout);
|
mas01mc@292
|
300
|
mas01mc@292
|
301 }
|
mas01mc@292
|
302 else{
|
mas01mc@292
|
303 error("Something's wrong with LSH index file");
|
mas01mc@292
|
304 exit(1);
|
mas01mc@292
|
305 }
|
mas01mc@292
|
306
|
mas01mc@324
|
307 delete[] newIndexName;
|
mas01mc@324
|
308 delete[] sNorm;
|
mas01mc@324
|
309 delete[] sPower;
|
mas01mc@324
|
310 }
|
mas01mc@292
|
311
|
mas01mc@292
|
312
|
mas01cr@405
|
313 void audioDB::insertPowerData(unsigned numVectors, int powerfd, double *powerdata) {
|
mas01cr@405
|
314 if(usingPower){
|
mas01cr@405
|
315 int one;
|
mas01cr@405
|
316 unsigned int count;
|
mas01cr@405
|
317
|
mas01cr@405
|
318 count = read(powerfd, &one, sizeof(unsigned int));
|
mas01cr@405
|
319 if (count != sizeof(unsigned int)) {
|
mas01cr@405
|
320 error("powerfd read failed", "int", "read");
|
mas01cr@405
|
321 }
|
mas01cr@405
|
322 if (one != 1) {
|
mas01cr@405
|
323 error("dimensionality of power file not 1", powerFileName);
|
mas01cr@405
|
324 }
|
mas01cr@405
|
325
|
mas01cr@405
|
326 // FIXME: should check that the powerfile is the right size for
|
mas01cr@405
|
327 // this. -- CSR, 2007-10-30
|
mas01cr@405
|
328 count = read(powerfd, powerdata, numVectors * sizeof(double));
|
mas01cr@405
|
329 if (count != numVectors * sizeof(double)) {
|
mas01cr@405
|
330 error("powerfd read failed", "double", "read");
|
mas01cr@405
|
331 }
|
mas01cr@405
|
332 }
|
mas01cr@405
|
333 }
|
mas01cr@405
|
334
|
mas01mc@324
|
335 // initialize auxillary track data from filesystem
|
mas01mc@324
|
336 // pre-conditions:
|
mas01mc@324
|
337 // dbH->flags & O2_FLAG_LARGE_ADB
|
mas01mc@324
|
338 // feature data allocated and copied (fvp)
|
mas01mc@324
|
339 //
|
mas01mc@324
|
340 // post-conditions:
|
mas01mc@324
|
341 // allocated power data
|
mas01mc@324
|
342 // allocated l2norm data
|
mas01mc@324
|
343 //
|
mas01mc@324
|
344 void audioDB::init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp){
|
mas01mc@324
|
345 if( !(dbH->flags & O2_FLAG_LARGE_ADB) )
|
mas01mc@324
|
346 error("error: init_track_large_adb required O2_FLAG_LARGE_ADB");
|
mas01mc@292
|
347
|
mas01mc@324
|
348 // Allocate and read the power sequence
|
mas01mc@324
|
349 if(trackTable[trackID]>=sequenceLength){
|
mas01mc@324
|
350
|
mas01mc@324
|
351 char* prefixedString = new char[O2_MAXFILESTR];
|
mas01mc@324
|
352 char* tmpStr = prefixedString;
|
mas01mc@324
|
353 // Open and check dimensions of power file
|
mas01mc@324
|
354 strncpy(prefixedString, powerFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
|
mas01mc@324
|
355 prefix_name((char ** const)&prefixedString, adb_feature_root);
|
mas01mc@324
|
356 if(prefixedString!=tmpStr)
|
mas01mc@324
|
357 delete[] tmpStr;
|
mas01mc@324
|
358 powerfd = open(prefixedString, O_RDONLY);
|
mas01mc@324
|
359 if (powerfd < 0) {
|
mas01mc@324
|
360 error("failed to open power file", prefixedString);
|
mas01mc@324
|
361 }
|
mas01mc@324
|
362 if (fstat(powerfd, &statbuf) < 0) {
|
mas01mc@324
|
363 error("fstat error finding size of power file", prefixedString, "fstat");
|
mas01mc@324
|
364 }
|
mas01mc@324
|
365
|
mas01mc@324
|
366 if( (statbuf.st_size - sizeof(int)) / (sizeof(double)) != trackTable[trackID] )
|
mas01mc@324
|
367 error("Dimension mismatch: numPowers != numVectors", prefixedString);
|
mas01mc@324
|
368
|
mas01mc@324
|
369 *sPowerp = new double[trackTable[trackID]]; // Allocate memory for power values
|
mas01mc@324
|
370 assert(*sPowerp);
|
mas01mc@324
|
371 *spPtrp = *sPowerp;
|
mas01mc@324
|
372 insertPowerData(trackTable[trackID], powerfd, *sPowerp);
|
mas01mc@324
|
373 if (0 < powerfd) {
|
mas01mc@324
|
374 close(powerfd);
|
mas01mc@324
|
375 }
|
mas01mc@324
|
376
|
mas01cr@427
|
377 audiodb_sequence_sum(*sPowerp, trackTable[trackID], sequenceLength);
|
mas01cr@427
|
378 audiodb_sequence_average(*sPowerp, trackTable[trackID], sequenceLength);
|
mas01mc@324
|
379 powerTable = 0;
|
mas01mc@324
|
380
|
mas01mc@324
|
381 // Allocate and calculate the l2norm sequence
|
mas01mc@324
|
382 *sNormpp = new double[trackTable[trackID]];
|
mas01mc@324
|
383 assert(*sNormpp);
|
mas01mc@324
|
384 *snPtrp = *sNormpp;
|
mas01cr@426
|
385 audiodb_l2norm_buffer(fvp, dbH->dim, trackTable[trackID], *sNormpp);
|
mas01cr@427
|
386 audiodb_sequence_sum(*sNormpp, trackTable[trackID], sequenceLength);
|
mas01cr@427
|
387 audiodb_sequence_sqrt(*sNormpp, trackTable[trackID], sequenceLength);
|
mas01mc@324
|
388 }
|
mas01mc@292
|
389 }
|
mas01mc@292
|
390
|
mas01mc@292
|
391 void audioDB::index_insert_tracks(Uns32T start_track, Uns32T end_track,
|
mas01mc@292
|
392 double** fvpp, double** sNormpp,double** snPtrp,
|
mas01mc@292
|
393 double** sPowerp, double** spPtrp){
|
mas01mc@292
|
394 size_t nfv = 0;
|
mas01mc@292
|
395 double* fvp = 0; // Keep pointer for memory allocation and free() for track data
|
mas01mc@292
|
396 Uns32T trackID = 0;
|
mas01mc@292
|
397
|
mas01mc@292
|
398 VERB_LOG(1, "indexing tracks...");
|
mas01mc@292
|
399
|
mas01mc@324
|
400 int trackfd = dbfid;
|
mas01mc@292
|
401 for(trackID = start_track ; trackID < end_track ; trackID++ ){
|
mas01mc@324
|
402 if( dbH->flags & O2_FLAG_LARGE_ADB ){
|
mas01mc@324
|
403 char* prefixedString = new char[O2_MAXFILESTR];
|
mas01mc@324
|
404 char* tmpStr = prefixedString;
|
mas01mc@324
|
405 // Open and check dimensions of feature file
|
mas01mc@324
|
406 strncpy(prefixedString, featureFileNameTable+trackID*O2_FILETABLE_ENTRY_SIZE, O2_MAXFILESTR);
|
mas01mc@324
|
407 prefix_name((char ** const) &prefixedString, adb_feature_root);
|
mas01mc@324
|
408 if(prefixedString!=tmpStr)
|
mas01mc@324
|
409 delete[] tmpStr;
|
mas01cr@454
|
410 initInputFile(prefixedString);
|
mas01mc@324
|
411 trackfd = infid;
|
mas01mc@324
|
412 }
|
mas01cr@433
|
413 if(audiodb_read_data(adb, trackfd, trackID, &fvp, &nfv))
|
mas01cr@433
|
414 error("failed to read data");
|
mas01mc@292
|
415 *fvpp = fvp; // Protect memory allocation and free() for track data
|
mas01mc@324
|
416
|
mas01mc@324
|
417 if( dbH->flags & O2_FLAG_LARGE_ADB )
|
mas01mc@324
|
418 // Load power and calculate power and l2norm sequence sums
|
mas01mc@324
|
419 init_track_aux_data(trackID, fvp, sNormpp, snPtrp, sPowerp, spPtrp);
|
mas01mc@324
|
420
|
mas01mc@292
|
421 if(!index_insert_track(trackID, fvpp, snPtrp, spPtrp))
|
mas01mc@292
|
422 break;
|
mas01mc@324
|
423 if ( dbH->flags & O2_FLAG_LARGE_ADB ){
|
mas01mc@324
|
424 close(infid);
|
mas01mc@324
|
425 delete[] *sNormpp;
|
mas01mc@324
|
426 delete[] *sPowerp;
|
mas01mc@324
|
427 *sNormpp = *sPowerp = *snPtrp = *snPtrp = 0;
|
mas01mc@324
|
428 }
|
mas01mc@324
|
429 } // end for(trackID = start_track ; ... )
|
mas01mc@292
|
430 std::cout << "finished inserting." << endl;
|
mas01mc@292
|
431 }
|
mas01mc@292
|
432
|
mas01mc@292
|
433 int audioDB::index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp){
|
mas01mc@292
|
434 // Loop over the current input track's vectors
|
mas01cr@305
|
435 Uns32T numVecs = 0;
|
mas01cr@305
|
436 if (trackTable[trackID] > O2_MAXTRACKLEN) {
|
mas01cr@305
|
437 if (O2_MAXTRACKLEN < sequenceLength - 1) {
|
mas01cr@305
|
438 numVecs = 0;
|
mas01cr@305
|
439 } else {
|
mas01cr@305
|
440 numVecs = O2_MAXTRACKLEN - sequenceLength + 1;
|
mas01cr@305
|
441 }
|
mas01cr@305
|
442 } else {
|
mas01cr@305
|
443 if (trackTable[trackID] < sequenceLength - 1) {
|
mas01cr@305
|
444 numVecs = 0;
|
mas01cr@305
|
445 } else {
|
mas01cr@305
|
446 numVecs = trackTable[trackID] - sequenceLength + 1;
|
mas01cr@305
|
447 }
|
mas01cr@305
|
448 }
|
mas01mc@292
|
449
|
mas01mc@324
|
450 Uns32T numVecsAboveThreshold = 0, collisionCount = 0;
|
mas01mc@324
|
451 if(numVecs){
|
mas01mc@324
|
452 vv = index_initialize_shingles(numVecs);
|
mas01mc@324
|
453
|
mas01mc@324
|
454 for( Uns32T pointID = 0 ; pointID < numVecs; pointID++ )
|
mas01cr@456
|
455 audiodb_index_make_shingle(vv, pointID, *fvpp, dbH->dim, sequenceLength);
|
mas01mc@324
|
456
|
mas01mc@324
|
457 numVecsAboveThreshold = index_norm_shingles(vv, *snpp, *sppp);
|
mas01mc@324
|
458 collisionCount = index_insert_shingles(vv, trackID, *sppp);
|
mas01mc@324
|
459 }
|
mas01mc@292
|
460 float meanCollisionCount = numVecsAboveThreshold?(float)collisionCount/numVecsAboveThreshold:0;
|
mas01mc@292
|
461
|
mas01mc@292
|
462 /* index_norm_shingles() only goes as far as the end of the
|
mas01mc@292
|
463 sequence, which is right, but the space allocated is for the
|
mas01mc@292
|
464 whole track. */
|
mas01mc@292
|
465
|
mas01mc@292
|
466 /* But numVecs will be <trackTable[track] if trackTable[track]>O2_MAXTRACKLEN
|
mas01mc@292
|
467 * So let's be certain the pointers are in the correct place
|
mas01mc@292
|
468 */
|
mas01mc@292
|
469
|
mas01mc@324
|
470 if( !(dbH->flags & O2_FLAG_LARGE_ADB) ){
|
mas01mc@324
|
471 *snpp += trackTable[trackID];
|
mas01mc@324
|
472 *sppp += trackTable[trackID];
|
mas01mc@324
|
473 *fvpp += trackTable[trackID] * dbH->dim;
|
mas01mc@324
|
474 }
|
mas01mc@292
|
475
|
mas01mc@292
|
476 std::cout << " n=" << trackTable[trackID] << " n'=" << numVecsAboveThreshold << " E[#c]=" << lsh->get_mean_collision_rate() << " E[#p]=" << meanCollisionCount << endl;
|
mas01mc@292
|
477 std::cout.flush();
|
mas01mc@292
|
478 return true;
|
mas01mc@292
|
479 }
|
mas01mc@292
|
480
|
mas01mc@292
|
481 Uns32T audioDB::index_insert_shingles(vector<vector<float> >* vv, Uns32T trackID, double* spp){
|
mas01mc@292
|
482 Uns32T collisionCount = 0;
|
mas01mc@292
|
483 cout << "[" << trackID << "]" << fileTable+trackID*O2_FILETABLE_ENTRY_SIZE;
|
mas01mc@324
|
484 for( Uns32T pointID=0 ; pointID < (*vv).size(); pointID+=sequenceHop){
|
mas01mc@324
|
485 if(!use_absolute_threshold || (use_absolute_threshold && (*spp >= absolute_threshold)))
|
mas01mc@324
|
486 collisionCount += lsh->insert_point((*vv)[pointID], index_from_trackInfo(trackID, pointID, lsh_n_point_bits));
|
mas01mc@324
|
487 spp+=sequenceHop;
|
mas01mc@311
|
488 }
|
mas01mc@292
|
489 return collisionCount;
|
mas01mc@292
|
490 }
|
mas01mc@292
|
491
|
mas01mc@292
|
492 /*********************** LSH retrieval ****************************/
|
mas01mc@292
|
493
|
mas01mc@292
|
494
|
mas01mc@292
|
495 // return true if indexed query performed else return false
|
mas01mc@292
|
496 int audioDB::index_init_query(const char* dbName){
|
mas01mc@292
|
497
|
mas01mc@292
|
498 if(!(index_exists(dbName, radius, sequenceLength)))
|
mas01mc@292
|
499 return false;
|
mas01mc@292
|
500
|
mas01mc@292
|
501 char* indexName = index_get_name(dbName, radius, sequenceLength);
|
mas01mc@292
|
502
|
mas01mc@292
|
503 // Test to see if file exists
|
mas01mc@292
|
504 if((lshfid = open (indexName, O_RDONLY)) < 0){
|
mas01mc@292
|
505 delete[] indexName;
|
mas01mc@292
|
506 return false;
|
mas01mc@292
|
507 }
|
mas01mc@292
|
508
|
mas01mc@308
|
509 lsh = index_allocate(indexName, false); // Get the header only here
|
mas01mc@292
|
510 sequenceLength = lsh->get_lshHeader()->dataDim / dbH->dim; // shingleDim / vectorDim
|
mas01mc@292
|
511
|
mas01mc@311
|
512 if(lsh!=SERVER_LSH_INDEX_SINGLETON){
|
mas01mc@308
|
513 if( fabs(radius - lsh->get_radius())>fabs(O2_DISTANCE_TOLERANCE))
|
mas01mc@308
|
514 printf("*** Warning: adb_radius (%f) != lsh_radius (%f) ***\n", radius, lsh->get_radius());
|
mas01mc@327
|
515 VERB_LOG(1,"INDEX: dim %d\n", (int)dbH->dim);
|
mas01mc@327
|
516 VERB_LOG(1,"INDEX: R %f\n", lsh->get_radius());
|
mas01mc@327
|
517 VERB_LOG(1,"INDEX: seqlen %d\n", sequenceLength);
|
mas01mc@327
|
518 VERB_LOG(1,"INDEX: w %f\n", lsh->get_lshHeader()->get_binWidth());
|
mas01mc@327
|
519 VERB_LOG(1,"INDEX: k %d\n", lsh->get_lshHeader()->get_numFuns());
|
mas01mc@327
|
520 VERB_LOG(1,"INDEX: L (m*(m-1))/2 %d\n", lsh->get_lshHeader()->get_numTables());
|
mas01mc@327
|
521 VERB_LOG(1,"INDEX: N %d\n", lsh->get_lshHeader()->get_numRows());
|
mas01mc@327
|
522 VERB_LOG(1,"INDEX: s %d\n", index_to_trackID(lsh->get_maxp(), lsh_n_point_bits));
|
mas01mc@327
|
523 VERB_LOG(1,"INDEX: Opened LSH index file %s\n", indexName);
|
mas01mc@308
|
524 }
|
mas01mc@292
|
525
|
mas01mc@292
|
526 // Check to see if we are loading hash tables into core, and do so if true
|
mas01mc@292
|
527 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core){
|
mas01mc@308
|
528 if(SERVER_LSH_INDEX_SINGLETON)
|
mas01mc@308
|
529 fprintf(stderr,"INDEX: using persistent hash tables: %s\n", lsh->get_indexName());
|
mas01mc@308
|
530 else
|
mas01mc@327
|
531 VERB_LOG(1,"INDEX: loading hash tables into core %s\n", (lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2)?"FORMAT2":"FORMAT1");
|
mas01mc@308
|
532 lsh = index_allocate(indexName, true);
|
mas01mc@292
|
533 }
|
mas01mc@292
|
534
|
mas01mc@292
|
535 delete[] indexName;
|
mas01mc@292
|
536 return true;
|
mas01mc@292
|
537 }
|
mas01mc@292
|
538
|
mas01mc@292
|
539 // *Static* approximate NN point reporter callback method for lshlib
|
mas01mc@292
|
540 void audioDB::index_add_point_approximate(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
|
mas01mc@292
|
541 assert(instancePtr); // We need an instance for this callback
|
mas01mc@292
|
542 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
|
mas01mc@324
|
543 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
|
mas01mc@324
|
544 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
|
mas01mc@292
|
545 // Skip identity in query_from_key
|
mas01cr@424
|
546 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) ) {
|
mas01cr@424
|
547 adb_result_t r;
|
mas01cr@424
|
548 r.key = myself->fileTable + trackID * O2_FILETABLE_ENTRY_SIZE;
|
mas01cr@424
|
549 r.dist = dist;
|
mas01cr@424
|
550 r.qpos = qpos;
|
mas01cr@424
|
551 r.ipos = spos;
|
mas01cr@424
|
552 myself->accumulator->add_point(&r);
|
mas01cr@424
|
553 }
|
mas01mc@292
|
554 }
|
mas01mc@292
|
555
|
mas01mc@292
|
556 // *Static* exact NN point reporter callback method for lshlib
|
mas01mc@292
|
557 // Maintain a queue of points to pass to query_points() for exact evaluation
|
mas01mc@292
|
558 void audioDB::index_add_point_exact(void* instancePtr, Uns32T pointID, Uns32T qpos, float dist){
|
mas01mc@292
|
559 assert(instancePtr); // We need an instance for this callback
|
mas01mc@292
|
560 audioDB* myself = (audioDB*) instancePtr; // Use explicit cast to recover "this" instance
|
mas01mc@324
|
561 Uns32T trackID = index_to_trackID(pointID, myself->lsh_n_point_bits);
|
mas01mc@324
|
562 Uns32T spos = index_to_trackPos(pointID, myself->lsh_n_point_bits);
|
mas01mc@292
|
563 // Skip identity in query_from_key
|
mas01mc@292
|
564 if( !myself->query_from_key || (myself->query_from_key && ( trackID != myself->query_from_key_index )) )
|
mas01mc@292
|
565 myself->index_insert_exact_evaluation_queue(trackID, qpos, spos);
|
mas01mc@292
|
566 }
|
mas01mc@292
|
567
|
mas01mc@292
|
568 void audioDB::initialize_exact_evalutation_queue(){
|
mas01mc@292
|
569 if(exact_evaluation_queue)
|
mas01mc@292
|
570 delete exact_evaluation_queue;
|
mas01mc@292
|
571 exact_evaluation_queue = new priority_queue<PointPair, std::vector<PointPair>, std::less<PointPair> >;
|
mas01mc@292
|
572 }
|
mas01mc@292
|
573
|
mas01mc@292
|
574 void audioDB::index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos){
|
mas01mc@292
|
575 PointPair p(trackID, qpos, spos);
|
mas01mc@292
|
576 exact_evaluation_queue->push(p);
|
mas01mc@292
|
577 }
|
mas01mc@292
|
578
|
mas01mc@292
|
579 // return 0: if index does not exist
|
mas01mc@292
|
580 // return nqv: if index exists
|
mas01cr@455
|
581 int audioDB::index_query_loop(adb_t *adb, adb_query_spec_t *spec) {
|
mas01mc@292
|
582
|
mas01mc@324
|
583 double *query = 0, *query_data = 0;
|
mas01cr@437
|
584 adb_qpointers_internal_t qpointers = {0};
|
mas01cr@437
|
585
|
mas01mc@292
|
586 void (*add_point_func)(void*,Uns32T,Uns32T,float);
|
mas01mc@292
|
587
|
mas01cr@436
|
588 sequenceLength = spec->qid.sequence_length;
|
mas01cr@435
|
589 normalizedDistance = (spec->params.distance == ADB_DISTANCE_EUCLIDEAN_NORMED);
|
mas01cr@431
|
590
|
mas01mc@292
|
591 // Set the point-reporter callback based on the value of lsh_exact
|
mas01mc@292
|
592 if(lsh_exact){
|
mas01mc@292
|
593 initialize_exact_evalutation_queue();
|
mas01mc@292
|
594 add_point_func = &index_add_point_exact;
|
mas01mc@292
|
595 }
|
mas01mc@292
|
596 else
|
mas01mc@292
|
597 add_point_func = &index_add_point_approximate;
|
mas01mc@292
|
598
|
mas01cr@455
|
599 if(!index_init_query(adb->path)) // sets-up LSH index structures for querying
|
mas01mc@292
|
600 return 0;
|
mas01mc@292
|
601
|
mas01cr@455
|
602 char* database = index_get_name(adb->path, radius, sequenceLength);
|
mas01mc@292
|
603
|
mas01cr@444
|
604 if(audiodb_query_spec_qpointers(adb, spec, &query_data, &query, &qpointers)) {
|
mas01cr@444
|
605 error("failed to set up qpointers");
|
mas01cr@444
|
606 }
|
mas01mc@292
|
607
|
mas01mc@292
|
608 // query vector index
|
mas01cr@437
|
609 Uns32T Nq = (qpointers.nvectors>O2_MAXTRACKLEN?O2_MAXTRACKLEN:qpointers.nvectors) - sequenceLength + 1;
|
mas01mc@292
|
610 vv = index_initialize_shingles(Nq); // allocate memory to copy query vectors to shingles
|
mas01cr@447
|
611
|
mas01mc@292
|
612 // Construct shingles from query features
|
mas01mc@292
|
613 for( Uns32T pointID = 0 ; pointID < Nq ; pointID++ )
|
mas01cr@456
|
614 audiodb_index_make_shingle(vv, pointID, query, dbH->dim, sequenceLength);
|
mas01mc@292
|
615
|
mas01mc@292
|
616 // Normalize query vectors
|
mas01cr@437
|
617 Uns32T numVecsAboveThreshold = index_norm_shingles( vv, qpointers.l2norm, qpointers.power );
|
mas01mc@292
|
618
|
mas01mc@292
|
619 // Nq contains number of inspected points in query file,
|
mas01mc@292
|
620 // numVecsAboveThreshold is number of points with power >= absolute_threshold
|
mas01cr@437
|
621 double* qpp = qpointers.power; // Keep original qpPtr for possible exact evaluation
|
mas01mc@292
|
622 if(usingQueryPoint && numVecsAboveThreshold){
|
mas01mc@292
|
623 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core)
|
mas01mc@292
|
624 lsh->retrieve_point((*vv)[0], queryPoint, add_point_func, (void*)this);
|
mas01mc@292
|
625 else
|
mas01mc@292
|
626 lsh->serial_retrieve_point(database, (*vv)[0], queryPoint, add_point_func, (void*)this);
|
mas01mc@292
|
627 }
|
mas01mc@292
|
628 else if(numVecsAboveThreshold)
|
mas01mc@292
|
629 for( Uns32T pointID = 0 ; pointID < Nq; pointID++ )
|
mas01cr@370
|
630 if(!use_absolute_threshold || (use_absolute_threshold && (*qpp++ >= absolute_threshold))) {
|
mas01cr@370
|
631 if((lsh->get_lshHeader()->flags&O2_SERIAL_FILEFORMAT2) || lsh_in_core) {
|
mas01mc@292
|
632 lsh->retrieve_point((*vv)[pointID], pointID, add_point_func, (void*)this);
|
mas01cr@370
|
633 } else {
|
mas01mc@292
|
634 lsh->serial_retrieve_point(database, (*vv)[pointID], pointID, add_point_func, (void*)this);
|
mas01cr@370
|
635 }
|
mas01cr@370
|
636 }
|
mas01mc@292
|
637
|
mas01mc@292
|
638 if(lsh_exact)
|
mas01mc@292
|
639 // Perform exact distance computation on point pairs in exact_evaluation_queue
|
mas01cr@455
|
640 query_loop_points(adb, spec, query, &qpointers);
|
mas01mc@292
|
641
|
mas01mc@292
|
642 // Close the index file
|
mas01mc@292
|
643 close(lshfid);
|
mas01mc@292
|
644
|
mas01mc@292
|
645 // Clean up
|
mas01mc@292
|
646 if(query_data)
|
mas01mc@292
|
647 delete[] query_data;
|
mas01cr@437
|
648 if(qpointers.l2norm_data)
|
mas01cr@437
|
649 delete[] qpointers.l2norm_data;
|
mas01cr@437
|
650 if(qpointers.power_data)
|
mas01cr@437
|
651 delete[] qpointers.power_data;
|
mas01cr@437
|
652 if(qpointers.mean_duration)
|
mas01cr@437
|
653 delete[] qpointers.mean_duration;
|
mas01mc@292
|
654 if(database)
|
mas01mc@292
|
655 delete[] database;
|
mas01mc@292
|
656
|
mas01mc@292
|
657 return Nq;
|
mas01mc@292
|
658 }
|
mas01mc@292
|
659
|