mas01mc@292
|
1 #include "lshlib.h"
|
mas01mc@292
|
2
|
mas01cr@648
|
3 #define getpagesize() (64*1024)
|
mas01mc@292
|
4
|
mas01mc@764
|
5 Uns32T get_page_logn(){
|
mas01mc@764
|
6 int pagesz = (int)sysconf(_SC_PAGESIZE);
|
mas01mc@764
|
7 return (Uns32T)log2((double)pagesz);
|
mas01mc@292
|
8 }
|
mas01mc@292
|
9
|
mas01mc@764
|
10 unsigned align_up(unsigned x, unsigned w) { return (((x) + ((1<<w)-1)) & ~((1<<w)-1)); }
|
mas01mc@764
|
11
|
mas01mc@292
|
12 void H::error(const char* a, const char* b, const char *sysFunc) {
|
mas01mc@292
|
13 cerr << a << ": " << b << endl;
|
mas01mc@292
|
14 if (sysFunc) {
|
mas01mc@292
|
15 perror(sysFunc);
|
mas01mc@292
|
16 }
|
mas01mc@292
|
17 exit(1);
|
mas01mc@292
|
18 }
|
mas01mc@292
|
19
|
mas01mc@764
|
20 H::H():
|
mas01mc@764
|
21 multiProbePtr(new MultiProbe()),
|
mas01mc@764
|
22 boundaryDistances(0)
|
mas01mc@764
|
23 {
|
mas01mc@293
|
24 // Delay initialization of lsh functions until we know the parameters
|
mas01mc@293
|
25 }
|
mas01mc@293
|
26
|
mas01mc@293
|
27 H::H(Uns32T kk, Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float ww, float rr):
|
mas01mc@292
|
28 #ifdef USE_U_FUNCTIONS
|
mas01mc@292
|
29 use_u_functions(true),
|
mas01mc@292
|
30 #else
|
mas01mc@292
|
31 use_u_functions(false),
|
mas01mc@292
|
32 #endif
|
mas01mc@293
|
33 maxp(0),
|
mas01mc@292
|
34 bucketCount(0),
|
mas01mc@292
|
35 pointCount(0),
|
mas01mc@292
|
36 N(NN),
|
mas01mc@292
|
37 C(CC),
|
mas01mc@292
|
38 k(kk),
|
mas01mc@292
|
39 m(mm),
|
mas01mc@293
|
40 L((mm*(mm-1))/2),
|
mas01mc@293
|
41 d(dd),
|
mas01mc@293
|
42 w(ww),
|
mas01mc@764
|
43 radius(rr),
|
mas01mc@764
|
44 multiProbePtr(new MultiProbe()),
|
mas01mc@764
|
45 boundaryDistances(0)
|
mas01mc@292
|
46 {
|
mas01mc@292
|
47
|
mas01mc@292
|
48 if(m<2){
|
mas01mc@292
|
49 m=2;
|
mas01mc@292
|
50 L=1; // check value of L
|
mas01mc@292
|
51 cout << "warning: setting m=2, L=1" << endl;
|
mas01mc@292
|
52 }
|
mas01mc@292
|
53 if(use_u_functions && k%2){
|
mas01mc@292
|
54 k++; // make sure k is even
|
mas01mc@292
|
55 cout << "warning: setting k even" << endl;
|
mas01mc@292
|
56 }
|
mas01mc@293
|
57
|
mas01mc@293
|
58 // We have the necessary parameters, so construct hashfunction datastructures
|
mas01mc@293
|
59 initialize_lsh_functions();
|
mas01mc@292
|
60 }
|
mas01mc@292
|
61
|
mas01mc@293
|
62 void H::initialize_lsh_functions(){
|
mas01mc@292
|
63 H::P = UH_PRIME_DEFAULT;
|
mas01mc@292
|
64
|
mas01mc@292
|
65 /* FIXME: don't use time(); instead use /dev/random or similar */
|
mas01mc@292
|
66 /* FIXME: write out the seed somewhere, so that we can get
|
mas01mc@292
|
67 repeatability */
|
mas01mc@292
|
68 #ifdef MT19937
|
mas01mc@292
|
69 init_genrand(time(NULL));
|
mas01mc@292
|
70 #else
|
mas01mc@292
|
71 srand(time(NULL)); // seed random number generator
|
mas01mc@292
|
72 #endif
|
mas01mc@293
|
73 Uns32T i,j, kk;
|
mas01mc@293
|
74 #ifdef USE_U_FUNCTIONS
|
mas01mc@293
|
75 H::A = new float**[ H::m ]; // m x k x d random projectors
|
mas01mc@293
|
76 H::b = new float*[ H::m ]; // m x k random biases
|
mas01mc@293
|
77 #else
|
mas01mc@293
|
78 H::A = new float**[ H::L ]; // m x k x d random projectors
|
mas01mc@293
|
79 H::b = new float*[ H::L ]; // m x k random biases
|
mas01mc@293
|
80 #endif
|
mas01mc@293
|
81 H::g = new Uns32T*[ H::L ]; // L x k random projections
|
mas01mc@293
|
82 assert( H::g && H::A && H::b ); // failure
|
mas01mc@293
|
83 #ifdef USE_U_FUNCTIONS
|
mas01mc@293
|
84 // Use m \times u_i functions \in R^{(k/2) \times (d)}
|
mas01mc@293
|
85 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
|
mas01mc@293
|
86 for( j = 0; j < H::m ; j++ ){ // m functions u_i(v)
|
mas01mc@293
|
87 H::A[j] = new float*[ H::k/2 ]; // k/2 x d 2-stable distribution coefficients
|
mas01mc@293
|
88 H::b[j] = new float[ H::k/2 ]; // bias
|
mas01mc@293
|
89 assert( H::A[j] && H::b[j] ); // failure
|
mas01mc@293
|
90 for( kk = 0; kk < H::k/2 ; kk++ ){
|
mas01mc@293
|
91 H::A[j][kk] = new float[ H::d ];
|
mas01mc@293
|
92 assert( H::A[j][kk] ); // failure
|
mas01mc@293
|
93 for(Uns32T i = 0 ; i < H::d ; i++ )
|
mas01mc@293
|
94 H::A[j][kk][i] = H::randn(); // Normal
|
mas01mc@293
|
95 H::b[j][kk] = H::ranf()*H::w; // Uniform
|
mas01mc@293
|
96 }
|
mas01mc@293
|
97 }
|
mas01mc@293
|
98 #else
|
mas01mc@293
|
99 // Use m \times u_i functions \in R^{k \times (d)}
|
mas01mc@293
|
100 // Combine to make L=m(m-1)/2 hash functions \in R^{k \times d}
|
mas01mc@293
|
101 for( j = 0; j < H::L ; j++ ){ // m functions u_i(v)
|
mas01mc@293
|
102 H::A[j] = new float*[ H::k ]; // k x d 2-stable distribution coefficients
|
mas01mc@293
|
103 H::b[j] = new float[ H::k ]; // bias
|
mas01mc@293
|
104 assert( H::A[j] && H::b[j] ); // failure
|
mas01mc@293
|
105 for( kk = 0; kk < H::k ; kk++ ){
|
mas01mc@293
|
106 H::A[j][kk] = new float[ H::d ];
|
mas01mc@293
|
107 assert( H::A[j][kk] ); // failure
|
mas01mc@293
|
108 for(Uns32T i = 0 ; i < H::d ; i++ )
|
mas01mc@293
|
109 H::A[j][kk][i] = H::randn(); // Normal
|
mas01mc@293
|
110 H::b[j][kk] = H::ranf()*H::w; // Uniform
|
mas01mc@293
|
111 }
|
mas01mc@293
|
112 }
|
mas01mc@293
|
113 #endif
|
mas01mc@293
|
114
|
mas01mc@293
|
115 // Storage for LSH hash function output (Uns32T)
|
mas01mc@293
|
116 for( j = 0 ; j < H::L ; j++ ){ // L functions g_j(u_a, u_b) a,b \in nchoosek(m,2)
|
mas01mc@293
|
117 H::g[j] = new Uns32T[ H::k ]; // k x 32-bit hash values, gj(v)=[x0 x1 ... xk-1] xk \in Z
|
mas01mc@293
|
118 assert( H::g[j] );
|
mas01mc@292
|
119 }
|
mas01mc@292
|
120
|
mas01mc@293
|
121 // LSH Hash tables
|
mas01mc@293
|
122 H::h = new bucket**[ H::L ];
|
mas01mc@293
|
123 assert( H::h );
|
mas01mc@292
|
124 for( j = 0 ; j < H::L ; j++ ){
|
mas01mc@292
|
125 H::h[j] = new bucket*[ H::N ];
|
mas01mc@292
|
126 assert( H::h[j] );
|
mas01mc@292
|
127 for( i = 0 ; i < H::N ; i++)
|
mas01mc@292
|
128 H::h[j][i] = 0;
|
mas01mc@292
|
129 }
|
mas01mc@293
|
130
|
mas01mc@293
|
131 // Standard hash functions
|
mas01mc@293
|
132 H::r1 = new Uns32T*[ H::L ];
|
mas01mc@293
|
133 H::r2 = new Uns32T*[ H::L ];
|
mas01mc@293
|
134 assert( H::r1 && H::r2 ); // failure
|
mas01mc@293
|
135 for( j = 0 ; j < H::L ; j++ ){
|
mas01mc@293
|
136 H::r1[ j ] = new Uns32T[ H::k ];
|
mas01mc@293
|
137 H::r2[ j ] = new Uns32T[ H::k ];
|
mas01mc@293
|
138 assert( H::r1[j] && H::r2[j] ); // failure
|
mas01mc@293
|
139 for( i = 0; i<H::k; i++){
|
mas01mc@293
|
140 H::r1[j][i] = randr();
|
mas01mc@293
|
141 H::r2[j][i] = randr();
|
mas01mc@293
|
142 }
|
mas01mc@293
|
143 }
|
mas01mc@293
|
144
|
mas01mc@534
|
145 // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS
|
mas01mc@293
|
146 H::initialize_partial_functions();
|
mas01mc@764
|
147
|
mas01mc@764
|
148 // MultiProbe distance functions, there are 2*k per hashtable
|
mas01mc@764
|
149 H::boundaryDistances = new float*[ H::L ]; // L x 2k boundary distances
|
mas01mc@764
|
150 assert( H::boundaryDistances ); // failure
|
mas01mc@764
|
151 for( j = 0; j < H::L ; j++ ){ // 2*k functions x_i(q)
|
mas01mc@764
|
152 H::boundaryDistances[j] = new float[ 2*H::k ];
|
mas01mc@764
|
153 assert( H::boundaryDistances[j] ); // failure
|
mas01mc@764
|
154 for( kk = 0; kk < 2*H::k ; kk++ )
|
mas01mc@764
|
155 H::boundaryDistances[j][kk] = 0.0f; // initialize with zeros
|
mas01mc@764
|
156 }
|
mas01mc@293
|
157 }
|
mas01mc@293
|
158
|
mas01mc@293
|
159 void H::initialize_partial_functions(){
|
mas01mc@293
|
160
|
mas01mc@293
|
161 #ifdef USE_U_FUNCTIONS
|
mas01mc@293
|
162 H::uu = vector<vector<Uns32T> >(H::m);
|
mas01mc@293
|
163 for( Uns32T aa=0 ; aa < H::m ; aa++ )
|
mas01mc@293
|
164 H::uu[aa] = vector<Uns32T>( H::k/2 );
|
mas01mc@293
|
165 #endif
|
mas01mc@293
|
166 }
|
mas01mc@293
|
167
|
mas01mc@293
|
168
|
mas01mc@293
|
169 // Generate z ~ N(0,1)
|
mas01mc@293
|
170 float H::randn(){
|
mas01mc@293
|
171 // Box-Muller
|
mas01mc@293
|
172 float x1, x2;
|
mas01mc@293
|
173 do{
|
mas01mc@293
|
174 x1 = ranf();
|
mas01mc@293
|
175 } while (x1 == 0); // cannot take log of 0
|
mas01mc@293
|
176 x2 = ranf();
|
mas01mc@293
|
177 float z;
|
mas01mc@293
|
178 z = sqrtf(-2.0 * logf(x1)) * cosf(2.0 * M_PI * x2);
|
mas01mc@293
|
179 return z;
|
mas01mc@293
|
180 }
|
mas01mc@293
|
181
|
mas01mc@293
|
182 float H::ranf(){
|
mas01mc@293
|
183 #ifdef MT19937
|
mas01mc@293
|
184 return (float) genrand_real2();
|
mas01mc@293
|
185 #else
|
mas01mc@293
|
186 return (float)( (double)rand() / ((double)(RAND_MAX)+(double)(1)) );
|
mas01mc@293
|
187 #endif
|
mas01mc@293
|
188 }
|
mas01mc@293
|
189
|
mas01mc@293
|
190 // range is 1..2^29
|
mas01mc@293
|
191 /* FIXME: that looks like an ... odd range. Still. */
|
mas01mc@293
|
192 Uns32T H::randr(){
|
mas01mc@293
|
193 #ifdef MT19937
|
mas01mc@293
|
194 return (Uns32T)((genrand_int32() >> 3) + 1);
|
mas01mc@293
|
195 #else
|
mas01mc@293
|
196 return (Uns32T) ((rand() >> 2) + 1);
|
mas01mc@293
|
197 #endif
|
mas01mc@292
|
198 }
|
mas01mc@292
|
199
|
mas01mc@292
|
200 // Destruct hash tables
|
mas01mc@292
|
201 H::~H(){
|
mas01mc@293
|
202 Uns32T i,j,kk;
|
mas01mc@340
|
203 bucket** pp;
|
mas01mc@293
|
204 #ifdef USE_U_FUNCTIONS
|
mas01mc@293
|
205 for( j = 0 ; j < H::m ; j++ ){
|
mas01mc@293
|
206 for( kk = 0 ; kk < H::k/2 ; kk++ )
|
mas01mc@293
|
207 delete[] A[j][kk];
|
mas01mc@293
|
208 delete[] A[j];
|
mas01mc@293
|
209 }
|
mas01mc@293
|
210 delete[] A;
|
mas01mc@293
|
211 for( j = 0 ; j < H::m ; j++ )
|
mas01mc@293
|
212 delete[] b[j];
|
mas01mc@293
|
213 delete[] b;
|
mas01mc@293
|
214 #else
|
mas01mc@293
|
215 for( j = 0 ; j < H::L ; j++ ){
|
mas01mc@293
|
216 for( kk = 0 ; kk < H::k ; kk++ )
|
mas01mc@293
|
217 delete[] A[j][kk];
|
mas01mc@293
|
218 delete[] A[j];
|
mas01mc@293
|
219 }
|
mas01mc@293
|
220 delete[] A;
|
mas01mc@293
|
221 for( j = 0 ; j < H::L ; j++ )
|
mas01mc@293
|
222 delete[] b[j];
|
mas01mc@293
|
223 delete[] b;
|
mas01mc@293
|
224 #endif
|
mas01mc@293
|
225
|
mas01mc@293
|
226 for( j = 0 ; j < H::L ; j++ )
|
mas01mc@293
|
227 delete[] g[j];
|
mas01mc@293
|
228 delete[] g;
|
mas01mc@292
|
229 for( j=0 ; j < H::L ; j++ ){
|
mas01mc@292
|
230 delete[] H::r1[ j ];
|
mas01mc@292
|
231 delete[] H::r2[ j ];
|
mas01mc@340
|
232 for(i = 0; i< H::N ; i++){
|
mas01mc@340
|
233 pp = 0;
|
mas01mc@340
|
234 #ifdef LSH_CORE_ARRAY
|
mas01mc@340
|
235 if(H::h[ j ][ i ])
|
mas01mc@340
|
236 if(H::h[ j ][ i ]->t2 & LSH_CORE_ARRAY_BIT){
|
mas01mc@340
|
237 pp = get_pointer_to_bucket_linked_list(H::h[ j ][ i ]);
|
mas01mc@340
|
238 if(*pp){
|
mas01mc@344
|
239 (*pp)->snext.ptr=0; // Head of list uses snext as a non-pointer value
|
mas01mc@340
|
240 delete *pp; // Now the destructor can do its work properly
|
mas01mc@340
|
241 }
|
mas01mc@340
|
242 free(H::h[ j ][ i ]->next);
|
mas01mc@340
|
243 H::h[ j ][ i ]->next = 0; // Zero next pointer
|
mas01mc@344
|
244 H::h[ j ][ i ]->snext.ptr = 0; // Zero head-of-list pointer as above
|
mas01mc@340
|
245 }
|
mas01mc@340
|
246 #endif
|
mas01mc@292
|
247 delete H::h[ j ][ i ];
|
mas01mc@340
|
248 }
|
mas01mc@292
|
249 delete[] H::h[ j ];
|
mas01mc@292
|
250 }
|
mas01mc@474
|
251 delete[] H::r1;
|
mas01mc@474
|
252 delete[] H::r2;
|
mas01mc@474
|
253 delete[] H::h;
|
mas01mc@764
|
254
|
mas01mc@764
|
255 // MultiProbe cleanup
|
mas01mc@764
|
256 for( j = 0 ; j < H::L ; j++ )
|
mas01mc@764
|
257 delete[] H::boundaryDistances[j];
|
mas01mc@764
|
258 delete[] H::boundaryDistances;
|
mas01mc@764
|
259 delete multiProbePtr;
|
mas01mc@292
|
260 }
|
mas01mc@292
|
261
|
mas01mc@292
|
262
|
mas01mc@293
|
263 // Compute all hash functions for vector v
|
mas01mc@293
|
264 // #ifdef USE_U_FUNCTIONS use Combination of m \times h_i \in R^{(k/2) \times d}
|
mas01mc@293
|
265 // to make L \times g_j functions \in Z^k
|
mas01mc@293
|
266 void H::compute_hash_functions(vector<float>& v){ // v \in R^d
|
mas01mc@293
|
267 float iw = 1. / H::w; // hash bucket width
|
mas01mc@293
|
268 Uns32T aa, kk;
|
mas01mc@293
|
269 if( v.size() != H::d )
|
mas01mc@293
|
270 error("v.size != H::d","","compute_hash_functions"); // check input vector dimensionality
|
mas01mc@293
|
271 double tmp = 0;
|
mas01mc@764
|
272 float *pA, *pb, *bd;
|
mas01mc@293
|
273 Uns32T *pg;
|
mas01mc@293
|
274 int dd;
|
mas01mc@293
|
275 vector<float>::iterator vi;
|
mas01mc@293
|
276 vector<Uns32T>::iterator ui;
|
mas01mc@293
|
277
|
mas01mc@293
|
278 #ifdef USE_U_FUNCTIONS
|
mas01mc@293
|
279 Uns32T bb;
|
mas01mc@293
|
280 // Store m dot products to expand
|
mas01mc@293
|
281 for( aa=0; aa < H::m ; aa++ ){
|
mas01mc@293
|
282 ui = H::uu[aa].begin();
|
mas01mc@293
|
283 for( kk = 0 ; kk < H::k/2 ; kk++ ){
|
mas01mc@293
|
284 pb = *( H::b + aa ) + kk;
|
mas01mc@293
|
285 pA = * ( * ( H::A + aa ) + kk );
|
mas01mc@293
|
286 dd = H::d;
|
mas01mc@293
|
287 tmp = 0.;
|
mas01mc@293
|
288 vi = v.begin();
|
mas01mc@293
|
289 while( dd-- )
|
mas01mc@293
|
290 tmp += *pA++ * *vi++; // project
|
mas01mc@293
|
291 tmp += *pb; // translate
|
mas01mc@293
|
292 tmp *= iw; // scale
|
mas01mc@293
|
293 *ui++ = (Uns32T) floor(tmp); // floor
|
mas01mc@293
|
294 }
|
mas01mc@293
|
295 }
|
mas01mc@293
|
296 // Binomial combinations of functions u_{a,b} \in Z^{(k/2) \times d}
|
mas01mc@293
|
297 Uns32T j;
|
mas01mc@293
|
298 for( aa=0, j=0 ; aa < H::m-1 ; aa++ )
|
mas01mc@293
|
299 for( bb = aa + 1 ; bb < H::m ; bb++, j++ ){
|
mas01mc@293
|
300 pg= *( H::g + j ); // L \times functions g_j(v) \in Z^k
|
mas01mc@293
|
301 // u_1 \in Z^{(k/2) \times d}
|
mas01mc@293
|
302 ui = H::uu[aa].begin();
|
mas01mc@293
|
303 kk=H::k/2;
|
mas01mc@293
|
304 while( kk-- )
|
mas01mc@293
|
305 *pg++ = *ui++; // hash function g_j(v)=[x1 x2 ... x(k/2)]; xk \in Z
|
mas01mc@293
|
306 // u_2 \in Z^{(k/2) \times d}
|
mas01mc@293
|
307 ui = H::uu[bb].begin();
|
mas01mc@293
|
308 kk=H::k/2;
|
mas01mc@293
|
309 while( kk--)
|
mas01mc@293
|
310 *pg++ = *ui++; // hash function g_j(v)=[x(k/2+1) x(k/2+2) ... xk]; xk \in Z
|
mas01mc@293
|
311 }
|
mas01mc@293
|
312 #else
|
mas01mc@293
|
313 for( aa=0; aa < H::L ; aa++ ){
|
mas01mc@514
|
314 pg= *( H::g + aa ); // L \times functions g_j(v) \in Z^k
|
mas01mc@764
|
315 bd= *( H::boundaryDistances + aa);
|
mas01mc@514
|
316 for( kk = 0 ; kk != H::k ; kk++ ){
|
mas01mc@293
|
317 pb = *( H::b + aa ) + kk;
|
mas01mc@293
|
318 pA = * ( * ( H::A + aa ) + kk );
|
mas01mc@293
|
319 dd = H::d;
|
mas01mc@293
|
320 tmp = 0.;
|
mas01mc@293
|
321 vi = v.begin();
|
mas01mc@293
|
322 while( dd-- )
|
mas01mc@293
|
323 tmp += *pA++ * *vi++; // project
|
mas01mc@293
|
324 tmp += *pb; // translate
|
mas01mc@764
|
325 tmp *= iw; // scale
|
mas01mc@764
|
326 tmp = floor(tmp); // handle negative values
|
mas01mc@764
|
327 while(tmp<0) // wrap around 0 to N
|
mas01mc@764
|
328 tmp += H::N;
|
mas01mc@764
|
329 *pg = (Uns32T) tmp; // hash function g_j(v)=[x1 x2 ... xk]; xk \in Z
|
mas01mc@764
|
330 *bd = (tmp - *pg++);//*w; // boundary distance -1
|
mas01mc@764
|
331 *(bd+1) = (1.0f - *bd); //*w; // boundary distance +1
|
mas01mc@764
|
332 bd+=2;
|
mas01mc@293
|
333 }
|
mas01mc@293
|
334 }
|
mas01mc@293
|
335 #endif
|
mas01mc@293
|
336 }
|
mas01mc@293
|
337
|
mas01mc@292
|
338 // make hash value \in Z
|
mas01mc@293
|
339 void H::generate_hash_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){
|
mas01mc@293
|
340 H::t1 = computeProductModDefaultPrime( g, r1, H::k ) % H::N;
|
mas01mc@293
|
341 H::t2 = computeProductModDefaultPrime( g, r2, H::k );
|
mas01mc@292
|
342 }
|
mas01mc@292
|
343
|
mas01mc@764
|
344 // make hash value by purturbating the given hash functions
|
mas01mc@764
|
345 // according the the boundary distances of the current query
|
mas01mc@764
|
346 void H::generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2){
|
mas01mc@764
|
347 assert(!multiProbePtr->empty()); // Test this for now, until all is stable
|
mas01mc@764
|
348 Uns32T* mpg = new Uns32T[H::k]; // temporary array storage
|
mas01mc@764
|
349
|
mas01mc@764
|
350 // Copy the hash bucket identifiers
|
mas01mc@764
|
351 Uns32T* mpgPtr = mpg;
|
mas01mc@764
|
352 Uns32T kk = H::k;
|
mas01mc@764
|
353 while(kk--)
|
mas01mc@764
|
354 *mpgPtr++ = *g++;
|
mas01mc@764
|
355
|
mas01mc@764
|
356 // Retrieve the next purturbation set
|
mas01mc@764
|
357 perturbation_set ps = multiProbePtr->getNextPerturbationSet();
|
mas01mc@764
|
358 perturbation_set::iterator it = ps.begin();
|
mas01mc@764
|
359
|
mas01mc@764
|
360 // Perturbate the hash functions g
|
mas01mc@764
|
361 while( it != ps.end() ){
|
mas01mc@764
|
362 *(mpg + multiProbePtr->getIndex(it)) += multiProbePtr->getBoundary(it);
|
mas01mc@764
|
363 it++;
|
mas01mc@764
|
364 }
|
mas01mc@764
|
365
|
mas01mc@764
|
366 H::t1 = computeProductModDefaultPrime( mpg, r1, H::k ) % H::N;
|
mas01mc@764
|
367 H::t2 = computeProductModDefaultPrime( mpg, r2, H::k );
|
mas01mc@764
|
368
|
mas01mc@764
|
369 delete[] mpg; // free up temporary storage
|
mas01mc@764
|
370 }
|
mas01mc@764
|
371
|
mas01mc@764
|
372
|
mas01mc@292
|
373 #define CR_ASSERT(b){if(!(b)){fprintf(stderr, "ASSERT failed on line %d, file %s.\n", __LINE__, __FILE__); exit(1);}}
|
mas01mc@292
|
374
|
mas01mc@292
|
375 // Computes (a.b) mod UH_PRIME_DEFAULT
|
mas01mc@293
|
376 inline Uns32T H::computeProductModDefaultPrime(Uns32T *a, Uns32T *b, IntT size){
|
mas01mc@292
|
377 LongUns64T h = 0;
|
mas01mc@292
|
378
|
mas01mc@292
|
379 for(IntT i = 0; i < size; i++){
|
mas01mc@292
|
380 h = h + (LongUns64T)a[i] * (LongUns64T)b[i];
|
mas01mc@292
|
381 h = (h & TWO_TO_32_MINUS_1) + 5 * (h >> 32);
|
mas01mc@292
|
382 if (h >= UH_PRIME_DEFAULT) {
|
mas01mc@292
|
383 h = h - UH_PRIME_DEFAULT;
|
mas01mc@292
|
384 }
|
mas01mc@292
|
385 CR_ASSERT(h < UH_PRIME_DEFAULT);
|
mas01mc@292
|
386 }
|
mas01mc@292
|
387 return h;
|
mas01mc@292
|
388 }
|
mas01mc@292
|
389
|
mas01mc@292
|
390 Uns32T H::bucket_insert_point(bucket **pp){
|
mas01mc@296
|
391 collisionCount = 0;
|
mas01mc@340
|
392 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@292
|
393 if(!*pp){
|
mas01mc@292
|
394 *pp = new bucket();
|
mas01mc@292
|
395 (*pp)->t2 = 0; // Use t2 as a collision counter for the row
|
mas01mc@292
|
396 (*pp)->next = new bucket();
|
mas01mc@292
|
397 }
|
mas01mc@340
|
398 // The list head holds point collision count
|
mas01mc@340
|
399 if( (*pp)->t2 & LSH_CORE_ARRAY_BIT )
|
mas01mc@340
|
400 bucket_insert_point(get_pointer_to_bucket_linked_list(*pp)); // recurse
|
mas01mc@340
|
401 else{
|
mas01mc@340
|
402 collisionCount = (*pp)->t2;
|
mas01mc@340
|
403 if(collisionCount < H::C){ // Block if row is full
|
mas01mc@340
|
404 (*pp)->t2++; // Increment collision counter (numPoints in row)
|
mas01mc@340
|
405 pointCount++;
|
mas01mc@340
|
406 collisionCount++;
|
mas01mc@340
|
407 // Locate the bucket linked list
|
mas01mc@340
|
408 __bucket_insert_point( (*pp)->next );
|
mas01mc@340
|
409 }
|
mas01mc@292
|
410 }
|
mas01mc@340
|
411 #else // NOT USING LSH_LIST_HEAD_COUNTERS
|
mas01mc@340
|
412 if(!*pp)
|
mas01mc@340
|
413 *pp = new bucket();
|
mas01mc@296
|
414 pointCount++;
|
mas01mc@296
|
415 __bucket_insert_point(*pp); // No collision count storage
|
mas01mc@292
|
416 #endif
|
mas01mc@292
|
417 return collisionCount;
|
mas01mc@292
|
418 }
|
mas01mc@292
|
419
|
mas01mc@340
|
420 // insert points into hashtable row collision chain
|
mas01mc@292
|
421 void H::__bucket_insert_point(bucket* p){
|
mas01mc@292
|
422 if(p->t2 == IFLAG){ // initialization flag, is it in the domain of t2?
|
mas01mc@292
|
423 p->t2 = H::t2;
|
mas01mc@292
|
424 bucketCount++; // Record start of new point-locale collision chain
|
mas01mc@344
|
425 p->snext.ptr = new sbucket();
|
mas01mc@344
|
426 __sbucket_insert_point(p->snext.ptr);
|
mas01mc@292
|
427 return;
|
mas01mc@292
|
428 }
|
mas01mc@292
|
429
|
mas01mc@292
|
430 if(p->t2 == H::t2){
|
mas01mc@344
|
431 __sbucket_insert_point(p->snext.ptr);
|
mas01mc@292
|
432 return;
|
mas01mc@292
|
433 }
|
mas01mc@764
|
434
|
mas01mc@764
|
435 // insert bucket before current bucket
|
mas01mc@764
|
436 if(H::t2 < p->t2){
|
mas01mc@764
|
437 bucket* tmp = new bucket();
|
mas01mc@764
|
438 // copy current bucket contents into new bucket
|
mas01mc@764
|
439 tmp->next = p->next;
|
mas01mc@764
|
440 tmp->t2 = p->t2;
|
mas01mc@764
|
441 tmp->snext.ptr = p->snext.ptr;
|
mas01mc@764
|
442 p->next = tmp;
|
mas01mc@764
|
443 p->t2 = IFLAG;
|
mas01mc@764
|
444 p->snext.ptr=0;
|
mas01mc@764
|
445 __bucket_insert_point(p);
|
mas01mc@764
|
446 return;
|
mas01mc@764
|
447 }
|
mas01mc@292
|
448
|
mas01mc@764
|
449 if(p->next)
|
mas01mc@764
|
450 __bucket_insert_point(p->next);
|
mas01mc@340
|
451 else {
|
mas01mc@292
|
452 p->next = new bucket();
|
mas01mc@292
|
453 __bucket_insert_point(p->next);
|
mas01mc@292
|
454 }
|
mas01mc@292
|
455 }
|
mas01mc@292
|
456
|
mas01mc@340
|
457 // insert points into point-locale collision chain
|
mas01mc@292
|
458 void H::__sbucket_insert_point(sbucket* p){
|
mas01mc@292
|
459 if(p->pointID==IFLAG){
|
mas01mc@292
|
460 p->pointID = H::p;
|
mas01mc@292
|
461 return;
|
mas01mc@292
|
462 }
|
mas01mc@340
|
463
|
mas01mc@292
|
464 // Search for pointID
|
mas01mc@340
|
465 if(p->snext){
|
mas01mc@292
|
466 __sbucket_insert_point(p->snext);
|
mas01mc@292
|
467 }
|
mas01mc@292
|
468 else{
|
mas01mc@340
|
469 // Make new point collision bucket at end of list
|
mas01mc@340
|
470 p->snext = new sbucket();
|
mas01mc@340
|
471 __sbucket_insert_point(p->snext);
|
mas01mc@292
|
472 }
|
mas01mc@292
|
473 }
|
mas01mc@292
|
474
|
mas01mc@293
|
475 inline bucket** H::get_bucket(int j){
|
mas01mc@292
|
476 return *(h+j);
|
mas01mc@292
|
477 }
|
mas01mc@292
|
478
|
mas01mc@340
|
479 // Find the linked-list pointer at the end of the CORE_ARRAY
|
mas01mc@340
|
480 bucket** H::get_pointer_to_bucket_linked_list(bucket* rowPtr){
|
mas01mc@344
|
481 Uns32T numBuckets = rowPtr->snext.numBuckets; // Cast pointer to unsigned int
|
mas01mc@343
|
482 Uns32T numPoints = rowPtr->t2 & 0x7FFFFFFF; // Value is stored in low 31 bits of t2 field
|
mas01mc@343
|
483 bucket** listPtr = reinterpret_cast<bucket**> (reinterpret_cast<unsigned int*>(rowPtr->next)+numPoints+numBuckets+1);
|
mas01mc@343
|
484 return listPtr;
|
mas01mc@340
|
485 }
|
mas01mc@340
|
486
|
mas01mc@293
|
487 // Interface to Locality Sensitive Hashing G
|
mas01mc@293
|
488 G::G(float ww, Uns32T kk,Uns32T mm, Uns32T dd, Uns32T NN, Uns32T CC, float rr):
|
mas01mc@293
|
489 H(kk,mm,dd,NN,CC,ww,rr), // constructor to initialize data structures
|
mas01mc@308
|
490 indexName(0),
|
mas01mc@293
|
491 lshHeader(0),
|
mas01mc@292
|
492 calling_instance(0),
|
mas01mc@293
|
493 add_point_callback(0)
|
mas01mc@292
|
494 {
|
mas01mc@293
|
495
|
mas01mc@292
|
496 }
|
mas01mc@292
|
497
|
mas01mc@292
|
498 // Serialize from file LSH constructor
|
mas01mc@292
|
499 // Read parameters from database file
|
mas01mc@292
|
500 // Load the hash functions, close the database
|
mas01mc@292
|
501 // Optionally load the LSH tables into head-allocated lists in core
|
mas01mc@292
|
502 G::G(char* filename, bool lshInCoreFlag):
|
mas01mc@293
|
503 H(), // default base-class constructor call delays data-structure initialization
|
mas01mc@309
|
504 indexName(0),
|
mas01mc@293
|
505 lshHeader(0),
|
mas01mc@292
|
506 calling_instance(0),
|
mas01mc@292
|
507 add_point_callback(0)
|
mas01mc@292
|
508 {
|
mas01mc@474
|
509 FILE* dbFile = 0;
|
mas01mc@292
|
510 int dbfid = unserialize_lsh_header(filename);
|
mas01mc@764
|
511
|
mas01mc@309
|
512 indexName = new char[O2_INDEX_MAXSTR];
|
mas01mc@309
|
513 strncpy(indexName, filename, O2_INDEX_MAXSTR); // COPY THE CONTENTS TO THE NEW POINTER
|
mas01mc@293
|
514 H::initialize_lsh_functions(); // Base-class data-structure initialization
|
mas01mc@293
|
515 unserialize_lsh_functions(dbfid); // populate with on-disk hashfunction values
|
mas01mc@292
|
516
|
mas01mc@292
|
517 // Format1 only needs unserializing if specifically requested
|
mas01mc@292
|
518 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2) && lshInCoreFlag){
|
mas01mc@292
|
519 unserialize_lsh_hashtables_format1(dbfid);
|
mas01mc@292
|
520 }
|
mas01mc@292
|
521
|
mas01mc@292
|
522 // Format2 always needs unserializing
|
mas01mc@292
|
523 if(lshHeader->flags&O2_SERIAL_FILEFORMAT2 && lshInCoreFlag){
|
mas01mc@474
|
524 dbFile = fdopen(dbfid, "rb");
|
mas01mc@336
|
525 if(!dbFile)
|
mas01mc@336
|
526 error("Cannot open LSH file for reading", filename);
|
mas01mc@336
|
527 unserialize_lsh_hashtables_format2(dbFile);
|
mas01mc@292
|
528 }
|
mas01mc@336
|
529 serial_close(dbfid);
|
mas01mc@474
|
530 if(dbFile){
|
mas01mc@474
|
531 fclose(dbFile);
|
mas01mc@474
|
532 dbFile = 0;
|
mas01mc@474
|
533 }
|
mas01mc@336
|
534 }
|
mas01mc@292
|
535
|
mas01mc@292
|
536 G::~G(){
|
mas01mc@292
|
537 delete lshHeader;
|
mas01mc@474
|
538 delete[] indexName;
|
mas01mc@292
|
539 }
|
mas01mc@292
|
540
|
mas01mc@292
|
541 // single point insertion; inserted values are hash value and pointID
|
mas01mc@292
|
542 Uns32T G::insert_point(vector<float>& v, Uns32T pp){
|
mas01mc@292
|
543 Uns32T collisionCount = 0;
|
mas01mc@292
|
544 H::p = pp;
|
mas01mc@299
|
545 if(H::maxp && pp<=H::maxp)
|
mas01mc@296
|
546 error("points must be indexed in strict ascending order", "LSH::insert_point(vector<float>&, Uns32T pointID)");
|
mas01mc@296
|
547 H::maxp=pp; // Store highest pointID in database
|
mas01mc@293
|
548 H::compute_hash_functions( v );
|
mas01mc@292
|
549 for(Uns32T j = 0 ; j < H::L ; j++ ){ // insertion
|
mas01mc@293
|
550 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
|
mas01mc@292
|
551 collisionCount += bucket_insert_point( *(h + j) + t1 );
|
mas01mc@292
|
552 }
|
mas01mc@292
|
553 return collisionCount;
|
mas01mc@292
|
554 }
|
mas01mc@292
|
555
|
mas01mc@292
|
556
|
mas01mc@292
|
557 // batch insert for a point set
|
mas01mc@292
|
558 // inserted values are vector hash value and pointID starting at basePointID
|
mas01mc@292
|
559 void G::insert_point_set(vector<vector<float> >& vv, Uns32T basePointID){
|
mas01mc@292
|
560 for(Uns32T point=0; point<vv.size(); point++)
|
mas01mc@292
|
561 insert_point(vv[point], basePointID+point);
|
mas01mc@292
|
562 }
|
mas01mc@292
|
563
|
mas01mc@292
|
564 // point retrieval routine
|
mas01mc@292
|
565 void G::retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
|
mas01mc@764
|
566 // assert(LSH_MULTI_PROBE_COUNT);
|
mas01mc@292
|
567 calling_instance = caller;
|
mas01mc@292
|
568 add_point_callback = add_point;
|
mas01mc@293
|
569 H::compute_hash_functions( v );
|
mas01mc@292
|
570 for(Uns32T j = 0 ; j < H::L ; j++ ){
|
mas01mc@764
|
571 // MultiProbe loop
|
mas01mc@764
|
572 multiProbePtr->generatePerturbationSets( *( H::boundaryDistances + j ) , 2*H::k, (unsigned)LSH_MULTI_PROBE_COUNT);
|
mas01mc@764
|
573 for(Uns32T multiProbeIdx = 0 ; multiProbeIdx < multiProbePtr->size()+1 ; multiProbeIdx++ ){
|
mas01mc@764
|
574 if(!multiProbeIdx)
|
mas01mc@764
|
575 H::generate_hash_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
|
mas01mc@764
|
576 else
|
mas01mc@764
|
577 H::generate_multiprobe_keys( *( H::g + j ), *( H::r1 + j ), *( H::r2 + j ) );
|
mas01mc@764
|
578 if( bucket* bPtr = *(get_bucket(j) + get_t1()) ) {
|
mas01mc@340
|
579 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@764
|
580 if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
|
mas01mc@764
|
581 retrieve_from_core_hashtable_array((Uns32T*)(bPtr->next), qpos);
|
mas01mc@764
|
582 } else {
|
mas01mc@764
|
583 bucket_chain_point( bPtr->next, qpos);
|
mas01mc@764
|
584 }
|
mas01mc@764
|
585 #else
|
mas01mc@764
|
586 bucket_chain_point( bPtr , qpos);
|
mas01mc@764
|
587 #endif
|
mas01cr@370
|
588 }
|
mas01cr@370
|
589 }
|
mas01mc@292
|
590 }
|
mas01mc@292
|
591 }
|
mas01mc@292
|
592
|
mas01mc@292
|
593 void G::retrieve_point_set(vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller){
|
mas01mc@292
|
594 for(Uns32T qpos = 0 ; qpos < vv.size() ; qpos++ )
|
mas01mc@292
|
595 retrieve_point(vv[qpos], qpos, add_point, caller);
|
mas01mc@292
|
596 }
|
mas01mc@292
|
597
|
mas01mc@292
|
598 // export lsh tables to table structure on disk
|
mas01mc@292
|
599 //
|
mas01mc@292
|
600 // LSH TABLE STRUCTURE
|
mas01mc@292
|
601 // ---header 64 bytes ---
|
mas01mc@292
|
602 // [magic #tables #rows #cols elementSize databaseSize version flags dim #funs 0 0 0 0 0 0]
|
mas01mc@292
|
603 //
|
mas01mc@292
|
604 // ---random projections L x k x d float ---
|
mas01mc@292
|
605 // A[0][0][0] A[0][0][1] ... A[0][0][d-1]
|
mas01mc@292
|
606 // A[0][1][0] A[0][1][1] ... A[1][1][d-1]
|
mas01mc@292
|
607 // ...
|
mas01mc@292
|
608 // A[0][K-1][0] A[0][1][1] ... A[0][k-1][d-1]
|
mas01mc@292
|
609 // ...
|
mas01mc@292
|
610 // ...
|
mas01mc@292
|
611 // A[L-1][0][0] A[M-1][0][1] ... A[L-1][0][d-1]
|
mas01mc@292
|
612 // A[L-1][1][0] A[M-1][1][1] ... A[L-1][1][d-1]
|
mas01mc@292
|
613 // ...
|
mas01mc@292
|
614 // A[L-1][k-1][0] A[M-1][1][1] ... A[L-1][k-1][d-1]
|
mas01mc@292
|
615 //
|
mas01mc@292
|
616 // ---bias L x k float ---
|
mas01mc@292
|
617 // b[0][0] b[0][1] ... b[0][k-1]
|
mas01mc@292
|
618 // b[1][0] b[1][1] ... b[1][k-1]
|
mas01mc@292
|
619 // ...
|
mas01mc@292
|
620 // b[L-1][0] b[L-1][1] ... b[L-1][k-1]
|
mas01mc@292
|
621 //
|
mas01mc@292
|
622 // ---random r1 L x k float ---
|
mas01mc@292
|
623 // r1[0][0] r1[0][1] ... r1[0][k-1]
|
mas01mc@292
|
624 // r1[1][0] r1[1][1] ... r1[1][k-1]
|
mas01mc@292
|
625 // ...
|
mas01mc@292
|
626 // r1[L-1][0] r1[L-1][1] ... r1[L-1][k-1]
|
mas01mc@292
|
627 //
|
mas01mc@292
|
628 // ---random r2 L x k float ---
|
mas01mc@292
|
629 // r2[0][0] r2[0][1] ... r2[0][k-1]
|
mas01mc@292
|
630 // r2[1][0] r2[1][1] ... r2[1][k-1]
|
mas01mc@292
|
631 // ...
|
mas01mc@292
|
632 // r2[L-1][0] r2[L-1][1] ... r2[L-1][k-1]
|
mas01mc@292
|
633 //
|
mas01mc@293
|
634 // ******* HASHTABLES FORMAT1 (optimized for LSH_ON_DISK retrieval) *******
|
mas01mc@292
|
635 // ---hash table 0: N x C x 8 ---
|
mas01mc@292
|
636 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
637 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
638 // ...
|
mas01mc@292
|
639 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
640 //
|
mas01mc@292
|
641 // ---hash table 1: N x C x 8 ---
|
mas01mc@292
|
642 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
643 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
644 // ...
|
mas01mc@292
|
645 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
646 //
|
mas01mc@292
|
647 // ...
|
mas01mc@292
|
648 //
|
mas01mc@292
|
649 // ---hash table L-1: N x C x 8 ---
|
mas01mc@292
|
650 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
651 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
652 // ...
|
mas01mc@292
|
653 // [t2 pointID][t2 pointID]...[t2 pointID]
|
mas01mc@292
|
654 //
|
mas01mc@293
|
655 // ******* HASHTABLES FORMAT2 (optimized for LSH_IN_CORE retrieval) *******
|
mas01mc@293
|
656 //
|
mas01mc@293
|
657 // State machine controlled by regular expression.
|
mas01mc@293
|
658 // legend:
|
mas01mc@293
|
659 //
|
mas01mc@306
|
660 // O2_SERIAL_TOKEN_T1 = 0xFFFFFFFCU
|
mas01mc@306
|
661 // O2_SERIAL_TOKEN_T2 = 0xFFFFFFFDU
|
mas01mc@306
|
662 // O2_SERIAL_TOKEN_ENDTABLE = 0xFFFFFFFEU
|
mas01mc@293
|
663 //
|
mas01mc@306
|
664 // T1 - T1 hash token
|
mas01mc@306
|
665 // t1 - t1 hash key (t1 range 0..2^29-1)
|
mas01mc@764
|
666 // %buckets+points% numElements in row for ARRAY encoding
|
mas01mc@306
|
667 // T2 - T2 token
|
mas01mc@293
|
668 // t2 - t2 hash key (range 1..2^32-6)
|
mas01mc@293
|
669 // p - point identifier (range 0..2^32-1)
|
mas01mc@306
|
670 // E - end hash table token
|
mas01mc@293
|
671 // {...} required arguments
|
mas01mc@293
|
672 // [...] optional arguments
|
mas01mc@293
|
673 // * - match zero or more occurences
|
mas01mc@293
|
674 // + - match one or more occurences
|
mas01mc@293
|
675 // {...}^L - repeat argument L times
|
mas01mc@293
|
676 //
|
mas01mc@293
|
677 // FORMAT2 Regular expression:
|
mas01mc@764
|
678 // { [T1 t1 %buckets+points% [T2 t2 p+]+ ]* E }^L
|
mas01mc@293
|
679 //
|
mas01mc@292
|
680
|
mas01mc@292
|
681 // Serial header constructors
|
mas01mc@292
|
682 SerialHeader::SerialHeader(){;}
|
mas01mc@296
|
683 SerialHeader::SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float r, Uns32T p, Uns32T FMT, Uns32T pc):
|
mas01mc@292
|
684 lshMagic(O2_SERIAL_MAGIC),
|
mas01mc@292
|
685 binWidth(W),
|
mas01mc@292
|
686 numTables(L),
|
mas01mc@292
|
687 numRows(N),
|
mas01mc@292
|
688 numCols(C),
|
mas01mc@292
|
689 elementSize(O2_SERIAL_ELEMENT_SIZE),
|
mas01mc@296
|
690 version(O2_SERIAL_VERSION),
|
mas01mc@296
|
691 size(0), // we are deprecating this value
|
mas01mc@292
|
692 flags(FMT),
|
mas01mc@292
|
693 dataDim(d),
|
mas01mc@292
|
694 numFuns(k),
|
mas01mc@292
|
695 radius(r),
|
mas01mc@296
|
696 maxp(p),
|
mas01mc@296
|
697 size_long((unsigned long long)L * align_up((unsigned long long)N * C * O2_SERIAL_ELEMENT_SIZE, get_page_logn()) // hash tables
|
mas01mc@296
|
698 + align_up(O2_SERIAL_HEADER_SIZE + // header + hash functions
|
mas01mc@296
|
699 (unsigned long long)L*k*( sizeof(float)*d+2*sizeof(Uns32T)+sizeof(float)),get_page_logn())),
|
mas01mc@296
|
700 pointCount(pc){
|
mas01mc@296
|
701
|
mas01mc@296
|
702 if(FMT==O2_SERIAL_FILEFORMAT2)
|
mas01mc@296
|
703 size_long = (unsigned long long)align_up(O2_SERIAL_HEADER_SIZE
|
mas01mc@296
|
704 + (unsigned long long)L*k*(sizeof(float)*d+2+sizeof(Uns32T)
|
mas01mc@296
|
705 +sizeof(float)) + (unsigned long long)pc*16UL,get_page_logn());
|
mas01mc@296
|
706 } // header
|
mas01mc@292
|
707
|
mas01mc@292
|
708 float* G::get_serial_hashfunction_base(char* db){
|
mas01mc@292
|
709 if(db&&lshHeader)
|
mas01mc@292
|
710 return (float*)(db+O2_SERIAL_HEADER_SIZE);
|
mas01mc@292
|
711 else return NULL;
|
mas01mc@292
|
712 }
|
mas01mc@292
|
713
|
mas01mc@292
|
714 SerialElementT* G::get_serial_hashtable_base(char* db){
|
mas01mc@292
|
715 if(db&&lshHeader)
|
mas01mc@292
|
716 return (SerialElementT*)(db+get_serial_hashtable_offset());
|
mas01mc@292
|
717 else
|
mas01mc@292
|
718 return NULL;
|
mas01mc@292
|
719 }
|
mas01mc@292
|
720
|
mas01mc@292
|
721 Uns32T G::get_serial_hashtable_offset(){
|
mas01mc@292
|
722 if(lshHeader)
|
mas01mc@292
|
723 return align_up(O2_SERIAL_HEADER_SIZE +
|
mas01mc@292
|
724 L*lshHeader->numFuns*( sizeof(float)*lshHeader->dataDim+2*sizeof(Uns32T)+sizeof(float)),get_page_logn());
|
mas01mc@292
|
725 else
|
mas01mc@292
|
726 return 0;
|
mas01mc@292
|
727 }
|
mas01mc@292
|
728
|
mas01mc@292
|
729 void G::serialize(char* filename, Uns32T serialFormat){
|
mas01mc@292
|
730 int dbfid;
|
mas01mc@764
|
731 char* db;
|
mas01mc@292
|
732 int dbIsNew=0;
|
mas01mc@474
|
733 FILE* dbFile = 0;
|
mas01mc@292
|
734 // Check requested serialFormat
|
mas01mc@292
|
735 if(!(serialFormat==O2_SERIAL_FILEFORMAT1 || serialFormat==O2_SERIAL_FILEFORMAT2))
|
mas01mc@292
|
736 error("Unrecognized serial file format request: ", "serialize()");
|
mas01mc@296
|
737
|
mas01mc@292
|
738 // Test to see if file exists
|
mas01cr@370
|
739 if((dbfid = open (filename, O_RDONLY)) < 0) {
|
mas01mc@292
|
740 // If it doesn't, then create the file (CREATE)
|
mas01cr@370
|
741 if(errno == ENOENT) {
|
mas01mc@292
|
742 // Create the file
|
mas01mc@292
|
743 std::cout << "Creating new serialized LSH database:" << filename << "...";
|
mas01mc@292
|
744 std::cout.flush();
|
mas01mc@292
|
745 serial_create(filename, serialFormat);
|
mas01mc@292
|
746 dbIsNew=1;
|
mas01cr@370
|
747 } else {
|
mas01mc@292
|
748 // The file can't be opened
|
mas01mc@292
|
749 error("Can't open the file", filename, "open");
|
mas01cr@370
|
750 }
|
mas01cr@370
|
751 }
|
mas01mc@292
|
752
|
mas01mc@292
|
753 // Load the on-disk header into core
|
mas01mc@292
|
754 dbfid = serial_open(filename, 1); // open for write
|
mas01mc@764
|
755 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
|
mas01mc@764
|
756 serial_get_header(db); // read header
|
mas01mc@764
|
757 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
|
mas01mc@292
|
758
|
mas01mc@292
|
759 // Check compatibility of core and disk data structures
|
mas01mc@292
|
760 if( !serial_can_merge(serialFormat) )
|
mas01mc@292
|
761 error("Incompatible core and serial LSH, data structure dimensions mismatch.");
|
mas01mc@292
|
762
|
mas01mc@292
|
763 // For new LSH databases write the hashfunctions
|
mas01mc@292
|
764 if(dbIsNew)
|
mas01mc@292
|
765 serialize_lsh_hashfunctions(dbfid);
|
mas01mc@292
|
766 // Write the hashtables in the requested format
|
mas01mc@292
|
767 if(serialFormat == O2_SERIAL_FILEFORMAT1)
|
mas01mc@292
|
768 serialize_lsh_hashtables_format1(dbfid, !dbIsNew);
|
mas01mc@336
|
769 else{
|
mas01mc@474
|
770 dbFile = fdopen(dbfid, "r+b");
|
mas01mc@336
|
771 if(!dbFile)
|
mas01mc@336
|
772 error("Cannot open LSH file for writing",filename);
|
mas01mc@336
|
773 serialize_lsh_hashtables_format2(dbFile, !dbIsNew);
|
mas01mc@336
|
774 fflush(dbFile);
|
mas01mc@336
|
775 }
|
mas01mc@292
|
776
|
mas01mc@336
|
777 if(!dbIsNew) {
|
mas01mc@764
|
778 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);// get database pointer
|
mas01mc@764
|
779 //serial_get_header(db); // read header
|
mas01mc@293
|
780 cout << "maxp = " << H::maxp << endl;
|
mas01mc@293
|
781 lshHeader->maxp=H::maxp;
|
mas01mc@292
|
782 // Default to FILEFORMAT1
|
mas01mc@292
|
783 if(!(lshHeader->flags&O2_SERIAL_FILEFORMAT2))
|
mas01mc@294
|
784 lshHeader->flags|=O2_SERIAL_FILEFORMAT1;
|
mas01mc@764
|
785 memcpy((char*)db, (char*)lshHeader, sizeof(SerialHeaderT));
|
mas01mc@764
|
786 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
|
mas01mc@336
|
787 }
|
mas01mc@336
|
788 serial_close(dbfid);
|
mas01mc@474
|
789 if(dbFile){
|
mas01mc@474
|
790 fclose(dbFile);
|
mas01mc@474
|
791 dbFile = 0;
|
mas01mc@474
|
792 }
|
mas01mc@292
|
793 }
|
mas01mc@292
|
794
|
mas01mc@292
|
795 // Test to see if core structure and requested format is
|
mas01mc@292
|
796 // compatible with currently opened database
|
mas01mc@292
|
797 int G::serial_can_merge(Uns32T format){
|
mas01mc@292
|
798 SerialHeaderT* that = lshHeader;
|
mas01mc@292
|
799 if( (format==O2_SERIAL_FILEFORMAT2 && !that->flags&O2_SERIAL_FILEFORMAT2)
|
mas01mc@292
|
800 || (format!=O2_SERIAL_FILEFORMAT2 && that->flags&O2_SERIAL_FILEFORMAT2)
|
mas01mc@292
|
801 || !( this->w == that->binWidth &&
|
mas01mc@296
|
802 this->L == that->numTables &&
|
mas01mc@296
|
803 this->N == that->numRows &&
|
mas01mc@296
|
804 this->k == that->numFuns &&
|
mas01mc@296
|
805 this->d == that->dataDim &&
|
mas01mc@296
|
806 sizeof(SerialElementT) == that->elementSize &&
|
mas01mc@296
|
807 this->radius == that->radius)){
|
mas01mc@292
|
808 serial_print_header(format);
|
mas01mc@292
|
809 return 0;
|
mas01mc@292
|
810 }
|
mas01mc@292
|
811 else
|
mas01mc@292
|
812 return 1;
|
mas01mc@292
|
813 }
|
mas01mc@292
|
814
|
mas01mc@292
|
815 // Used as an error message for serial_can_merge()
|
mas01mc@292
|
816 void G::serial_print_header(Uns32T format){
|
mas01mc@292
|
817 std::cout << "Fc:" << format << " Fs:" << lshHeader->flags << endl;
|
mas01mc@292
|
818 std::cout << "Wc:" << w << " Ls:" << lshHeader->binWidth << endl;
|
mas01mc@292
|
819 std::cout << "Lc:" << L << " Ls:" << lshHeader->numTables << endl;
|
mas01mc@292
|
820 std::cout << "Nc:" << N << " Ns:" << lshHeader->numRows << endl;
|
mas01mc@292
|
821 std::cout << "kc:" << k << " ks:" << lshHeader->numFuns << endl;
|
mas01mc@292
|
822 std::cout << "dc:" << d << " ds:" << lshHeader->dataDim << endl;
|
mas01mc@292
|
823 std::cout << "sc:" << sizeof(SerialElementT) << " ss:" << lshHeader->elementSize << endl;
|
mas01mc@292
|
824 std::cout << "rc:" << this->radius << " rs:" << lshHeader->radius << endl;
|
mas01mc@292
|
825 }
|
mas01mc@292
|
826
|
mas01mc@292
|
827 int G::serialize_lsh_hashfunctions(int fid){
|
mas01mc@292
|
828 float* pf;
|
mas01mc@292
|
829 Uns32T *pu;
|
mas01mc@292
|
830 Uns32T x,y,z;
|
mas01mc@292
|
831
|
mas01mc@764
|
832 char* db = serial_mmap(fid, get_serial_hashtable_offset(), 1);// get database pointer
|
mas01mc@764
|
833 pf = get_serial_hashfunction_base(db);
|
mas01mc@764
|
834
|
mas01mc@292
|
835 // HASH FUNCTIONS
|
mas01mc@292
|
836 // Write the random projectors A[][][]
|
mas01mc@292
|
837 #ifdef USE_U_FUNCTIONS
|
mas01mc@292
|
838 for( x = 0 ; x < H::m ; x++ )
|
mas01mc@292
|
839 for( y = 0 ; y < H::k/2 ; y++ )
|
mas01mc@292
|
840 #else
|
mas01cr@648
|
841 for( x = 0 ; x < H::L ; x++ )
|
mas01cr@648
|
842 for( y = 0 ; y < H::k ; y++ )
|
mas01mc@292
|
843 #endif
|
mas01cr@648
|
844 for( z = 0 ; z < d ; z++ )
|
mas01cr@648
|
845 *pf++ = H::A[x][y][z];
|
mas01mc@292
|
846
|
mas01mc@292
|
847 // Write the random biases b[][]
|
mas01mc@292
|
848 #ifdef USE_U_FUNCTIONS
|
mas01mc@292
|
849 for( x = 0 ; x < H::m ; x++ )
|
mas01mc@292
|
850 for( y = 0 ; y < H::k/2 ; y++ )
|
mas01mc@292
|
851 #else
|
mas01cr@648
|
852 for( x = 0 ; x < H::L ; x++ )
|
mas01cr@648
|
853 for( y = 0 ; y < H::k ; y++ )
|
mas01mc@292
|
854 #endif
|
mas01cr@648
|
855 *pf++ = H::b[x][y];
|
mas01mc@292
|
856
|
mas01mc@292
|
857 pu = (Uns32T*)pf;
|
mas01mc@292
|
858
|
mas01mc@292
|
859 // Write the Z projectors r1[][]
|
mas01mc@292
|
860 for( x = 0 ; x < H::L ; x++)
|
mas01mc@292
|
861 for( y = 0 ; y < H::k ; y++)
|
mas01mc@293
|
862 *pu++ = H::r1[x][y];
|
mas01mc@292
|
863
|
mas01mc@292
|
864 // Write the Z projectors r2[][]
|
mas01mc@292
|
865 for( x = 0 ; x < H::L ; x++)
|
mas01mc@292
|
866 for( y = 0; y < H::k ; y++)
|
mas01mc@293
|
867 *pu++ = H::r2[x][y];
|
mas01cr@648
|
868
|
mas01mc@764
|
869 serial_munmap(db, get_serial_hashtable_offset());
|
mas01mc@292
|
870 return 1;
|
mas01mc@292
|
871 }
|
mas01mc@292
|
872
|
mas01mc@292
|
873 int G::serialize_lsh_hashtables_format1(int fid, int merge){
|
mas01mc@292
|
874 SerialElementT *pe, *pt;
|
mas01mc@292
|
875 Uns32T x,y;
|
mas01mc@292
|
876
|
mas01mc@292
|
877 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT1) )
|
mas01mc@292
|
878 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
|
mas01mc@292
|
879
|
mas01mc@764
|
880 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
|
mas01mc@292
|
881 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount;
|
mas01mc@292
|
882 // Write the hash tables
|
mas01mc@292
|
883 for( x = 0 ; x < H::L ; x++ ){
|
mas01mc@292
|
884 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT1...";
|
mas01mc@292
|
885 std::cout.flush();
|
mas01mc@764
|
886 // memory map a single hash table for sequential access
|
mas01mc@764
|
887 // Align each hash table to page boundary
|
mas01mc@764
|
888 char* dbtable = serial_mmap(fid, hashTableSize, 1,
|
mas01mc@764
|
889 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
|
mas01mc@764
|
890 #ifdef __CYGWIN__
|
mas01mc@764
|
891 // No madvise in CYGWIN
|
mas01mc@764
|
892 #else
|
mas01mc@764
|
893 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
|
mas01mc@764
|
894 error("could not advise hashtable memory","","madvise");
|
mas01mc@764
|
895 #endif
|
mas01mc@292
|
896 maxColCount=0;
|
mas01mc@292
|
897 minColCount=O2_SERIAL_MAX_COLS;
|
mas01mc@292
|
898 meanColCount=0;
|
mas01mc@292
|
899 colCountN=0;
|
mas01mc@764
|
900 pt=(SerialElementT*)dbtable;
|
mas01mc@292
|
901 for( y = 0 ; y < H::N ; y++ ){
|
mas01mc@292
|
902 // Move disk pointer to beginning of row
|
mas01mc@292
|
903 pe=pt+y*lshHeader->numCols;
|
mas01mc@292
|
904
|
mas01mc@292
|
905 colCount=0;
|
mas01cr@370
|
906 if(bucket* bPtr = h[x][y]) {
|
mas01cr@370
|
907 if(merge) {
|
mas01mc@340
|
908 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@292
|
909 serial_merge_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
|
mas01cr@370
|
910 } else {
|
mas01mc@292
|
911 serial_write_hashtable_row_format1(pe, bPtr->next, colCount); // skip collision counter bucket
|
mas01mc@292
|
912 #else
|
mas01cr@370
|
913 serial_merge_hashtable_row_format1(pe, bPtr, colCount);
|
mas01cr@370
|
914 } else {
|
mas01cr@370
|
915 serial_write_hashtable_row_format1(pe, bPtr, colCount);
|
mas01mc@292
|
916 #endif
|
mas01cr@370
|
917 }
|
mas01cr@370
|
918 }
|
mas01mc@292
|
919 if(colCount){
|
mas01mc@292
|
920 if(colCount<minColCount)
|
mas01mc@292
|
921 minColCount=colCount;
|
mas01mc@292
|
922 if(colCount>maxColCount)
|
mas01mc@292
|
923 maxColCount=colCount;
|
mas01mc@292
|
924 meanColCount+=colCount;
|
mas01mc@292
|
925 colCountN++;
|
mas01mc@292
|
926 }
|
mas01mc@292
|
927 }
|
mas01mc@292
|
928 if(colCountN)
|
mas01mc@292
|
929 std::cout << "#rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
|
mas01mc@292
|
930 << ", min = " << minColCount << ", max = " << maxColCount
|
mas01mc@292
|
931 << endl;
|
mas01mc@764
|
932 serial_munmap(dbtable, hashTableSize);
|
mas01mc@292
|
933 }
|
mas01mc@292
|
934
|
mas01mc@292
|
935 // We're done writing
|
mas01mc@292
|
936 return 1;
|
mas01mc@292
|
937 }
|
mas01mc@292
|
938
|
mas01mc@292
|
939 void G::serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* b, Uns32T& colCount){
|
mas01mc@292
|
940 while(b && b->t2!=IFLAG){
|
mas01mc@292
|
941 SerialElementT*pe=pr; // reset disk pointer to beginning of row
|
mas01mc@344
|
942 serial_merge_element_format1(pe, b->snext.ptr, b->t2, colCount);
|
mas01mc@292
|
943 b=b->next;
|
mas01mc@292
|
944 }
|
mas01mc@292
|
945 }
|
mas01mc@292
|
946
|
mas01mc@292
|
947 void G::serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
|
mas01mc@292
|
948 while(sb){
|
mas01mc@292
|
949 if(colCount==lshHeader->numCols){
|
mas01mc@292
|
950 std::cout << "!point-chain full " << endl;
|
mas01mc@292
|
951 return;
|
mas01mc@292
|
952 }
|
mas01mc@292
|
953 Uns32T c=0;
|
mas01mc@292
|
954 // Merge collision chains
|
mas01mc@292
|
955 while(c<lshHeader->numCols){
|
mas01mc@292
|
956 if( (pe+c)->hashValue==IFLAG){
|
mas01mc@292
|
957 (pe+c)->hashValue=t2;
|
mas01mc@292
|
958 (pe+c)->pointID=sb->pointID;
|
mas01mc@292
|
959 colCount=c+1;
|
mas01mc@292
|
960 if(c+1<lshHeader->numCols)
|
mas01mc@292
|
961 (pe+c+1)->hashValue=IFLAG;
|
mas01mc@292
|
962 break;
|
mas01mc@292
|
963 }
|
mas01mc@292
|
964 c++;
|
mas01mc@292
|
965 }
|
mas01mc@292
|
966 sb=sb->snext;
|
mas01mc@292
|
967 }
|
mas01mc@292
|
968 return;
|
mas01mc@292
|
969 }
|
mas01mc@292
|
970
|
mas01mc@292
|
971 void G::serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* b, Uns32T& colCount){
|
mas01mc@292
|
972 pe->hashValue=IFLAG;
|
mas01mc@292
|
973 while(b && b->t2!=IFLAG){
|
mas01mc@344
|
974 serial_write_element_format1(pe, b->snext.ptr, b->t2, colCount);
|
mas01mc@292
|
975 b=b->next;
|
mas01mc@292
|
976 }
|
mas01mc@292
|
977 }
|
mas01mc@292
|
978
|
mas01mc@292
|
979 void G::serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount){
|
mas01mc@292
|
980 while(sb){
|
mas01mc@292
|
981 if(colCount==lshHeader->numCols){
|
mas01mc@292
|
982 std::cout << "!point-chain full " << endl;
|
mas01mc@292
|
983 return;
|
mas01mc@292
|
984 }
|
mas01mc@292
|
985 pe->hashValue=t2;
|
mas01mc@292
|
986 pe->pointID=sb->pointID;
|
mas01mc@292
|
987 pe++;
|
mas01mc@292
|
988 colCount++;
|
mas01mc@292
|
989 sb=sb->snext;
|
mas01mc@292
|
990 }
|
mas01mc@292
|
991 pe->hashValue=IFLAG;
|
mas01mc@292
|
992 return;
|
mas01mc@292
|
993 }
|
mas01mc@292
|
994
|
mas01mc@336
|
995 int G::serialize_lsh_hashtables_format2(FILE* dbFile, int merge){
|
mas01mc@292
|
996 Uns32T x,y;
|
mas01mc@292
|
997
|
mas01mc@292
|
998 if( merge && !serial_can_merge(O2_SERIAL_FILEFORMAT2) )
|
mas01mc@292
|
999 error("Cannot merge core and serial LSH, data structure dimensions mismatch.");
|
mas01mc@292
|
1000
|
mas01mc@340
|
1001 // We must pereform FORMAT2 merges in core FORMAT1 (dynamic list structure)
|
mas01mc@292
|
1002 if(merge)
|
mas01mc@340
|
1003 unserialize_lsh_hashtables_format2(dbFile, merge);
|
mas01mc@292
|
1004
|
mas01mc@292
|
1005 Uns32T colCount, meanColCount, colCountN, maxColCount, minColCount, t1;
|
mas01mc@336
|
1006 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
|
mas01mc@336
|
1007 fclose(dbFile);
|
mas01mc@336
|
1008 error("fSeek error in serialize_lsh_hashtables_format2");
|
mas01mc@336
|
1009 }
|
mas01mc@292
|
1010
|
mas01mc@292
|
1011 // Write the hash tables
|
mas01mc@292
|
1012 for( x = 0 ; x < H::L ; x++ ){
|
mas01mc@292
|
1013 std::cout << (merge ? "merging":"writing") << " hash table " << x << " FORMAT2...";
|
mas01mc@292
|
1014 std::cout.flush();
|
mas01mc@292
|
1015 maxColCount=0;
|
mas01mc@292
|
1016 minColCount=O2_SERIAL_MAX_COLS;
|
mas01mc@292
|
1017 meanColCount=0;
|
mas01mc@292
|
1018 colCountN=0;
|
mas01mc@764
|
1019 H::tablesPointCount = 0;
|
mas01mc@292
|
1020 for( y = 0 ; y < H::N ; y++ ){
|
mas01mc@292
|
1021 colCount=0;
|
mas01mc@292
|
1022 if(bucket* bPtr = h[x][y]){
|
mas01mc@306
|
1023 // Check for empty row (even though row was allocated)
|
mas01mc@340
|
1024 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@306
|
1025 if(bPtr->next->t2==IFLAG){
|
mas01mc@336
|
1026 fclose(dbFile);
|
mas01mc@306
|
1027 error("b->next->t2==IFLAG","serialize_lsh_hashtables_format2()");
|
mas01mc@306
|
1028 }
|
mas01mc@306
|
1029 #else
|
mas01mc@306
|
1030 if(bPtr->t2==IFLAG){
|
mas01mc@336
|
1031 fclose(dbFile);
|
mas01mc@306
|
1032 error("b->t2==IFLAG","serialize_lsh_hashtables_format2()");
|
mas01mc@306
|
1033 }
|
mas01mc@306
|
1034 #endif
|
mas01mc@306
|
1035 t1 = O2_SERIAL_TOKEN_T1;
|
mas01mc@340
|
1036 WRITE_UNS32(&t1, "[T1]");
|
mas01mc@306
|
1037 t1 = y;
|
mas01mc@340
|
1038 WRITE_UNS32(&t1, "[t1]");
|
mas01mc@340
|
1039 #ifdef LSH_CORE_ARRAY
|
mas01mc@340
|
1040 t1 = count_buckets_and_points_hashtable_row(bPtr);
|
mas01mc@340
|
1041 WRITE_UNS32(&t1,"[count]"); // write numElements
|
mas01mc@340
|
1042 #endif
|
mas01mc@340
|
1043 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@336
|
1044 serial_write_hashtable_row_format2(dbFile, bPtr->next, colCount); // skip collision counter bucket
|
mas01mc@292
|
1045 #else
|
mas01mc@340
|
1046 serial_write_hashtable_row_format2(dbFile, bPtr, colCount);
|
mas01mc@292
|
1047 #endif
|
mas01mc@292
|
1048 }
|
mas01mc@292
|
1049 if(colCount){
|
mas01mc@292
|
1050 if(colCount<minColCount)
|
mas01mc@292
|
1051 minColCount=colCount;
|
mas01mc@292
|
1052 if(colCount>maxColCount)
|
mas01mc@292
|
1053 maxColCount=colCount;
|
mas01mc@292
|
1054 meanColCount+=colCount;
|
mas01mc@292
|
1055 colCountN++;
|
mas01mc@292
|
1056 }
|
mas01mc@764
|
1057 H::tablesPointCount+=colCount;
|
mas01mc@292
|
1058 }
|
mas01mc@292
|
1059 // Write END of table marker
|
mas01mc@306
|
1060 t1 = O2_SERIAL_TOKEN_ENDTABLE;
|
mas01mc@340
|
1061 WRITE_UNS32(&t1,"[end]");
|
mas01mc@292
|
1062 if(colCountN)
|
mas01mc@764
|
1063 std::cout << "#points: " << H::tablesPointCount << " #rows with collisions =" << colCountN << ", mean = " << meanColCount/(float)colCountN
|
mas01mc@292
|
1064 << ", min = " << minColCount << ", max = " << maxColCount
|
mas01mc@292
|
1065 << endl;
|
mas01mc@340
|
1066 }
|
mas01mc@292
|
1067 // We're done writing
|
mas01mc@292
|
1068 return 1;
|
mas01mc@292
|
1069 }
|
mas01mc@292
|
1070
|
mas01mc@340
|
1071 Uns32T G::count_buckets_and_points_hashtable_row(bucket* bPtr){
|
mas01mc@340
|
1072 Uns32T total_count = 0;
|
mas01mc@340
|
1073 bucket* p = 0;
|
mas01mc@340
|
1074
|
mas01mc@340
|
1075 // count points
|
mas01mc@340
|
1076 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@340
|
1077 total_count = bPtr->t2; // points already counted
|
mas01mc@340
|
1078 p = bPtr->next;
|
mas01mc@340
|
1079 #else
|
mas01mc@340
|
1080 total_count = count_points_hashtable_row(bPtr);
|
mas01mc@340
|
1081 p = bPtr;
|
mas01mc@340
|
1082 #endif
|
mas01mc@340
|
1083
|
mas01mc@340
|
1084 // count buckets
|
mas01mc@340
|
1085 do{
|
mas01mc@340
|
1086 total_count++;
|
mas01mc@340
|
1087 }while((p=p->next));
|
mas01mc@340
|
1088
|
mas01mc@340
|
1089 return total_count;
|
mas01mc@340
|
1090 }
|
mas01mc@340
|
1091
|
mas01mc@340
|
1092 Uns32T G::count_points_hashtable_row(bucket* bPtr){
|
mas01mc@340
|
1093 Uns32T point_count = 0;
|
mas01mc@340
|
1094 bucket* p = bPtr;
|
mas01mc@340
|
1095 sbucket* s = 0;
|
mas01mc@340
|
1096 while(p){
|
mas01mc@344
|
1097 s = p->snext.ptr;
|
mas01mc@340
|
1098 while(s){
|
mas01mc@340
|
1099 point_count++;
|
mas01mc@340
|
1100 s=s->snext;
|
mas01mc@340
|
1101 }
|
mas01mc@340
|
1102 p=p->next;
|
mas01mc@340
|
1103 }
|
mas01mc@340
|
1104 return point_count;
|
mas01mc@340
|
1105 }
|
mas01mc@340
|
1106
|
mas01mc@336
|
1107 void G::serial_write_hashtable_row_format2(FILE* dbFile, bucket* b, Uns32T& colCount){
|
mas01mc@764
|
1108 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1109 Uns32T last_t2 = 0;
|
mas01mc@764
|
1110 #endif
|
mas01mc@292
|
1111 while(b && b->t2!=IFLAG){
|
mas01mc@344
|
1112 if(!b->snext.ptr){
|
mas01mc@336
|
1113 fclose(dbFile);
|
mas01mc@306
|
1114 error("Empty collision chain in serial_write_hashtable_row_format2()");
|
mas01mc@306
|
1115 }
|
mas01mc@306
|
1116 t2 = O2_SERIAL_TOKEN_T2;
|
mas01mc@764
|
1117
|
mas01mc@336
|
1118 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
|
mas01mc@336
|
1119 fclose(dbFile);
|
mas01mc@292
|
1120 error("write error in serial_write_hashtable_row_format2()");
|
mas01mc@292
|
1121 }
|
mas01mc@292
|
1122 t2 = b->t2;
|
mas01mc@764
|
1123 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1124 if(t2 < last_t2){
|
mas01mc@764
|
1125 fclose(dbFile);
|
mas01mc@764
|
1126 error("t2<last_t2 in serial_write_hashtable_row_format2()");
|
mas01mc@764
|
1127 }
|
mas01mc@764
|
1128 last_t2 = t2;
|
mas01mc@764
|
1129 #endif
|
mas01mc@764
|
1130
|
mas01mc@336
|
1131 if( fwrite(&t2, sizeof(Uns32T), 1, dbFile) != 1 ){
|
mas01mc@336
|
1132 fclose(dbFile);
|
mas01mc@292
|
1133 error("write error in serial_write_hashtable_row_format2()");
|
mas01mc@292
|
1134 }
|
mas01mc@344
|
1135 serial_write_element_format2(dbFile, b->snext.ptr, colCount);
|
mas01mc@292
|
1136 b=b->next;
|
mas01mc@292
|
1137 }
|
mas01mc@292
|
1138 }
|
mas01mc@292
|
1139
|
mas01mc@336
|
1140 void G::serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount){
|
mas01mc@292
|
1141 while(sb){
|
mas01mc@336
|
1142 if(fwrite(&sb->pointID, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@336
|
1143 fclose(dbFile);
|
mas01mc@292
|
1144 error("Write error in serial_write_element_format2()");
|
mas01mc@292
|
1145 }
|
mas01mc@292
|
1146 colCount++;
|
mas01mc@292
|
1147 sb=sb->snext;
|
mas01mc@292
|
1148 }
|
mas01mc@292
|
1149 }
|
mas01mc@292
|
1150
|
mas01mc@292
|
1151
|
mas01mc@292
|
1152 int G::serial_create(char* filename, Uns32T FMT){
|
mas01mc@292
|
1153 return serial_create(filename, w, L, N, C, k, d, FMT);
|
mas01mc@292
|
1154 }
|
mas01mc@292
|
1155
|
mas01mc@292
|
1156
|
mas01mc@292
|
1157 int G::serial_create(char* filename, float binWidth, Uns32T numTables, Uns32T numRows, Uns32T numCols,
|
mas01mc@292
|
1158 Uns32T numFuns, Uns32T dim, Uns32T FMT){
|
mas01mc@292
|
1159
|
mas01mc@292
|
1160 if(numTables > O2_SERIAL_MAX_TABLES || numRows > O2_SERIAL_MAX_ROWS
|
mas01mc@292
|
1161 || numCols > O2_SERIAL_MAX_COLS || numFuns > O2_SERIAL_MAX_FUNS
|
mas01mc@292
|
1162 || dim>O2_SERIAL_MAX_DIM){
|
mas01mc@292
|
1163 error("LSH parameters out of bounds for serialization");
|
mas01mc@292
|
1164 }
|
mas01mc@292
|
1165
|
mas01mc@292
|
1166 int dbfid;
|
mas01cr@648
|
1167 if ((dbfid = open (filename, O_RDWR|O_CREAT|O_EXCL,
|
mas01cr@654
|
1168 #if defined(WIN32)
|
mas01cr@648
|
1169 _S_IREAD|_S_IWRITE
|
mas01cr@648
|
1170 #else
|
mas01cr@648
|
1171 S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH
|
mas01cr@648
|
1172 #endif
|
mas01cr@648
|
1173 )) < 0)
|
mas01cr@648
|
1174
|
mas01mc@292
|
1175 error("Can't create serial file", filename, "open");
|
mas01mc@292
|
1176 get_lock(dbfid, 1);
|
mas01mc@292
|
1177
|
mas01mc@292
|
1178 // Make header first to get size of serialized database
|
mas01mc@296
|
1179 lshHeader = new SerialHeaderT(binWidth, numTables, numRows, numCols, numFuns, dim, radius, maxp, FMT, pointCount);
|
mas01mc@296
|
1180
|
mas01mc@296
|
1181 cout << "file size: <=" << lshHeader->get_size()/1024UL << "KB" << endl;
|
mas01mc@296
|
1182 if(lshHeader->get_size()>O2_SERIAL_MAXFILESIZE)
|
mas01mc@296
|
1183 error("Maximum size of LSH file exceded: > 4000MB");
|
mas01mc@296
|
1184
|
mas01mc@292
|
1185 // go to the location corresponding to the last byte
|
mas01mc@292
|
1186 if (lseek (dbfid, lshHeader->get_size() - 1, SEEK_SET) == -1)
|
mas01mc@292
|
1187 error("lseek error in db file", "", "lseek");
|
mas01mc@292
|
1188
|
mas01mc@292
|
1189 // write a dummy byte at the last location
|
mas01mc@292
|
1190 if (write (dbfid, "", 1) != 1)
|
mas01mc@292
|
1191 error("write error", "", "write");
|
mas01mc@764
|
1192
|
mas01mc@764
|
1193 char* db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 1);
|
mas01mc@764
|
1194
|
mas01mc@764
|
1195 memcpy (db, lshHeader, O2_SERIAL_HEADER_SIZE);
|
mas01mc@764
|
1196
|
mas01mc@764
|
1197 serial_munmap(db, O2_SERIAL_HEADER_SIZE);
|
mas01mc@764
|
1198
|
mas01cr@648
|
1199
|
mas01mc@292
|
1200 close(dbfid);
|
mas01mc@292
|
1201
|
mas01mc@292
|
1202 std::cout << "done initializing tables." << endl;
|
mas01mc@292
|
1203
|
mas01mc@292
|
1204 return 1;
|
mas01mc@292
|
1205 }
|
mas01mc@292
|
1206
|
mas01mc@764
|
1207 char* G::serial_mmap(int dbfid, Uns32T memSize, Uns32T forWrite, off_t offset){
|
mas01mc@764
|
1208 char* db;
|
mas01mc@764
|
1209 if(forWrite){
|
mas01mc@764
|
1210 if ((db = (char*) mmap(0, memSize, PROT_READ | PROT_WRITE,
|
mas01mc@764
|
1211 MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
|
mas01mc@764
|
1212 error("mmap error in request for writable serialized database", "", "mmap");
|
mas01mc@764
|
1213 }
|
mas01mc@764
|
1214 else if ((db = (char*) mmap(0, memSize, PROT_READ, MAP_SHARED, dbfid, offset)) == (caddr_t) -1)
|
mas01mc@764
|
1215 error("mmap error in read-only serialized database", "", "mmap");
|
mas01mc@764
|
1216
|
mas01mc@764
|
1217 return db;
|
mas01mc@764
|
1218 }
|
mas01mc@764
|
1219
|
mas01mc@764
|
1220 SerialHeaderT* G::serial_get_header(char* db){
|
mas01mc@292
|
1221 lshHeader = new SerialHeaderT();
|
mas01mc@764
|
1222 memcpy((char*)lshHeader, db, sizeof(SerialHeaderT));
|
mas01mc@292
|
1223
|
mas01mc@292
|
1224 if(lshHeader->lshMagic!=O2_SERIAL_MAGIC)
|
mas01mc@292
|
1225 error("Not an LSH database file");
|
mas01mc@764
|
1226
|
mas01mc@292
|
1227 return lshHeader;
|
mas01mc@292
|
1228 }
|
mas01mc@292
|
1229
|
mas01mc@764
|
1230 void G::serial_munmap(char* db, Uns32T N){
|
mas01mc@764
|
1231 munmap(db, N);
|
mas01mc@292
|
1232 }
|
mas01mc@292
|
1233
|
mas01mc@292
|
1234 int G::serial_open(char* filename, int writeFlag){
|
mas01mc@292
|
1235 int dbfid;
|
mas01mc@292
|
1236 if(writeFlag){
|
mas01mc@292
|
1237 if ((dbfid = open (filename, O_RDWR)) < 0)
|
mas01mc@292
|
1238 error("Can't open serial file for read/write", filename, "open");
|
mas01mc@292
|
1239 get_lock(dbfid, writeFlag);
|
mas01mc@292
|
1240 }
|
mas01mc@292
|
1241 else{
|
mas01mc@292
|
1242 if ((dbfid = open (filename, O_RDONLY)) < 0)
|
mas01mc@292
|
1243 error("Can't open serial file for read", filename, "open");
|
mas01mc@292
|
1244 get_lock(dbfid, 0);
|
mas01mc@292
|
1245 }
|
mas01mc@292
|
1246
|
mas01mc@292
|
1247 return dbfid;
|
mas01mc@292
|
1248 }
|
mas01mc@292
|
1249
|
mas01mc@292
|
1250 void G::serial_close(int dbfid){
|
mas01mc@292
|
1251
|
mas01mc@292
|
1252 release_lock(dbfid);
|
mas01mc@292
|
1253 close(dbfid);
|
mas01mc@292
|
1254 }
|
mas01mc@292
|
1255
|
mas01mc@292
|
1256 int G::unserialize_lsh_header(char* filename){
|
mas01mc@292
|
1257 int dbfid;
|
mas01mc@764
|
1258 char* db;
|
mas01mc@292
|
1259 // Test to see if file exists
|
mas01mc@292
|
1260 if((dbfid = open (filename, O_RDONLY)) < 0)
|
mas01mc@292
|
1261 error("Can't open the file", filename, "open");
|
mas01mc@292
|
1262 close(dbfid);
|
mas01mc@292
|
1263 dbfid = serial_open(filename, 0); // open for read
|
mas01mc@764
|
1264 db = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
|
mas01mc@764
|
1265 serial_get_header(db); // read header
|
mas01mc@764
|
1266 serial_munmap(db, O2_SERIAL_HEADER_SIZE); // drop mmap
|
mas01mc@292
|
1267
|
mas01mc@292
|
1268 // Unserialize header parameters
|
mas01mc@292
|
1269 H::L = lshHeader->numTables;
|
mas01mc@292
|
1270 H::m = (Uns32T)( (1.0 + sqrt(1 + 8.0*(int)H::L)) / 2.0);
|
mas01mc@292
|
1271 H::N = lshHeader->numRows;
|
mas01mc@292
|
1272 H::C = lshHeader->numCols;
|
mas01mc@292
|
1273 H::k = lshHeader->numFuns;
|
mas01mc@292
|
1274 H::d = lshHeader->dataDim;
|
mas01mc@293
|
1275 H::w = lshHeader->binWidth;
|
mas01mc@293
|
1276 H::radius = lshHeader->radius;
|
mas01mc@293
|
1277 H::maxp = lshHeader->maxp;
|
mas01mc@296
|
1278 H::pointCount = lshHeader->pointCount;
|
mas01mc@292
|
1279
|
mas01mc@292
|
1280 return dbfid;
|
mas01mc@292
|
1281 }
|
mas01mc@292
|
1282
|
mas01mc@292
|
1283 // unserialize the LSH parameters
|
mas01mc@292
|
1284 // we leave the LSH tree on disk as a flat file
|
mas01mc@292
|
1285 // it is this flat file that we search by memory mapping
|
mas01mc@292
|
1286 void G::unserialize_lsh_functions(int dbfid){
|
mas01mc@292
|
1287 Uns32T j, kk;
|
mas01mc@292
|
1288 float* pf;
|
mas01mc@292
|
1289 Uns32T* pu;
|
mas01mc@292
|
1290
|
mas01mc@292
|
1291 // Load the hash functions into core
|
mas01mc@764
|
1292 char* db = serial_mmap(dbfid, get_serial_hashtable_offset(), 0);// get database pointer again
|
mas01mc@764
|
1293
|
mas01mc@764
|
1294 pf = get_serial_hashfunction_base(db);
|
mas01mc@292
|
1295
|
mas01mc@292
|
1296 #ifdef USE_U_FUNCTIONS
|
mas01mc@292
|
1297 for( j = 0 ; j < H::m ; j++ ){ // L functions gj(v)
|
mas01mc@292
|
1298 for( kk = 0 ; kk < H::k/2 ; kk++ ){ // Normally distributed hash functions
|
mas01mc@292
|
1299 #else
|
mas01mc@764
|
1300 for( j = 0 ; j < H::L ; j++ ){ // L functions gj(v)
|
mas01mc@764
|
1301 for( kk = 0 ; kk < H::k ; kk++ ){ // Normally distributed hash functions
|
mas01mc@292
|
1302 #endif
|
mas01mc@764
|
1303 for(Uns32T i = 0 ; i < H::d ; i++ )
|
mas01mc@764
|
1304 H::A[j][kk][i] = *pf++; // Normally distributed random vectors
|
mas01mc@764
|
1305 }
|
mas01mc@764
|
1306 }
|
mas01mc@292
|
1307 #ifdef USE_U_FUNCTIONS
|
mas01mc@764
|
1308 for( j = 0 ; j < H::m ; j++ ) // biases b
|
mas01mc@764
|
1309 for( kk = 0 ; kk < H::k/2 ; kk++ )
|
mas01mc@292
|
1310 #else
|
mas01mc@764
|
1311 for( j = 0 ; j < H::L ; j++ ) // biases b
|
mas01mc@764
|
1312 for( kk = 0 ; kk < H::k ; kk++ )
|
mas01mc@292
|
1313 #endif
|
mas01mc@764
|
1314 H::b[j][kk] = *pf++;
|
mas01mc@292
|
1315
|
mas01mc@764
|
1316 pu = (Uns32T*)pf;
|
mas01mc@764
|
1317 for( j = 0 ; j < H::L ; j++ ) // Z projectors r1
|
mas01mc@764
|
1318 for( kk = 0 ; kk < H::k ; kk++ )
|
mas01mc@764
|
1319 H::r1[j][kk] = *pu++;
|
mas01mc@764
|
1320
|
mas01mc@764
|
1321 for( j = 0 ; j < H::L ; j++ ) // Z projectors r2
|
mas01mc@764
|
1322 for( kk = 0 ; kk < H::k ; kk++ )
|
mas01mc@764
|
1323 H::r2[j][kk] = *pu++;
|
mas01mc@292
|
1324
|
mas01mc@764
|
1325 serial_munmap(db, get_serial_hashtable_offset());
|
mas01mc@292
|
1326 }
|
mas01mc@292
|
1327
|
mas01mc@292
|
1328 void G::unserialize_lsh_hashtables_format1(int fid){
|
mas01mc@292
|
1329 SerialElementT *pe, *pt;
|
mas01mc@292
|
1330 Uns32T x,y;
|
mas01mc@292
|
1331 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
|
mas01mc@292
|
1332 // Read the hash tables into core
|
mas01mc@292
|
1333 for( x = 0 ; x < H::L ; x++ ){
|
mas01mc@292
|
1334 // memory map a single hash table
|
mas01mc@292
|
1335 // Align each hash table to page boundary
|
mas01mc@764
|
1336 char* dbtable = serial_mmap(fid, hashTableSize, 0,
|
mas01mc@764
|
1337 align_up(get_serial_hashtable_offset()+x*hashTableSize, get_page_logn()));
|
mas01mc@764
|
1338 #ifdef __CYGWIN__
|
mas01mc@764
|
1339 // No madvise in CYGWIN
|
mas01mc@764
|
1340 #else
|
mas01mc@764
|
1341 if(madvise(dbtable, hashTableSize, MADV_SEQUENTIAL)<0)
|
mas01mc@764
|
1342 error("could not advise hashtable memory","","madvise");
|
mas01mc@764
|
1343 #endif
|
mas01mc@764
|
1344 pt=(SerialElementT*)dbtable;
|
mas01mc@292
|
1345 for( y = 0 ; y < H::N ; y++ ){
|
mas01mc@292
|
1346 // Move disk pointer to beginning of row
|
mas01mc@292
|
1347 pe=pt+y*lshHeader->numCols;
|
mas01mc@292
|
1348 unserialize_hashtable_row_format1(pe, h[x]+y);
|
mas01mc@340
|
1349 #ifdef LSH_DUMP_CORE_TABLES
|
mas01mc@292
|
1350 printf("S[%d,%d]", x, y);
|
mas01mc@292
|
1351 serial_bucket_dump(pe);
|
mas01mc@292
|
1352 printf("C[%d,%d]", x, y);
|
mas01mc@292
|
1353 dump_hashtable_row(h[x][y]);
|
mas01mc@292
|
1354 #endif
|
mas01mc@292
|
1355 }
|
mas01mc@764
|
1356 serial_munmap(dbtable, hashTableSize);
|
mas01mc@292
|
1357 }
|
mas01mc@292
|
1358 }
|
mas01mc@292
|
1359
|
mas01mc@292
|
1360 void G::unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b){
|
mas01mc@292
|
1361 Uns32T colCount = 0;
|
mas01mc@292
|
1362 while(colCount!=lshHeader->numCols && pe->hashValue !=IFLAG){
|
mas01mc@292
|
1363 H::p = pe->pointID; // current point ID
|
mas01mc@292
|
1364 t2 = pe->hashValue;
|
mas01mc@292
|
1365 bucket_insert_point(b);
|
mas01mc@292
|
1366 pe++;
|
mas01mc@292
|
1367 colCount++;
|
mas01mc@292
|
1368 }
|
mas01mc@292
|
1369 }
|
mas01mc@292
|
1370
|
mas01mc@340
|
1371 void G::unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge){
|
mas01mc@292
|
1372 Uns32T x=0,y=0;
|
mas01mc@764
|
1373 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1374 cout << "Loading hashtables..." << endl;
|
mas01mc@764
|
1375 cout << "header pointCount = " << pointCount << endl;
|
mas01mc@764
|
1376 cout << "forMerge = " << forMerge << endl;
|
mas01mc@764
|
1377 Uns32T sumTablesPointCount = 0;
|
mas01mc@764
|
1378 #endif
|
mas01mc@292
|
1379 // Seek to hashtable base offset
|
mas01mc@336
|
1380 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
|
mas01mc@336
|
1381 fclose(dbFile);
|
mas01mc@336
|
1382 error("fSeek error in unserialize_lsh_hashtables_format2");
|
mas01mc@292
|
1383 }
|
mas01mc@292
|
1384
|
mas01mc@292
|
1385 // Read the hash tables into core (structure is given in header)
|
mas01mc@292
|
1386 while( x < H::L){
|
mas01mc@764
|
1387 tablesPointCount=0;
|
mas01mc@336
|
1388 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@336
|
1389 fclose(dbFile);
|
mas01mc@292
|
1390 error("Read error","unserialize_lsh_hashtables_format2()");
|
mas01mc@292
|
1391 }
|
mas01mc@306
|
1392 if(H::t1==O2_SERIAL_TOKEN_ENDTABLE)
|
mas01mc@292
|
1393 x++; // End of table
|
mas01mc@292
|
1394 else
|
mas01mc@292
|
1395 while(y < H::N){
|
mas01mc@306
|
1396 // Read a row and move file pointer to beginning of next row or _bittable
|
mas01mc@306
|
1397 if(!(H::t1==O2_SERIAL_TOKEN_T1)){
|
mas01mc@336
|
1398 fclose(dbFile);
|
mas01mc@306
|
1399 error("State matchine error T1","unserialize_lsh_hashtables_format2()");
|
mas01mc@292
|
1400 }
|
mas01mc@336
|
1401 if(fread(&(H::t1), sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@336
|
1402 fclose(dbFile);
|
mas01mc@306
|
1403 error("Read error: t1","unserialize_lsh_hashtables_format2()");
|
mas01mc@306
|
1404 }
|
mas01mc@306
|
1405 y = H::t1;
|
mas01mc@292
|
1406 if(y>=H::N){
|
mas01mc@336
|
1407 fclose(dbFile);
|
mas01mc@292
|
1408 error("Unserialized hashtable row pointer out of range","unserialize_lsh_hashtables_format2()");
|
mas01mc@292
|
1409 }
|
mas01mc@340
|
1410 Uns32T token = 0;
|
mas01mc@340
|
1411 #ifdef LSH_CORE_ARRAY
|
mas01mc@340
|
1412 Uns32T numElements;
|
mas01mc@340
|
1413 if(fread(&numElements, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@340
|
1414 fclose(dbFile);
|
mas01mc@340
|
1415 error("Read error: numElements","unserialize_lsh_hashtables_format2()");
|
mas01mc@340
|
1416 }
|
mas01mc@764
|
1417
|
mas01mc@764
|
1418 /*
|
mas01mc@764
|
1419 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1420 cout << "[" << x << "," << y << "] numElements(disk) = " << numElements;
|
mas01mc@764
|
1421 #endif
|
mas01mc@764
|
1422 */
|
mas01mc@340
|
1423 // BACKWARD COMPATIBILITY: check to see if T2 or END token was read
|
mas01mc@340
|
1424 if(numElements==O2_SERIAL_TOKEN_T2 || numElements==O2_SERIAL_TOKEN_ENDTABLE ){
|
mas01mc@340
|
1425 forMerge=true; // Force use of dynamic linked list core format
|
mas01mc@340
|
1426 token = numElements;
|
mas01mc@340
|
1427 }
|
mas01mc@340
|
1428
|
mas01mc@764
|
1429 if(forMerge) // FOR INDEXING use dynamic linked list data structure
|
mas01mc@340
|
1430 // Use linked list CORE format
|
mas01mc@340
|
1431 token = unserialize_hashtable_row_format2(dbFile, h[x]+y, token);
|
mas01mc@764
|
1432 else // FOR QUERY use static array data structure
|
mas01mc@340
|
1433 // Use ARRAY CORE format with numElements counter
|
mas01mc@340
|
1434 token = unserialize_hashtable_row_to_array(dbFile, h[x]+y, numElements);
|
mas01mc@340
|
1435 #else
|
mas01mc@764
|
1436 token = unserialize_hashtable_row_format2(dbFile, h[x]+y);
|
mas01mc@513
|
1437 #endif
|
mas01mc@292
|
1438 // Check that token is valid
|
mas01mc@306
|
1439 if( !(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE) ){
|
mas01mc@336
|
1440 fclose(dbFile);
|
mas01mc@292
|
1441 error("State machine error end of row/table", "unserialize_lsh_hashtables_format2()");
|
mas01mc@292
|
1442 }
|
mas01mc@292
|
1443 // Check for end of table flag
|
mas01mc@306
|
1444 if(token==O2_SERIAL_TOKEN_ENDTABLE){
|
mas01mc@292
|
1445 x++;
|
mas01mc@292
|
1446 break;
|
mas01mc@292
|
1447 }
|
mas01mc@292
|
1448 // Check for new row flag
|
mas01mc@306
|
1449 if(token==O2_SERIAL_TOKEN_T1)
|
mas01mc@292
|
1450 H::t1 = token;
|
mas01mc@292
|
1451 }
|
mas01mc@764
|
1452 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1453 cout << "[T " << x-1 << "] pointCount = " << tablesPointCount << endl;
|
mas01mc@764
|
1454 sumTablesPointCount+=tablesPointCount;
|
mas01mc@764
|
1455 #endif
|
mas01mc@292
|
1456 }
|
mas01mc@764
|
1457 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1458 cout << "TOTAL pointCount = " << sumTablesPointCount << endl;
|
mas01mc@764
|
1459 #endif
|
mas01mc@513
|
1460 #ifdef LSH_DUMP_CORE_TABLES
|
mas01mc@513
|
1461 dump_hashtables();
|
mas01mc@513
|
1462 #endif
|
mas01mc@306
|
1463 }
|
mas01mc@292
|
1464
|
mas01mc@340
|
1465 Uns32T G::unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token){
|
mas01mc@292
|
1466 bool pointFound = false;
|
mas01mc@340
|
1467
|
mas01mc@340
|
1468 if(token)
|
mas01mc@340
|
1469 H::t2 = token;
|
mas01mc@340
|
1470 else if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@340
|
1471 fclose(dbFile);
|
mas01mc@340
|
1472 error("Read error T2 token","unserialize_hashtable_row_format2");
|
mas01mc@340
|
1473 }
|
mas01mc@340
|
1474
|
mas01mc@306
|
1475 if( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2)){
|
mas01mc@336
|
1476 fclose(dbFile);
|
mas01mc@292
|
1477 error("State machine error: expected E or T2");
|
mas01mc@292
|
1478 }
|
mas01mc@340
|
1479
|
mas01mc@306
|
1480 while(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1)){
|
mas01mc@292
|
1481 pointFound=false;
|
mas01mc@292
|
1482 // Check for T2 token
|
mas01mc@306
|
1483 if(H::t2!=O2_SERIAL_TOKEN_T2){
|
mas01mc@336
|
1484 fclose(dbFile);
|
mas01mc@292
|
1485 error("State machine error T2 token", "unserialize_hashtable_row_format2()");
|
mas01mc@306
|
1486 }
|
mas01mc@292
|
1487 // Read t2 value
|
mas01mc@336
|
1488 if(fread(&(H::t2), sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@336
|
1489 fclose(dbFile);
|
mas01mc@292
|
1490 error("Read error t2","unserialize_hashtable_row_format2");
|
mas01mc@292
|
1491 }
|
mas01mc@336
|
1492 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@336
|
1493 fclose(dbFile);
|
mas01mc@292
|
1494 error("Read error H::p","unserialize_hashtable_row_format2");
|
mas01mc@292
|
1495 }
|
mas01mc@306
|
1496 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
|
mas01mc@292
|
1497 pointFound=true;
|
mas01mc@292
|
1498 bucket_insert_point(b);
|
mas01mc@764
|
1499 tablesPointCount++;
|
mas01mc@336
|
1500 if(fread(&(H::p), sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@336
|
1501 fclose(dbFile);
|
mas01mc@292
|
1502 error("Read error H::p","unserialize_hashtable_row_format2");
|
mas01mc@292
|
1503 }
|
mas01mc@292
|
1504 }
|
mas01mc@292
|
1505 if(!pointFound)
|
mas01mc@292
|
1506 error("State machine error: point", "unserialize_hashtable_row_format2()");
|
mas01mc@306
|
1507 H::t2 = H::p; // Copy last found token to t2
|
mas01mc@292
|
1508 }
|
mas01mc@292
|
1509 return H::t2; // holds current token
|
mas01mc@292
|
1510 }
|
mas01mc@292
|
1511
|
mas01mc@340
|
1512 // Unserialize format2 hashtable row to a CORE_ARRAY pointed to
|
mas01mc@340
|
1513 // by the hashtable row pointer: rowPtr
|
mas01mc@340
|
1514 //
|
mas01mc@340
|
1515 // numElements is the total number of t2 buckets plus points
|
mas01mc@340
|
1516 // memory required is numElements+1+sizeof(hashtable ptr)
|
mas01mc@340
|
1517 //
|
mas01mc@340
|
1518 // numElements = numPoints + numBuckets
|
mas01mc@340
|
1519 //
|
mas01mc@340
|
1520 // During inserts (merges) new hashtable entries are appened at rowPtr+numPoints+numBuckets+1
|
mas01mc@340
|
1521 //
|
mas01mc@340
|
1522 // ASSUME: that LSH_LIST_HEAD_COUNTERS is set so that the first hashtable bucket is used to count
|
mas01mc@340
|
1523 // point and bucket entries
|
mas01mc@340
|
1524 //
|
mas01mc@340
|
1525 // We store the values of numPoints and numBuckets in separate fields of the first bucket
|
mas01mc@340
|
1526 // rowPtr->t2 // numPoints
|
mas01mc@764
|
1527 // (rowPtr->snext.numBuckets) // numBuckets
|
mas01mc@340
|
1528 //
|
mas01mc@340
|
1529 // We cast the rowPtr->next pointer to (Uns32*) malloc(numElements*sizeof(Uns32T) + sizeof(bucket*))
|
mas01mc@340
|
1530 // To get to the fist bucket, we use
|
mas01mc@340
|
1531 //
|
mas01mc@340
|
1532
|
mas01mc@340
|
1533 #define READ_UNS32T(VAL,TOKENSTR) if(fread(VAL, sizeof(Uns32T), 1, dbFile) != 1){\
|
mas01mc@340
|
1534 fclose(dbFile);error("Read error unserialize_hashtable_format2",TOKENSTR);}
|
mas01mc@340
|
1535
|
mas01mc@340
|
1536 #define TEST_TOKEN(TEST, TESTSTR) if(TEST){fclose(dbFile);error("State machine error: ", TESTSTR);}
|
mas01mc@340
|
1537
|
mas01mc@340
|
1538 #define SKIP_BITS_LEFT_SHIFT_MSB (30)
|
mas01mc@340
|
1539
|
mas01mc@340
|
1540 #define SKIP_BITS_RIGHT_SHIFT_MSB (28)
|
mas01mc@340
|
1541 #define SKIP_BITS_RIGHT_SHIFT_LSB (30)
|
mas01mc@340
|
1542
|
mas01mc@340
|
1543 #define MAX_POINTS_IN_BUCKET_CORE_ARRAY (16)
|
mas01mc@340
|
1544 #define LSH_CORE_ARRAY_END_ROW_TOKEN (0xFFFFFFFD)
|
mas01mc@340
|
1545
|
mas01mc@340
|
1546 // Encode the skip bits. Zero if only one point, MAX 8 (plus first == 9)
|
mas01mc@340
|
1547 #define ENCODE_POINT_SKIP_BITS TEST_TOKEN(!numPointsThisBucket, "no points found");\
|
mas01mc@340
|
1548 if(numPointsThisBucket==1){\
|
mas01mc@340
|
1549 secondPtr=ap++;\
|
mas01mc@340
|
1550 *secondPtr=0;\
|
mas01mc@340
|
1551 numPoints++;\
|
mas01mc@764
|
1552 numSingletons++;\
|
mas01mc@340
|
1553 }\
|
mas01mc@340
|
1554 if(numPointsThisBucket>1){\
|
mas01mc@340
|
1555 *firstPtr |= ( (numPointsThisBucket-1) & 0x3 ) << SKIP_BITS_LEFT_SHIFT_MSB;\
|
mas01mc@340
|
1556 *secondPtr |= ( ( (numPointsThisBucket-1) & 0xC) >> 2 ) << SKIP_BITS_LEFT_SHIFT_MSB;}
|
mas01mc@340
|
1557
|
mas01mc@340
|
1558 Uns32T G::unserialize_hashtable_row_to_array(FILE* dbFile, bucket** rowPP, Uns32T numElements){
|
mas01mc@340
|
1559 Uns32T numPointsThisBucket = 0;
|
mas01mc@340
|
1560 Uns32T numBuckets = 0;
|
mas01mc@340
|
1561 Uns32T numPoints = 0;
|
mas01mc@340
|
1562 Uns32T* firstPtr = 0;
|
mas01mc@340
|
1563 Uns32T* secondPtr = 0;
|
mas01mc@764
|
1564 Uns32T numSingletons = 0; // Count single point puckets because we encode them with 2 points (for skip)
|
mas01mc@340
|
1565
|
mas01mc@340
|
1566 // Initialize new row
|
mas01mc@340
|
1567 if(!*rowPP){
|
mas01mc@340
|
1568 *rowPP = new bucket();
|
mas01mc@340
|
1569 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@340
|
1570 (*rowPP)->t2 = 0; // Use t2 as a collision counter for the row
|
mas01mc@340
|
1571 (*rowPP)->next = 0;
|
mas01mc@340
|
1572 #endif
|
mas01mc@340
|
1573 }
|
mas01mc@340
|
1574 bucket* rowPtr = *rowPP;
|
mas01mc@764
|
1575 Uns32T last_t2 = 0;
|
mas01mc@340
|
1576 READ_UNS32T(&(H::t2),"t2");
|
mas01mc@340
|
1577 TEST_TOKEN(!(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T2), "expected E or T2");
|
mas01mc@340
|
1578 // Because we encode points in 16-point blocks, we sometimes allocate repeated t2 elements
|
mas01mc@340
|
1579 // So over-allocate by a factor of two and realloc later to actual numElements
|
mas01mc@340
|
1580 CR_ASSERT(rowPtr->next = (bucket*) malloc((2*numElements+1)*sizeof(Uns32T)+sizeof(bucket**)));
|
mas01mc@340
|
1581 Uns32T* ap = reinterpret_cast<Uns32T*>(rowPtr->next); // Cast pointer to Uns32T* for array format
|
mas01mc@340
|
1582 while( !(H::t2==O2_SERIAL_TOKEN_ENDTABLE || H::t2==O2_SERIAL_TOKEN_T1) ){
|
mas01mc@340
|
1583 numPointsThisBucket = 0;// reset bucket point counter
|
mas01mc@340
|
1584 secondPtr = 0; // reset second-point pointer
|
mas01mc@340
|
1585 TEST_TOKEN(H::t2!=O2_SERIAL_TOKEN_T2, "expected T2");
|
mas01mc@340
|
1586 READ_UNS32T(&(H::t2), "Read error t2");
|
mas01mc@764
|
1587 if(H::t2<last_t2)
|
mas01mc@764
|
1588 cout << "last_t2=" << last_t2 << ", t2=" << H::t2 << endl;
|
mas01mc@764
|
1589 TEST_TOKEN(H::t2<last_t2, "t2 tokens not in ascending order");
|
mas01mc@764
|
1590 last_t2 = H::t2;
|
mas01mc@764
|
1591 /*
|
mas01mc@764
|
1592 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1593 cout << "+" << H::t2 << "+";
|
mas01mc@764
|
1594 #endif
|
mas01mc@764
|
1595 */
|
mas01mc@340
|
1596 *ap++ = H::t2; // Insert t2 value into array
|
mas01mc@340
|
1597 numBuckets++;
|
mas01mc@340
|
1598 READ_UNS32T(&(H::p), "Read error H::p");
|
mas01mc@340
|
1599 while(!(H::p==O2_SERIAL_TOKEN_ENDTABLE || H::p==O2_SERIAL_TOKEN_T1 || H::p==O2_SERIAL_TOKEN_T2 )){
|
mas01mc@340
|
1600 if(numPointsThisBucket==MAX_POINTS_IN_BUCKET_CORE_ARRAY){
|
mas01mc@340
|
1601 ENCODE_POINT_SKIP_BITS;
|
mas01mc@764
|
1602 /*
|
mas01mc@764
|
1603 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1604 cout << "*" << H::t2 << "*";
|
mas01mc@764
|
1605 #endif
|
mas01mc@764
|
1606 */
|
mas01mc@340
|
1607 *ap++ = H::t2; // Extra element
|
mas01mc@340
|
1608 numBuckets++; // record this as a new bucket
|
mas01mc@340
|
1609 numPointsThisBucket=0; // reset bucket point counter
|
mas01mc@340
|
1610 secondPtr = 0; // reset second-point pointer
|
mas01mc@340
|
1611 }
|
mas01mc@340
|
1612 if( ++numPointsThisBucket == 1 )
|
mas01mc@340
|
1613 firstPtr = ap; // store pointer to first point to insert skip bits later on
|
mas01mc@340
|
1614 else if( numPointsThisBucket == 2 )
|
mas01mc@340
|
1615 secondPtr = ap; // store pointer to first point to insert skip bits later on
|
mas01mc@340
|
1616 numPoints++;
|
mas01mc@764
|
1617 /*
|
mas01mc@764
|
1618 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1619 cout << "(" << H::p << ":" << numPoints << ")";
|
mas01mc@764
|
1620 #endif
|
mas01mc@764
|
1621 */
|
mas01mc@764
|
1622
|
mas01mc@340
|
1623 *ap++ = H::p;
|
mas01mc@340
|
1624 READ_UNS32T(&(H::p), "Read error H::p");
|
mas01mc@340
|
1625 }
|
mas01mc@340
|
1626 ENCODE_POINT_SKIP_BITS;
|
mas01mc@340
|
1627 H::t2 = H::p; // Copy last found token to t2
|
mas01mc@340
|
1628 }
|
mas01mc@340
|
1629 // Reallocate the row to its actual size
|
mas01mc@340
|
1630 CR_ASSERT(rowPtr->next = (bucket*) realloc(rowPtr->next, (numBuckets+numPoints+1)*sizeof(Uns32T)+sizeof(bucket**)));
|
mas01mc@340
|
1631 // Record the sizes at the head of the row
|
mas01mc@344
|
1632 rowPtr->snext.numBuckets = numBuckets;
|
mas01mc@340
|
1633 rowPtr->t2 = numPoints;
|
mas01mc@340
|
1634 // Place end of row marker
|
mas01mc@340
|
1635 *ap++ = LSH_CORE_ARRAY_END_ROW_TOKEN;
|
mas01mc@340
|
1636 // Set the LSH_CORE_ARRAY_BIT to identify data structure for insertion and retrieval
|
mas01mc@340
|
1637 rowPtr->t2 |= LSH_CORE_ARRAY_BIT;
|
mas01mc@340
|
1638 // Allocate a new dynamic list head at the end of the array
|
mas01mc@340
|
1639 bucket** listPtr = reinterpret_cast<bucket**> (ap);
|
mas01mc@340
|
1640 *listPtr = 0;
|
mas01mc@764
|
1641 /*
|
mas01mc@764
|
1642 #ifdef _LSH_DEBUG_
|
mas01mc@764
|
1643 cout << " numBuckets=" << numBuckets << " numPoints=" << numPoints - numSingletons << " numElements(array) " << numBuckets+numPoints - numSingletons << " " << endl;
|
mas01mc@764
|
1644 #endif
|
mas01mc@764
|
1645 */
|
mas01mc@764
|
1646 H::tablesPointCount += numPoints - numSingletons;
|
mas01mc@340
|
1647 // Return current token
|
mas01mc@340
|
1648 return H::t2; // return H::t2 which holds current token [E or T1]
|
mas01mc@340
|
1649 }
|
mas01mc@340
|
1650
|
mas01mc@340
|
1651
|
mas01mc@340
|
1652
|
mas01mc@340
|
1653 // *p is a pointer to the beginning of a hashtable row array
|
mas01mc@340
|
1654 // The array consists of t2 hash keys and one or more point identifiers p for each hash key
|
mas01mc@340
|
1655 // Retrieval is performed by generating a hash key query_t2 for query point q
|
mas01mc@340
|
1656 // We identify the row that t2 is stored in using a secondary hash t1, this row is the entry
|
mas01mc@340
|
1657 // point for retrieve_from_core_hashtable_array
|
mas01mc@764
|
1658 #define SKIP_BITS (0xC0000000U)
|
mas01mc@340
|
1659 void G::retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos){
|
mas01mc@340
|
1660 Uns32T skip;
|
mas01mc@340
|
1661 Uns32T t2;
|
mas01mc@340
|
1662 Uns32T p1;
|
mas01mc@340
|
1663 Uns32T p2;
|
mas01mc@340
|
1664
|
mas01mc@340
|
1665 CR_ASSERT(p);
|
mas01mc@340
|
1666
|
mas01mc@340
|
1667 do{
|
mas01mc@340
|
1668 t2 = *p++;
|
mas01mc@340
|
1669 if( t2 > H::t2 )
|
mas01mc@340
|
1670 return;
|
mas01mc@340
|
1671 p1 = *p++;
|
mas01mc@340
|
1672 p2 = *p++;
|
mas01mc@340
|
1673 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
|
mas01mc@340
|
1674 if( t2 == H::t2 ){
|
mas01mc@764
|
1675 add_point_callback(calling_instance, p1 & ~SKIP_BITS, qpos, radius);
|
mas01mc@340
|
1676 if(skip--){
|
mas01mc@764
|
1677 add_point_callback(calling_instance, p2 & ~SKIP_BITS, qpos, radius);
|
mas01mc@340
|
1678 while(skip-- )
|
mas01mc@340
|
1679 add_point_callback(calling_instance, *p++, qpos, radius);
|
mas01mc@340
|
1680 }
|
mas01mc@340
|
1681 }
|
mas01mc@340
|
1682 else
|
mas01mc@340
|
1683 if(*p != LSH_CORE_ARRAY_END_ROW_TOKEN)
|
mas01mc@340
|
1684 p = p + skip;
|
mas01mc@340
|
1685 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
|
mas01mc@340
|
1686 }
|
mas01mc@513
|
1687
|
mas01mc@513
|
1688 void G::dump_hashtables(){
|
mas01mc@513
|
1689 for(Uns32T x = 0; x < H::L ; x++)
|
mas01mc@513
|
1690 for(Uns32T y = 0; y < H::N ; y++){
|
mas01mc@513
|
1691 bucket* bPtr = h[x][y];
|
mas01mc@513
|
1692 if(bPtr){
|
mas01mc@513
|
1693 printf("C[%d,%d]", x, y);
|
mas01mc@513
|
1694 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@513
|
1695 printf("[numBuckets=%d]",bPtr->snext.numBuckets);
|
mas01mc@513
|
1696 if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
|
mas01mc@513
|
1697 dump_core_hashtable_array((Uns32T*)(bPtr->next));
|
mas01mc@513
|
1698 }
|
mas01mc@513
|
1699 else {
|
mas01mc@513
|
1700 dump_hashtable_row(bPtr->next);
|
mas01mc@513
|
1701 }
|
mas01mc@513
|
1702 #else
|
mas01mc@513
|
1703 dump_hashtable_row(bPtr);
|
mas01mc@513
|
1704 #endif
|
mas01mc@513
|
1705 printf("\n");
|
mas01mc@513
|
1706 fflush(stdout);
|
mas01mc@513
|
1707 }
|
mas01mc@513
|
1708 }
|
mas01mc@513
|
1709 }
|
mas01mc@513
|
1710
|
mas01mc@764
|
1711 void G::dump_core_row(Uns32T n){
|
mas01mc@764
|
1712 if(!(n<H::N)){
|
mas01mc@764
|
1713 printf("ROW OUT OF RANGE:%d (MAX:%d)\n", n, H::N-1);
|
mas01mc@764
|
1714 return;
|
mas01mc@764
|
1715 }
|
mas01mc@764
|
1716 for(Uns32T j = 0 ; j < H::L ; j++ ){
|
mas01mc@764
|
1717 bucket* bPtr = h[j][n];
|
mas01mc@764
|
1718 if(bPtr){
|
mas01mc@764
|
1719 printf("C[%d,%d]", j, n);
|
mas01mc@764
|
1720 #ifdef LSH_LIST_HEAD_COUNTERS
|
mas01mc@764
|
1721 printf("[numBuckets=%d]",bPtr->snext.numBuckets);
|
mas01mc@764
|
1722 if(bPtr->t2&LSH_CORE_ARRAY_BIT) {
|
mas01mc@764
|
1723 dump_core_hashtable_array((Uns32T*)(bPtr->next));
|
mas01mc@764
|
1724 }
|
mas01mc@764
|
1725 else {
|
mas01mc@764
|
1726 dump_hashtable_row(bPtr->next);
|
mas01mc@764
|
1727 }
|
mas01mc@764
|
1728 #else
|
mas01mc@764
|
1729 dump_hashtable_row(bPtr);
|
mas01mc@764
|
1730 #endif
|
mas01mc@764
|
1731 printf("\n");
|
mas01mc@764
|
1732 }
|
mas01mc@764
|
1733 }
|
mas01mc@764
|
1734 }
|
mas01mc@764
|
1735
|
mas01mc@764
|
1736 void G::dump_disk_row(char* filename, Uns32T n){
|
mas01mc@764
|
1737 int dbfid = unserialize_lsh_header(filename);
|
mas01mc@764
|
1738 if(dbfid<0){
|
mas01mc@764
|
1739 cerr << "Could not read header from " << filename << endl;
|
mas01mc@764
|
1740 return;
|
mas01mc@764
|
1741 }
|
mas01mc@764
|
1742 FILE* dbFile = 0;
|
mas01mc@764
|
1743 dbFile = fdopen(dbfid, "rb");
|
mas01mc@764
|
1744 if(!dbFile){
|
mas01mc@764
|
1745 cerr << "Could not create FILE pointer from file:" << filename << " with fid:" << dbfid << endl;
|
mas01mc@764
|
1746 close(dbfid);
|
mas01mc@764
|
1747 return;
|
mas01mc@764
|
1748 }
|
mas01mc@764
|
1749
|
mas01mc@764
|
1750 Uns32T x=0,y=0;
|
mas01mc@764
|
1751
|
mas01mc@764
|
1752 // Seek to hashtable base offset
|
mas01mc@764
|
1753 if(fseek(dbFile, get_serial_hashtable_offset(), SEEK_SET)){
|
mas01mc@764
|
1754 fclose(dbFile);
|
mas01mc@764
|
1755 error("fSeek error in unserialize_lsh_hashtables_format2");
|
mas01mc@764
|
1756 }
|
mas01mc@764
|
1757 Uns32T token = 0;
|
mas01mc@764
|
1758 Uns32T pointID;
|
mas01mc@764
|
1759
|
mas01mc@764
|
1760 // Read the hash tables into core (structure is given in header)
|
mas01mc@764
|
1761 while( x < H::L){
|
mas01mc@764
|
1762 y=0;
|
mas01mc@764
|
1763 if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1764 fclose(dbFile);
|
mas01mc@764
|
1765 error("Read error T1","unserialize_lsh_hashtables_format2()");
|
mas01mc@764
|
1766 }
|
mas01mc@764
|
1767 while(token != O2_SERIAL_TOKEN_ENDTABLE){
|
mas01mc@764
|
1768 if(token == O2_SERIAL_TOKEN_T1){
|
mas01mc@764
|
1769 if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1770 fclose(dbFile);
|
mas01mc@764
|
1771 error("Read error t1","unserialize_lsh_hashtables_format2()");
|
mas01mc@764
|
1772 }
|
mas01mc@764
|
1773 y=token;
|
mas01mc@764
|
1774 if(y==n){
|
mas01mc@764
|
1775 printf("D[%d,%d]", x, y);
|
mas01mc@764
|
1776 if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1777 fclose(dbFile);
|
mas01mc@764
|
1778 error("Read error 2","unserialize_lsh_hashtables_format2()");
|
mas01mc@764
|
1779 }
|
mas01mc@764
|
1780 printf("[numElements=%d]", token);
|
mas01mc@764
|
1781 if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1782 fclose(dbFile);
|
mas01mc@764
|
1783 error("Read error 3","unserialize_lsh_hashtables_format2()");
|
mas01mc@764
|
1784 }
|
mas01mc@764
|
1785 while(!(token==O2_SERIAL_TOKEN_ENDTABLE || token==O2_SERIAL_TOKEN_T1)){
|
mas01mc@764
|
1786 // Check for T2 token
|
mas01mc@764
|
1787 if(token!=O2_SERIAL_TOKEN_T2){
|
mas01mc@764
|
1788 printf("t2=%d",token);
|
mas01mc@764
|
1789 fclose(dbFile);
|
mas01mc@764
|
1790 error("State machine error T2 token", "unserialize_hashtable_row_format2()");
|
mas01mc@764
|
1791 }
|
mas01mc@764
|
1792 // Read t2 value
|
mas01mc@764
|
1793 if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1794 fclose(dbFile);
|
mas01mc@764
|
1795 error("Read error t2","unserialize_hashtable_row_format2");
|
mas01mc@764
|
1796 }
|
mas01mc@764
|
1797 if(fread(&pointID, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1798 fclose(dbFile);
|
mas01mc@764
|
1799 error("Read error pointID","unserialize_hashtable_row_format2");
|
mas01mc@764
|
1800 }
|
mas01mc@764
|
1801 while(!(pointID==O2_SERIAL_TOKEN_ENDTABLE || pointID==O2_SERIAL_TOKEN_T1 || pointID==O2_SERIAL_TOKEN_T2 )){
|
mas01mc@764
|
1802 printf("(%0X,%u)", token, pointID);
|
mas01mc@764
|
1803 if(fread(&pointID, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1804 fclose(dbFile);
|
mas01mc@764
|
1805 error("Read error H::p","unserialize_hashtable_row_format2");
|
mas01mc@764
|
1806 }
|
mas01mc@764
|
1807 }
|
mas01mc@764
|
1808 token = pointID; // Copy last found token
|
mas01mc@764
|
1809 }
|
mas01mc@764
|
1810 printf("\n");
|
mas01mc@764
|
1811 }
|
mas01mc@764
|
1812 else{ // gobble up rest of row
|
mas01mc@764
|
1813 while(!(token==O2_SERIAL_TOKEN_T1 || token==O2_SERIAL_TOKEN_ENDTABLE)){
|
mas01mc@764
|
1814 if(fread(&token, sizeof(Uns32T), 1, dbFile) != 1){
|
mas01mc@764
|
1815 fclose(dbFile);
|
mas01mc@764
|
1816 error("Read error 4","unserialize_lsh_hashtables_format2()");
|
mas01mc@764
|
1817 }
|
mas01mc@764
|
1818 }
|
mas01mc@764
|
1819 }
|
mas01mc@764
|
1820 }
|
mas01mc@764
|
1821 }
|
mas01mc@764
|
1822 if(token==O2_SERIAL_TOKEN_ENDTABLE){
|
mas01mc@764
|
1823 x++;
|
mas01mc@764
|
1824 }
|
mas01mc@764
|
1825 }
|
mas01mc@764
|
1826 close(dbfid);
|
mas01mc@764
|
1827 }
|
mas01mc@764
|
1828
|
mas01mc@764
|
1829
|
mas01mc@513
|
1830 void G::dump_core_hashtable_array(Uns32T* p){
|
mas01mc@513
|
1831 Uns32T skip;
|
mas01mc@513
|
1832 Uns32T t2;
|
mas01mc@513
|
1833 Uns32T p1;
|
mas01mc@513
|
1834 Uns32T p2;
|
mas01mc@513
|
1835 CR_ASSERT(p);
|
mas01mc@513
|
1836 do{
|
mas01mc@513
|
1837 t2 = *p++;
|
mas01mc@513
|
1838 p1 = *p++;
|
mas01mc@513
|
1839 p2 = *p++;
|
mas01mc@513
|
1840 skip = (( p1 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_LSB) + (( p2 & SKIP_BITS ) >> SKIP_BITS_RIGHT_SHIFT_MSB);
|
mas01mc@764
|
1841 printf("(%0X, %u)", t2, p1 & ~SKIP_BITS);
|
mas01mc@513
|
1842 if(skip--){
|
mas01mc@764
|
1843 printf("(%0X, %u)", t2, p2 & ~SKIP_BITS);
|
mas01mc@513
|
1844 while(skip-- )
|
mas01mc@764
|
1845 printf("(%0X, %u)", t2, *p++);
|
mas01mc@513
|
1846 }
|
mas01mc@513
|
1847 }while( *p != LSH_CORE_ARRAY_END_ROW_TOKEN );
|
mas01mc@513
|
1848 }
|
mas01mc@340
|
1849
|
mas01mc@292
|
1850 void G::dump_hashtable_row(bucket* p){
|
mas01mc@292
|
1851 while(p && p->t2!=IFLAG){
|
mas01mc@344
|
1852 sbucket* sbp = p->snext.ptr;
|
mas01mc@292
|
1853 while(sbp){
|
mas01mc@292
|
1854 printf("(%0X,%u)", p->t2, sbp->pointID);
|
mas01mc@292
|
1855 fflush(stdout);
|
mas01mc@292
|
1856 sbp=sbp->snext;
|
mas01mc@292
|
1857 }
|
mas01mc@292
|
1858 p=p->next;
|
mas01mc@292
|
1859 }
|
mas01mc@292
|
1860 printf("\n");
|
mas01mc@292
|
1861 }
|
mas01mc@292
|
1862
|
mas01mc@292
|
1863
|
mas01mc@292
|
1864 // G::serial_retrieve_point( ... )
|
mas01mc@292
|
1865 // retrieves (pointID) from a serialized LSH database
|
mas01mc@292
|
1866 //
|
mas01mc@292
|
1867 // inputs:
|
mas01mc@292
|
1868 // filename - file name of serialized LSH database
|
mas01mc@292
|
1869 // vv - query point set
|
mas01mc@292
|
1870 //
|
mas01mc@292
|
1871 // outputs:
|
mas01mc@292
|
1872 // inserts retrieved points into add_point() callback method
|
mas01mc@292
|
1873 void G::serial_retrieve_point_set(char* filename, vector<vector<float> >& vv, ReporterCallbackPtr add_point, void* caller)
|
mas01mc@292
|
1874 {
|
mas01mc@292
|
1875 int dbfid = serial_open(filename, 0); // open for read
|
mas01mc@764
|
1876 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
|
mas01mc@764
|
1877 serial_get_header(dbheader); // read header
|
mas01mc@764
|
1878 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
|
mas01mc@292
|
1879
|
mas01mc@292
|
1880 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
|
mas01mc@336
|
1881 serial_close(dbfid);
|
mas01mc@292
|
1882 error("serial_retrieve_point_set is for SERIAL_FILEFORMAT1 only");
|
mas01mc@292
|
1883 }
|
mas01mc@292
|
1884
|
mas01mc@292
|
1885 // size of each hash table
|
mas01mc@292
|
1886 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
|
mas01mc@292
|
1887 calling_instance = caller; // class instance variable used in ...bucket_chain_point()
|
mas01mc@292
|
1888 add_point_callback = add_point;
|
mas01mc@292
|
1889
|
mas01mc@292
|
1890 for(Uns32T j=0; j<L; j++){
|
mas01mc@764
|
1891 // memory map a single hash table for random access
|
mas01mc@764
|
1892 char* db = serial_mmap(dbfid, hashTableSize, 0,
|
mas01mc@764
|
1893 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
|
mas01mc@764
|
1894 #ifdef __CYGWIN__
|
mas01mc@764
|
1895 // No madvise in CYGWIN
|
mas01mc@764
|
1896 #else
|
mas01mc@764
|
1897 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
|
mas01mc@764
|
1898 error("could not advise local hashtable memory","","madvise");
|
mas01mc@764
|
1899 #endif
|
mas01mc@764
|
1900 SerialElementT* pe = (SerialElementT*)db ;
|
mas01mc@292
|
1901 for(Uns32T qpos=0; qpos<vv.size(); qpos++){
|
mas01mc@293
|
1902 H::compute_hash_functions(vv[qpos]);
|
mas01mc@293
|
1903 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
|
mas01mc@292
|
1904 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
|
mas01mc@292
|
1905 }
|
mas01mc@764
|
1906 serial_munmap(db, hashTableSize); // drop hashtable mmap
|
mas01mc@292
|
1907 }
|
mas01mc@292
|
1908 serial_close(dbfid);
|
mas01mc@292
|
1909 }
|
mas01mc@292
|
1910
|
mas01mc@292
|
1911 void G::serial_retrieve_point(char* filename, vector<float>& v, Uns32T qpos, ReporterCallbackPtr add_point, void* caller){
|
mas01mc@292
|
1912 int dbfid = serial_open(filename, 0); // open for read
|
mas01mc@764
|
1913 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
|
mas01mc@764
|
1914 serial_get_header(dbheader); // read header
|
mas01mc@764
|
1915 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
|
mas01mc@292
|
1916
|
mas01mc@292
|
1917 if((lshHeader->flags & O2_SERIAL_FILEFORMAT2)){
|
mas01mc@336
|
1918 serial_close(dbfid);
|
mas01mc@292
|
1919 error("serial_retrieve_point is for SERIAL_FILEFORMAT1 only");
|
mas01mc@292
|
1920 }
|
mas01mc@292
|
1921
|
mas01mc@292
|
1922 // size of each hash table
|
mas01mc@292
|
1923 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
|
mas01mc@292
|
1924 calling_instance = caller;
|
mas01mc@292
|
1925 add_point_callback = add_point;
|
mas01mc@293
|
1926 H::compute_hash_functions(v);
|
mas01mc@292
|
1927 for(Uns32T j=0; j<L; j++){
|
mas01mc@764
|
1928 // memory map a single hash table for random access
|
mas01mc@764
|
1929 char* db = serial_mmap(dbfid, hashTableSize, 0,
|
mas01mc@764
|
1930 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
|
mas01mc@764
|
1931 #ifdef __CYGWIN__
|
mas01mc@764
|
1932 // No madvise in CYGWIN
|
mas01mc@764
|
1933 #else
|
mas01mc@764
|
1934 if(madvise(db, hashTableSize, MADV_RANDOM)<0)
|
mas01mc@764
|
1935 error("could not advise local hashtable memory","","madvise");
|
mas01mc@764
|
1936 #endif
|
mas01mc@764
|
1937 SerialElementT* pe = (SerialElementT*)db ;
|
mas01mc@293
|
1938 H::generate_hash_keys(*(g+j),*(r1+j),*(r2+j));
|
mas01mc@292
|
1939 serial_bucket_chain_point(pe+t1*lshHeader->numCols, qpos); // Point to correct row
|
mas01mc@764
|
1940 serial_munmap(db, hashTableSize); // drop hashtable mmap
|
mas01mc@764
|
1941 }
|
mas01mc@292
|
1942 serial_close(dbfid);
|
mas01mc@292
|
1943 }
|
mas01mc@292
|
1944
|
mas01mc@292
|
1945 void G::serial_dump_tables(char* filename){
|
mas01mc@292
|
1946 int dbfid = serial_open(filename, 0); // open for read
|
mas01mc@764
|
1947 char* dbheader = serial_mmap(dbfid, O2_SERIAL_HEADER_SIZE, 0);// get database pointer
|
mas01mc@764
|
1948 serial_get_header(dbheader); // read header
|
mas01mc@764
|
1949 serial_munmap(dbheader, O2_SERIAL_HEADER_SIZE); // drop header mmap
|
mas01mc@292
|
1950 Uns32T hashTableSize=sizeof(SerialElementT)*lshHeader->numRows*lshHeader->numCols;
|
mas01mc@292
|
1951 for(Uns32T j=0; j<L; j++){
|
mas01mc@764
|
1952 // memory map a single hash table for random access
|
mas01mc@764
|
1953 char* db = serial_mmap(dbfid, hashTableSize, 0,
|
mas01mc@764
|
1954 align_up(get_serial_hashtable_offset()+j*hashTableSize,get_page_logn()));
|
mas01mc@764
|
1955 #ifdef __CYGWIN__
|
mas01mc@764
|
1956 // No madvise in CYGWIN
|
mas01mc@764
|
1957 #else
|
mas01mc@764
|
1958 if(madvise(db, hashTableSize, MADV_SEQUENTIAL)<0)
|
mas01mc@764
|
1959 error("could not advise local hashtable memory","","madvise");
|
mas01mc@764
|
1960 #endif
|
mas01mc@764
|
1961 SerialElementT* pe = (SerialElementT*)db ;
|
mas01mc@292
|
1962 printf("*********** TABLE %d ***************\n", j);
|
mas01mc@292
|
1963 fflush(stdout);
|
mas01mc@292
|
1964 int count=0;
|
mas01mc@292
|
1965 do{
|
mas01mc@292
|
1966 printf("[%d,%d]", j, count++);
|
mas01mc@292
|
1967 fflush(stdout);
|
mas01mc@292
|
1968 serial_bucket_dump(pe);
|
mas01mc@292
|
1969 pe+=lshHeader->numCols;
|
mas01mc@292
|
1970 }while(pe<(SerialElementT*)db+lshHeader->numRows*lshHeader->numCols);
|
mas01mc@292
|
1971 }
|
mas01mc@292
|
1972 }
|
mas01mc@292
|
1973
|
mas01mc@292
|
1974 void G::serial_bucket_dump(SerialElementT* pe){
|
mas01mc@292
|
1975 SerialElementT* pend = pe+lshHeader->numCols;
|
mas01mc@292
|
1976 while( !(pe->hashValue==IFLAG || pe==pend ) ){
|
mas01mc@292
|
1977 printf("(%0X,%u)",pe->hashValue,pe->pointID);
|
mas01mc@292
|
1978 pe++;
|
mas01mc@292
|
1979 }
|
mas01mc@292
|
1980 printf("\n");
|
mas01mc@292
|
1981 fflush(stdout);
|
mas01mc@292
|
1982 }
|
mas01mc@292
|
1983
|
mas01mc@292
|
1984 void G::serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos){
|
mas01mc@292
|
1985 SerialElementT* pend = pe+lshHeader->numCols;
|
mas01mc@292
|
1986 while( !(pe->hashValue==IFLAG || pe==pend ) ){
|
mas01mc@292
|
1987 if(pe->hashValue==t2){ // new match
|
mas01mc@292
|
1988 add_point_callback(calling_instance, pe->pointID, qpos, radius);
|
mas01mc@292
|
1989 }
|
mas01mc@292
|
1990 pe++;
|
mas01mc@292
|
1991 }
|
mas01mc@292
|
1992 }
|
mas01mc@292
|
1993
|
mas01mc@292
|
1994 void G::bucket_chain_point(bucket* p, Uns32T qpos){
|
mas01mc@292
|
1995 if(!p || p->t2==IFLAG)
|
mas01mc@292
|
1996 return;
|
mas01mc@292
|
1997 if(p->t2==t2){ // match
|
mas01mc@344
|
1998 sbucket_chain_point(p->snext.ptr, qpos); // add to reporter
|
mas01mc@292
|
1999 }
|
mas01mc@292
|
2000 if(p->next){
|
mas01mc@292
|
2001 bucket_chain_point(p->next, qpos); // recurse
|
mas01mc@292
|
2002 }
|
mas01mc@292
|
2003 }
|
mas01mc@292
|
2004
|
mas01mc@292
|
2005 void G::sbucket_chain_point(sbucket* p, Uns32T qpos){
|
mas01mc@292
|
2006 add_point_callback(calling_instance, p->pointID, qpos, radius);
|
mas01mc@292
|
2007 if(p->snext){
|
mas01mc@292
|
2008 sbucket_chain_point(p->snext, qpos);
|
mas01mc@292
|
2009 }
|
mas01mc@292
|
2010 }
|
mas01mc@292
|
2011
|
mas01mc@292
|
2012 void G::get_lock(int fd, bool exclusive) {
|
mas01cr@648
|
2013 #ifndef WIN32
|
mas01mc@292
|
2014 struct flock lock;
|
mas01mc@292
|
2015 int status;
|
mas01mc@292
|
2016 lock.l_type = exclusive ? F_WRLCK : F_RDLCK;
|
mas01mc@292
|
2017 lock.l_whence = SEEK_SET;
|
mas01mc@292
|
2018 lock.l_start = 0;
|
mas01mc@292
|
2019 lock.l_len = 0; /* "the whole file" */
|
mas01mc@292
|
2020 retry:
|
mas01mc@292
|
2021 do {
|
mas01mc@292
|
2022 status = fcntl(fd, F_SETLKW, &lock);
|
mas01mc@292
|
2023 } while (status != 0 && errno == EINTR);
|
mas01mc@292
|
2024 if (status) {
|
mas01mc@292
|
2025 if (errno == EAGAIN) {
|
mas01mc@292
|
2026 sleep(1);
|
mas01mc@292
|
2027 goto retry;
|
mas01mc@292
|
2028 } else {
|
mas01mc@292
|
2029 error("fcntl lock error", "", "fcntl");
|
mas01mc@292
|
2030 }
|
mas01mc@292
|
2031 }
|
mas01cr@648
|
2032 #else
|
mas01cr@648
|
2033 int status;
|
mas01cr@648
|
2034
|
mas01cr@648
|
2035 retry:
|
mas01cr@648
|
2036 status = _locking(fd, _LK_NBLCK, getpagesize());
|
mas01cr@648
|
2037 if(status) {
|
mas01cr@648
|
2038 Sleep(1000);
|
mas01cr@648
|
2039 goto retry;
|
mas01cr@648
|
2040 }
|
mas01cr@648
|
2041 #endif
|
mas01mc@292
|
2042 }
|
mas01mc@292
|
2043
|
mas01mc@292
|
2044 void G::release_lock(int fd) {
|
mas01cr@648
|
2045 #ifndef WIN32
|
mas01mc@292
|
2046 struct flock lock;
|
mas01mc@292
|
2047 int status;
|
mas01mc@292
|
2048
|
mas01mc@292
|
2049 lock.l_type = F_UNLCK;
|
mas01mc@292
|
2050 lock.l_whence = SEEK_SET;
|
mas01mc@292
|
2051 lock.l_start = 0;
|
mas01mc@292
|
2052 lock.l_len = 0;
|
mas01mc@292
|
2053
|
mas01mc@292
|
2054 status = fcntl(fd, F_SETLKW, &lock);
|
mas01mc@292
|
2055
|
mas01mc@292
|
2056 if (status)
|
mas01mc@292
|
2057 error("fcntl unlock error", "", "fcntl");
|
mas01cr@648
|
2058 #else
|
mas01cr@648
|
2059 _locking(fd, _LK_UNLCK, getpagesize());
|
mas01cr@648
|
2060 #endif
|
mas01mc@292
|
2061 }
|