comparison lshlib.h @ 754:9bd13c7819ae mkc_lsh_update

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents 4b79043f90ba
children
comparison
equal deleted inserted replaced
747:fbf16508421f 754:9bd13c7819ae
9 */ 9 */
10 10
11 #ifndef __LSHLIB_H 11 #ifndef __LSHLIB_H
12 #define __LSHLIB_H 12 #define __LSHLIB_H
13 13
14 using namespace std; 14 #include <vector>
15 #include <queue>
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <sys/types.h>
19 #include <sys/stat.h>
20 #include <sys/mman.h>
21 #include <fcntl.h>
22 #include <string.h>
23 #include <iostream>
24 #include <fstream>
25 #include <math.h>
26 #include <sys/time.h>
27 #include <assert.h>
28 #include <float.h>
29 #include <signal.h>
30 #include <time.h>
31 #include <limits.h>
32 #include <errno.h>
33 #ifdef MT19937
34 #include "mt19937/mt19937ar.h"
35 #endif
36 #include "multiprobe.h"
15 37
16 #define IntT int 38 #define IntT int
17 #define LongUns64T long long unsigned 39 #define LongUns64T long long unsigned
18 #define Uns32T unsigned 40 #define Uns32T unsigned
19 #define Int32T int 41 #define Int32T int
20 #define BooleanT int 42 #define BooleanT int
43 #define TRUE 1
44 #define FALSE 0
21 45
22 // A big number (>> max # of points) 46 // A big number (>> max # of points)
23 #define INDEX_START_EMPTY 1000000000U 47 #define INDEX_START_EMPTY 1000000000U
24 48
25 // 4294967291 = 2^32-5 49 // 4294967291 = 2^32-5
53 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU) 77 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU)
54 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU) 78 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU)
55 79
56 #define O2_INDEX_MAXSTR (256) 80 #define O2_INDEX_MAXSTR (256)
57 81
58 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1)) 82 unsigned align_up(unsigned x, unsigned w);
59 83
60 #define O2_SERIAL_FUNCTIONS_SIZE (align_up(sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * O2_SERIAL_MAX_DIM \ 84 #define O2_SERIAL_FUNCTIONS_SIZE (align_up(sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * O2_SERIAL_MAX_DIM \
61 + sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS + \ 85 + sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS + \
62 + sizeof(Uns32T) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * 2 \ 86 + sizeof(Uns32T) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * 2 \
63 + O2_SERIAL_HEADER_SIZE,get_page_logn())) 87 + O2_SERIAL_HEADER_SIZE,get_page_logn()))
69 93
70 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\ 94 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\
71 fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);} 95 fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);}
72 96
73 //#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load 97 //#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load
98 //#define _LSH_DEBUG_ // turn on debugging information
74 //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use 99 //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use
75 100
76 // Backward-compatible CORE ARRAY lsh index 101 // Backward-compatible CORE ARRAY lsh index
77 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists 102 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists
78 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads 103 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads
82 #define LSH_LIST_HEAD_COUNTERS 107 #define LSH_LIST_HEAD_COUNTERS
83 #endif 108 #endif
84 109
85 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head 110 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head
86 111
112 #ifndef LSH_MULTI_PROBE_COUNT
113 #define LSH_MULTI_PROBE_COUNT 1 // How many adjacent hash-buckets to probe in LSH retrieval
114 #endif
115
87 Uns32T get_page_logn(); 116 Uns32T get_page_logn();
117
118 using namespace std;
88 119
89 // Disk table entry 120 // Disk table entry
90 typedef class SerialElement SerialElementT; 121 typedef class SerialElement SerialElementT;
91 class SerialElement { 122 class SerialElement {
92 public: 123 public:
200 #endif 231 #endif
201 Uns32T maxp; // highest pointID stored in database 232 Uns32T maxp; // highest pointID stored in database
202 Uns32T bucketCount; // count of number of point buckets allocated 233 Uns32T bucketCount; // count of number of point buckets allocated
203 Uns32T pointCount; // count of number of points inserted 234 Uns32T pointCount; // count of number of points inserted
204 Uns32T collisionCount; // number of points collided in a hash-table row 235 Uns32T collisionCount; // number of points collided in a hash-table row
236 Uns32T tablesPointCount; // count of points per hash table on load
205 237
206 Uns32T t1; // first hash table key 238 Uns32T t1; // first hash table key
207 Uns32T t2; // second hash table key 239 Uns32T t2; // second hash table key
208 Uns32T P; // hash table prime number 240 Uns32T P; // hash table prime number
241
209 242
210 Uns32T N; // num rows per table 243 Uns32T N; // num rows per table
211 Uns32T C; // num collision per row 244 Uns32T C; // num collision per row
212 Uns32T k; // num projections per hash function 245 Uns32T k; // num projections per hash function
213 Uns32T m; // ~sqrt num hash tables 246 Uns32T m; // ~sqrt num hash tables
214 Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0 247 Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0
215 Uns32T d; // dimensions 248 Uns32T d; // dimensions
216 Uns32T p; // current point 249 Uns32T p; // current point
217 float w; // width of hash slots (relative to normalized feature space) 250 float w; // width of hash slots (relative to normalized feature space)
218 float radius;// scaling coefficient for data (1./radius) 251 float radius;// scaling coefficient for data (1./radius)
252
253 MultiProbe* multiProbePtr; // Utility class for handling multi-probe queries
254 float ** boundaryDistances; // Array of query bucket-boundary-distances per hashtable
219 255
220 void initialize_data_structures(); 256 void initialize_data_structures();
221 void initialize_lsh_functions(); 257 void initialize_lsh_functions();
222 void initialize_partial_functions(); 258 void initialize_partial_functions();
223 void __bucket_insert_point(bucket*); 259 void __bucket_insert_point(bucket*);
248 Uns32T bucket_insert_point(bucket**); 284 Uns32T bucket_insert_point(bucket**);
249 285
250 // Interface to hash functions 286 // Interface to hash functions
251 void compute_hash_functions(vector<float>& v); 287 void compute_hash_functions(vector<float>& v);
252 void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2); 288 void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2);
289 void generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2);
253 Uns32T get_t1(){return t1;} // hash-key t1 290 Uns32T get_t1(){return t1;} // hash-key t1
254 Uns32T get_t2(){return t2;} // hash-key t2 291 Uns32T get_t2(){return t2;} // hash-key t2
255 }; 292 };
256 293
257 // Typedef for point-reporting callback function. Used to collect points during LSH retrieval 294 // Typedef for point-reporting callback function. Used to collect points during LSH retrieval
265 // LSH serial data structure file handling 302 // LSH serial data structure file handling
266 void get_lock(int fd, bool exclusive); 303 void get_lock(int fd, bool exclusive);
267 void release_lock(int fd); 304 void release_lock(int fd);
268 int serial_create(char* filename, Uns32T FMT); 305 int serial_create(char* filename, Uns32T FMT);
269 int serial_create(char* filename, float binWidth, Uns32T nTables, Uns32T nRows, Uns32T nCols, Uns32T k, Uns32T d, Uns32T FMT); 306 int serial_create(char* filename, float binWidth, Uns32T nTables, Uns32T nRows, Uns32T nCols, Uns32T k, Uns32T d, Uns32T FMT);
307 char* serial_mmap(int dbfid, Uns32T sz, Uns32T w, off_t offset = 0);
308 void serial_munmap(char* db, Uns32T N);
270 int serial_open(char* filename,int writeFlag); 309 int serial_open(char* filename,int writeFlag);
271 void serial_close(int dbfid); 310 void serial_close(int dbfid);
272 311
273 // Function to write hashfunctions to disk 312 // Function to write hashfunctions to disk
274 int serialize_lsh_hashfunctions(int fid); 313 int serialize_lsh_hashfunctions(int fid);
304 // Helper functions 343 // Helper functions
305 void serial_print_header(Uns32T requestedFormat); 344 void serial_print_header(Uns32T requestedFormat);
306 float* get_serial_hashfunction_base(char* db); 345 float* get_serial_hashfunction_base(char* db);
307 SerialElementT* get_serial_hashtable_base(char* db); 346 SerialElementT* get_serial_hashtable_base(char* db);
308 Uns32T get_serial_hashtable_offset(); // Size of SerialHeader + HashFunctions 347 Uns32T get_serial_hashtable_offset(); // Size of SerialHeader + HashFunctions
309 SerialHeaderT* serial_get_header(int fd); 348 SerialHeaderT* serial_get_header(char* db);
310 void serial_write_header(int fd, SerialHeaderT *header);
311 void serial_get_table(int, int, void *, size_t);
312 void serial_write_table(int, int, void *, size_t);
313 SerialHeaderT* lshHeader; 349 SerialHeaderT* lshHeader;
314 350
315 // Core Retrieval/Inspections Functions 351 // Core Retrieval/Inspections Functions
316 void bucket_chain_point(bucket* p, Uns32T qpos); 352 void bucket_chain_point(bucket* p, Uns32T qpos);
317 void sbucket_chain_point(sbucket* p, Uns32T qpos); 353 void sbucket_chain_point(sbucket* p, Uns32T qpos);
352 SerialHeaderT* get_lshHeader(){return lshHeader;} 388 SerialHeaderT* get_lshHeader(){return lshHeader;}
353 void serial_dump_tables(char* filename); 389 void serial_dump_tables(char* filename);
354 float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; } 390 float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; }
355 char* get_indexName(){return indexName;} 391 char* get_indexName(){return indexName;}
356 void dump_hashtables(); 392 void dump_hashtables();
357 393 void dump_core_row(Uns32T n);
394 void dump_disk_row(char*, Uns32T n);
358 }; 395 };
359 396
360 typedef class G LSH; 397 typedef class G LSH;
361 398
362 399