Mercurial > hg > audiodb
comparison lshlib.h @ 754:9bd13c7819ae mkc_lsh_update
Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author | mas01mc |
---|---|
date | Thu, 25 Nov 2010 13:42:40 +0000 |
parents | 4b79043f90ba |
children |
comparison
equal
deleted
inserted
replaced
747:fbf16508421f | 754:9bd13c7819ae |
---|---|
9 */ | 9 */ |
10 | 10 |
11 #ifndef __LSHLIB_H | 11 #ifndef __LSHLIB_H |
12 #define __LSHLIB_H | 12 #define __LSHLIB_H |
13 | 13 |
14 using namespace std; | 14 #include <vector> |
15 #include <queue> | |
16 #include <stdio.h> | |
17 #include <stdlib.h> | |
18 #include <sys/types.h> | |
19 #include <sys/stat.h> | |
20 #include <sys/mman.h> | |
21 #include <fcntl.h> | |
22 #include <string.h> | |
23 #include <iostream> | |
24 #include <fstream> | |
25 #include <math.h> | |
26 #include <sys/time.h> | |
27 #include <assert.h> | |
28 #include <float.h> | |
29 #include <signal.h> | |
30 #include <time.h> | |
31 #include <limits.h> | |
32 #include <errno.h> | |
33 #ifdef MT19937 | |
34 #include "mt19937/mt19937ar.h" | |
35 #endif | |
36 #include "multiprobe.h" | |
15 | 37 |
16 #define IntT int | 38 #define IntT int |
17 #define LongUns64T long long unsigned | 39 #define LongUns64T long long unsigned |
18 #define Uns32T unsigned | 40 #define Uns32T unsigned |
19 #define Int32T int | 41 #define Int32T int |
20 #define BooleanT int | 42 #define BooleanT int |
43 #define TRUE 1 | |
44 #define FALSE 0 | |
21 | 45 |
22 // A big number (>> max # of points) | 46 // A big number (>> max # of points) |
23 #define INDEX_START_EMPTY 1000000000U | 47 #define INDEX_START_EMPTY 1000000000U |
24 | 48 |
25 // 4294967291 = 2^32-5 | 49 // 4294967291 = 2^32-5 |
53 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU) | 77 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU) |
54 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU) | 78 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU) |
55 | 79 |
56 #define O2_INDEX_MAXSTR (256) | 80 #define O2_INDEX_MAXSTR (256) |
57 | 81 |
58 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1)) | 82 unsigned align_up(unsigned x, unsigned w); |
59 | 83 |
60 #define O2_SERIAL_FUNCTIONS_SIZE (align_up(sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * O2_SERIAL_MAX_DIM \ | 84 #define O2_SERIAL_FUNCTIONS_SIZE (align_up(sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * O2_SERIAL_MAX_DIM \ |
61 + sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS + \ | 85 + sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS + \ |
62 + sizeof(Uns32T) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * 2 \ | 86 + sizeof(Uns32T) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * 2 \ |
63 + O2_SERIAL_HEADER_SIZE,get_page_logn())) | 87 + O2_SERIAL_HEADER_SIZE,get_page_logn())) |
69 | 93 |
70 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\ | 94 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\ |
71 fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);} | 95 fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);} |
72 | 96 |
73 //#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load | 97 //#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load |
98 //#define _LSH_DEBUG_ // turn on debugging information | |
74 //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use | 99 //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use |
75 | 100 |
76 // Backward-compatible CORE ARRAY lsh index | 101 // Backward-compatible CORE ARRAY lsh index |
77 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists | 102 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists |
78 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads | 103 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads |
82 #define LSH_LIST_HEAD_COUNTERS | 107 #define LSH_LIST_HEAD_COUNTERS |
83 #endif | 108 #endif |
84 | 109 |
85 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head | 110 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head |
86 | 111 |
112 #ifndef LSH_MULTI_PROBE_COUNT | |
113 #define LSH_MULTI_PROBE_COUNT 1 // How many adjacent hash-buckets to probe in LSH retrieval | |
114 #endif | |
115 | |
87 Uns32T get_page_logn(); | 116 Uns32T get_page_logn(); |
117 | |
118 using namespace std; | |
88 | 119 |
89 // Disk table entry | 120 // Disk table entry |
90 typedef class SerialElement SerialElementT; | 121 typedef class SerialElement SerialElementT; |
91 class SerialElement { | 122 class SerialElement { |
92 public: | 123 public: |
200 #endif | 231 #endif |
201 Uns32T maxp; // highest pointID stored in database | 232 Uns32T maxp; // highest pointID stored in database |
202 Uns32T bucketCount; // count of number of point buckets allocated | 233 Uns32T bucketCount; // count of number of point buckets allocated |
203 Uns32T pointCount; // count of number of points inserted | 234 Uns32T pointCount; // count of number of points inserted |
204 Uns32T collisionCount; // number of points collided in a hash-table row | 235 Uns32T collisionCount; // number of points collided in a hash-table row |
236 Uns32T tablesPointCount; // count of points per hash table on load | |
205 | 237 |
206 Uns32T t1; // first hash table key | 238 Uns32T t1; // first hash table key |
207 Uns32T t2; // second hash table key | 239 Uns32T t2; // second hash table key |
208 Uns32T P; // hash table prime number | 240 Uns32T P; // hash table prime number |
241 | |
209 | 242 |
210 Uns32T N; // num rows per table | 243 Uns32T N; // num rows per table |
211 Uns32T C; // num collision per row | 244 Uns32T C; // num collision per row |
212 Uns32T k; // num projections per hash function | 245 Uns32T k; // num projections per hash function |
213 Uns32T m; // ~sqrt num hash tables | 246 Uns32T m; // ~sqrt num hash tables |
214 Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0 | 247 Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0 |
215 Uns32T d; // dimensions | 248 Uns32T d; // dimensions |
216 Uns32T p; // current point | 249 Uns32T p; // current point |
217 float w; // width of hash slots (relative to normalized feature space) | 250 float w; // width of hash slots (relative to normalized feature space) |
218 float radius;// scaling coefficient for data (1./radius) | 251 float radius;// scaling coefficient for data (1./radius) |
252 | |
253 MultiProbe* multiProbePtr; // Utility class for handling multi-probe queries | |
254 float ** boundaryDistances; // Array of query bucket-boundary-distances per hashtable | |
219 | 255 |
220 void initialize_data_structures(); | 256 void initialize_data_structures(); |
221 void initialize_lsh_functions(); | 257 void initialize_lsh_functions(); |
222 void initialize_partial_functions(); | 258 void initialize_partial_functions(); |
223 void __bucket_insert_point(bucket*); | 259 void __bucket_insert_point(bucket*); |
248 Uns32T bucket_insert_point(bucket**); | 284 Uns32T bucket_insert_point(bucket**); |
249 | 285 |
250 // Interface to hash functions | 286 // Interface to hash functions |
251 void compute_hash_functions(vector<float>& v); | 287 void compute_hash_functions(vector<float>& v); |
252 void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2); | 288 void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2); |
289 void generate_multiprobe_keys(Uns32T*g, Uns32T* r1, Uns32T* r2); | |
253 Uns32T get_t1(){return t1;} // hash-key t1 | 290 Uns32T get_t1(){return t1;} // hash-key t1 |
254 Uns32T get_t2(){return t2;} // hash-key t2 | 291 Uns32T get_t2(){return t2;} // hash-key t2 |
255 }; | 292 }; |
256 | 293 |
257 // Typedef for point-reporting callback function. Used to collect points during LSH retrieval | 294 // Typedef for point-reporting callback function. Used to collect points during LSH retrieval |
265 // LSH serial data structure file handling | 302 // LSH serial data structure file handling |
266 void get_lock(int fd, bool exclusive); | 303 void get_lock(int fd, bool exclusive); |
267 void release_lock(int fd); | 304 void release_lock(int fd); |
268 int serial_create(char* filename, Uns32T FMT); | 305 int serial_create(char* filename, Uns32T FMT); |
269 int serial_create(char* filename, float binWidth, Uns32T nTables, Uns32T nRows, Uns32T nCols, Uns32T k, Uns32T d, Uns32T FMT); | 306 int serial_create(char* filename, float binWidth, Uns32T nTables, Uns32T nRows, Uns32T nCols, Uns32T k, Uns32T d, Uns32T FMT); |
307 char* serial_mmap(int dbfid, Uns32T sz, Uns32T w, off_t offset = 0); | |
308 void serial_munmap(char* db, Uns32T N); | |
270 int serial_open(char* filename,int writeFlag); | 309 int serial_open(char* filename,int writeFlag); |
271 void serial_close(int dbfid); | 310 void serial_close(int dbfid); |
272 | 311 |
273 // Function to write hashfunctions to disk | 312 // Function to write hashfunctions to disk |
274 int serialize_lsh_hashfunctions(int fid); | 313 int serialize_lsh_hashfunctions(int fid); |
304 // Helper functions | 343 // Helper functions |
305 void serial_print_header(Uns32T requestedFormat); | 344 void serial_print_header(Uns32T requestedFormat); |
306 float* get_serial_hashfunction_base(char* db); | 345 float* get_serial_hashfunction_base(char* db); |
307 SerialElementT* get_serial_hashtable_base(char* db); | 346 SerialElementT* get_serial_hashtable_base(char* db); |
308 Uns32T get_serial_hashtable_offset(); // Size of SerialHeader + HashFunctions | 347 Uns32T get_serial_hashtable_offset(); // Size of SerialHeader + HashFunctions |
309 SerialHeaderT* serial_get_header(int fd); | 348 SerialHeaderT* serial_get_header(char* db); |
310 void serial_write_header(int fd, SerialHeaderT *header); | |
311 void serial_get_table(int, int, void *, size_t); | |
312 void serial_write_table(int, int, void *, size_t); | |
313 SerialHeaderT* lshHeader; | 349 SerialHeaderT* lshHeader; |
314 | 350 |
315 // Core Retrieval/Inspections Functions | 351 // Core Retrieval/Inspections Functions |
316 void bucket_chain_point(bucket* p, Uns32T qpos); | 352 void bucket_chain_point(bucket* p, Uns32T qpos); |
317 void sbucket_chain_point(sbucket* p, Uns32T qpos); | 353 void sbucket_chain_point(sbucket* p, Uns32T qpos); |
352 SerialHeaderT* get_lshHeader(){return lshHeader;} | 388 SerialHeaderT* get_lshHeader(){return lshHeader;} |
353 void serial_dump_tables(char* filename); | 389 void serial_dump_tables(char* filename); |
354 float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; } | 390 float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; } |
355 char* get_indexName(){return indexName;} | 391 char* get_indexName(){return indexName;} |
356 void dump_hashtables(); | 392 void dump_hashtables(); |
357 | 393 void dump_core_row(Uns32T n); |
394 void dump_disk_row(char*, Uns32T n); | |
358 }; | 395 }; |
359 | 396 |
360 typedef class G LSH; | 397 typedef class G LSH; |
361 | 398 |
362 | 399 |