annotate lshlib.h @ 601:82d23418d867

Fix some fd leaks in the command-line binary Strictly speaking, they're not really leaks, because the only codepath that suffers from these leaks exits immediately afterwards. On the other hand, this fix makes valgrind on e.g. tests/0025 happier, going from 5 errors to none.
author mas01cr
date Fri, 14 Aug 2009 16:39:32 +0000
parents 9119f2fa3efe
children 4b79043f90ba
rev   line source
mas01mc@292 1 // lshlib.h - a library for locality sensitive hashtable insertion and retrieval
mas01mc@292 2 //
mas01mc@292 3 // Author: Michael Casey
mas01mc@292 4 // Copyright (c) 2008 Michael Casey, All Rights Reserved
mas01mc@292 5
mas01mc@292 6 /* GNU GENERAL PUBLIC LICENSE
mas01mc@292 7 Version 2, June 1991
mas01mc@292 8 See LICENSE.txt
mas01mc@292 9 */
mas01mc@292 10
mas01mc@292 11 #ifndef __LSHLIB_H
mas01mc@292 12 #define __LSHLIB_H
mas01mc@292 13
mas01cr@589 14 using namespace std;
mas01mc@292 15
mas01mc@292 16 #define IntT int
mas01mc@292 17 #define LongUns64T long long unsigned
mas01mc@292 18 #define Uns32T unsigned
mas01mc@292 19 #define Int32T int
mas01mc@292 20 #define BooleanT int
mas01mc@292 21
mas01mc@292 22 // A big number (>> max # of points)
mas01mc@292 23 #define INDEX_START_EMPTY 1000000000U
mas01mc@292 24
mas01mc@292 25 // 4294967291 = 2^32-5
mas01mc@292 26 #define UH_PRIME_DEFAULT 4294967291U
mas01mc@292 27
mas01mc@292 28 // 2^29
mas01mc@292 29 #define MAX_HASH_RND 536870912U
mas01mc@292 30
mas01mc@292 31 // 2^32-1
mas01mc@292 32 #define TWO_TO_32_MINUS_1 4294967295U
mas01mc@292 33
mas01mc@292 34 #define O2_SERIAL_VERSION 1 // Sync with SVN version
mas01mc@292 35 #define O2_SERIAL_HEADER_SIZE sizeof(SerialHeaderT)
mas01mc@292 36 #define O2_SERIAL_ELEMENT_SIZE sizeof(SerialElementT)
mas01mc@292 37 #define O2_SERIAL_MAX_TABLES (200)
mas01mc@324 38 #define O2_SERIAL_MAX_ROWS (1000000000)
mas01mc@324 39 #define O2_SERIAL_MAX_COLS (1000000)
mas01mc@464 40 #define O2_SERIAL_MAX_DIM (20000)
mas01mc@292 41 #define O2_SERIAL_MAX_FUNS (100)
mas01mc@292 42 #define O2_SERIAL_MAX_BINWIDTH (200)
mas01mc@296 43 #define O2_SERIAL_MAXFILESIZE (4000000000UL)
mas01mc@292 44
mas01mc@292 45 // Flags for Serial Header
mas01mc@324 46 #define O2_SERIAL_FILEFORMAT1 (0x1U) // Optimize disk format for on-disk search
mas01mc@324 47 #define O2_SERIAL_FILEFORMAT2 (0x2U) // Optimize disk format for in-core search
mas01mc@324 48 #define O2_SERIAL_COREFORMAT1 (0x4U)
mas01mc@324 49 #define O2_SERIAL_COREFORMAT2 (0x8U)
mas01mc@292 50
mas01mc@292 51 // Flags for serialization fileformat2: use high 3 bits of Uns32T
mas01mc@324 52 #define O2_SERIAL_TOKEN_T1 (0xFFFFFFFCU)
mas01mc@306 53 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU)
mas01mc@306 54 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU)
mas01mc@292 55
mas01mc@324 56 #define O2_INDEX_MAXSTR (256)
mas01mc@309 57
mas01cr@589 58 #define align_up(x,w) (((x) + ((1<<w)-1)) & ~((1<<w)-1))
mas01mc@292 59
mas01mc@292 60 #define O2_SERIAL_FUNCTIONS_SIZE (align_up(sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * O2_SERIAL_MAX_DIM \
mas01mc@292 61 + sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS + \
mas01mc@292 62 + sizeof(Uns32T) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * 2 \
mas01mc@292 63 + O2_SERIAL_HEADER_SIZE,get_page_logn()))
mas01mc@292 64
mas01mc@292 65 #define O2_SERIAL_MAX_LSH_SIZE (O2_SERIAL_ELEMENT_SIZE * O2_SERIAL_MAX_TABLES \
mas01mc@292 66 * O2_SERIAL_MAX_ROWS * O2_SERIAL_MAX_COLS + O2_SERIAL_FUNCTIONS_SIZE)
mas01mc@292 67
mas01mc@292 68 #define O2_SERIAL_MAGIC ('o'|'2'<<8|'l'<<16|'s'<<24)
mas01mc@292 69
mas01mc@340 70 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\
mas01mc@340 71 fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);}
mas01mc@340 72
mas01mc@514 73 //#define LSH_DUMP_CORE_TABLES // set to dump hashtables on load
mas01mc@514 74 //#define USE_U_FUNCTIONS // set to use partial hashfunction re-use
mas01mc@514 75
mas01mc@514 76 // Backward-compatible CORE ARRAY lsh index
mas01mc@514 77 #define LSH_CORE_ARRAY // Set to use arrays for hashtables rather than linked-lists
mas01mc@514 78 #define LSH_LIST_HEAD_COUNTERS // Enable counters in hashtable list heads
mas01mc@514 79
mas01mc@514 80 // Critical path logic
mas01mc@514 81 #if defined LSH_CORE_ARRAY && !defined LSH_LIST_HEAD_COUNTERS
mas01mc@514 82 #define LSH_LIST_HEAD_COUNTERS
mas01mc@514 83 #endif
mas01mc@514 84
mas01mc@514 85 #define LSH_CORE_ARRAY_BIT (0x80000000) // LSH_CORE_ARRAY test bit for list head
mas01mc@514 86
mas01mc@292 87 Uns32T get_page_logn();
mas01mc@292 88
mas01mc@292 89 // Disk table entry
mas01mc@292 90 typedef class SerialElement SerialElementT;
mas01mc@292 91 class SerialElement {
mas01mc@292 92 public:
mas01mc@292 93 Uns32T hashValue;
mas01mc@292 94 Uns32T pointID;
mas01mc@292 95
mas01mc@292 96 SerialElement(Uns32T h, Uns32T pID):
mas01mc@292 97 hashValue(h),
mas01mc@292 98 pointID(pID){}
mas01mc@292 99 };
mas01mc@292 100
mas01mc@292 101 // Disk header
mas01mc@292 102 typedef class SerialHeader SerialHeaderT;
mas01mc@292 103 class SerialHeader {
mas01mc@292 104 public:
mas01mc@292 105 Uns32T lshMagic; // unique identifier for file header
mas01mc@292 106 float binWidth; // hash-function bin width
mas01mc@292 107 Uns32T numTables; // number of hash tables in file
mas01mc@292 108 Uns32T numRows; // size of each hash table
mas01mc@292 109 Uns32T numCols; // max collisions in each hash table
mas01mc@292 110 Uns32T elementSize; // size of a hash bucket
mas01mc@292 111 Uns32T version; // version number of file format
mas01mc@292 112 Uns32T size; // total size of database (bytes)
mas01mc@292 113 Uns32T flags; // 32 bits of useful information
mas01mc@292 114 Uns32T dataDim; // vector dimensionality
mas01mc@292 115 Uns32T numFuns; // number of independent hash functions
mas01mc@292 116 float radius; // 32-bit floating point radius
mas01mc@292 117 Uns32T maxp; // number of unique IDs in the database
mas01mc@296 118 unsigned long long size_long; // long version of size
mas01mc@296 119 Uns32T pointCount; // number of points in the database
mas01mc@292 120
mas01mc@292 121 SerialHeader();
mas01mc@296 122 SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float radius, Uns32T p, Uns32T FMT, Uns32T pointCount);
mas01mc@292 123
mas01mc@292 124 float get_binWidth(){return binWidth;}
mas01mc@292 125 Uns32T get_numTables(){return numTables;}
mas01mc@292 126 Uns32T get_numRows(){return numRows;}
mas01mc@292 127 Uns32T get_numCols(){return numCols;}
mas01mc@292 128 Uns32T get_elementSize(){return elementSize;}
mas01mc@292 129 Uns32T get_version(){return version;}
mas01mc@292 130 Uns32T get_flags(){return flags;}
mas01mc@296 131 unsigned long long get_size(){return size_long;}
mas01mc@292 132 Uns32T get_dataDim(){return dataDim;}
mas01mc@292 133 Uns32T get_numFuns(){return numFuns;}
mas01mc@292 134 Uns32T get_maxp(){return maxp;}
mas01mc@296 135 Uns32T get_pointCount(){return pointCount;}
mas01mc@292 136 };
mas01mc@292 137
mas01mc@292 138 #define IFLAG 0xFFFFFFFF
mas01mc@292 139
mas01mc@292 140 // Point-set collision bucket (sbucket).
mas01mc@292 141 // sbuckets form a collision chain that identifies PointIDs falling in the same locale.
mas01mc@292 142 // sbuckets are chained from a bucket containing the collision list's t2 identifier
mas01mc@292 143 class sbucket {
mas01mc@292 144 friend class bucket;
mas01mc@292 145 friend class H;
mas01mc@292 146 friend class G;
mas01mc@292 147
mas01mc@292 148 public:
mas01mc@292 149 class sbucket* snext;
mas01mc@292 150 unsigned int pointID;
mas01mc@292 151
mas01mc@292 152 sbucket(){
mas01mc@292 153 snext=0;
mas01mc@292 154 pointID=IFLAG;
mas01mc@292 155 }
mas01mc@292 156 ~sbucket(){delete snext;}
mas01mc@292 157 sbucket* get_snext(){return snext;}
mas01mc@292 158 };
mas01mc@292 159
mas01mc@292 160 // bucket structure for a linked list of locales that collide with the same hash value t1
mas01mc@292 161 // different buckets represent different locales, collisions within a locale are chained
mas01mc@292 162 // in sbuckets
mas01mc@292 163 class bucket {
mas01mc@292 164 friend class H;
mas01mc@292 165 friend class G;
mas01mc@292 166 bucket* next;
mas01mc@344 167 union {
mas01mc@344 168 sbucket* ptr;
mas01mc@344 169 Uns32T numBuckets;
mas01mc@344 170 } snext;
mas01mc@292 171 public:
mas01mc@292 172 unsigned int t2;
mas01mc@292 173 bucket(){
mas01mc@292 174 next=0;
mas01mc@344 175 snext.ptr=0;
mas01mc@292 176 t2=IFLAG;
mas01mc@292 177 }
mas01mc@344 178 ~bucket(){delete next;delete snext.ptr;}
mas01mc@292 179 bucket* get_next(){return next;}
mas01mc@292 180 };
mas01mc@292 181
mas01mc@292 182
mas01mc@292 183 // The hash_functions for locality-sensitive hashing
mas01mc@292 184 class H{
mas01mc@292 185 friend class G;
mas01mc@292 186 private:
mas01mc@293 187
mas01mc@293 188 float *** A; // m x k x d random projectors from R^d->R^k
mas01mc@293 189 float ** b; // m x k uniform additive constants
mas01mc@293 190
mas01mc@293 191 Uns32T ** g; // L x k random hash projections \in Z^k
mas01mc@292 192 Uns32T** r1; // random ints for hashing
mas01mc@292 193 Uns32T** r2; // random ints for hashing
mas01mc@293 194
mas01mc@293 195 bucket*** h; // The LSH hash tables
mas01mc@293 196
mas01mc@293 197 bool use_u_functions; // flag to optimize computation of hashes
mas01mc@514 198 #ifdef USE_U_FUNCTIONS
mas01mc@293 199 vector<vector<Uns32T> > uu; // Storage for m patial hash evaluations ( g_j = [u_a,u_b] )
mas01mc@514 200 #endif
mas01mc@293 201 Uns32T maxp; // highest pointID stored in database
mas01mc@293 202 Uns32T bucketCount; // count of number of point buckets allocated
mas01mc@293 203 Uns32T pointCount; // count of number of points inserted
mas01mc@296 204 Uns32T collisionCount; // number of points collided in a hash-table row
mas01mc@293 205
mas01mc@292 206 Uns32T t1; // first hash table key
mas01mc@292 207 Uns32T t2; // second hash table key
mas01mc@292 208 Uns32T P; // hash table prime number
mas01mc@292 209
mas01mc@292 210 Uns32T N; // num rows per table
mas01mc@292 211 Uns32T C; // num collision per row
mas01mc@292 212 Uns32T k; // num projections per hash function
mas01mc@292 213 Uns32T m; // ~sqrt num hash tables
mas01mc@292 214 Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0
mas01mc@292 215 Uns32T d; // dimensions
mas01mc@292 216 Uns32T p; // current point
mas01mc@293 217 float w; // width of hash slots (relative to normalized feature space)
mas01mc@293 218 float radius;// scaling coefficient for data (1./radius)
mas01mc@292 219
mas01mc@293 220 void initialize_data_structures();
mas01mc@293 221 void initialize_lsh_functions();
mas01mc@293 222 void initialize_partial_functions();
mas01mc@293 223 void __bucket_insert_point(bucket*);
mas01mc@293 224 void __sbucket_insert_point(sbucket*);
mas01mc@340 225 bucket** get_pointer_to_bucket_linked_list(bucket* rowPtr);
mas01mc@293 226 Uns32T computeProductModDefaultPrime(Uns32T*,Uns32T*,IntT);
mas01mc@293 227 Uns32T randr();
mas01mc@293 228 float randn();
mas01mc@293 229 float ranf();
mas01mc@293 230 bucket** get_bucket(int j);
mas01mc@293 231 void error(const char* a, const char* b = "", const char *sysFunc = 0);
mas01mc@293 232
mas01mc@293 233 public:
mas01mc@293 234
mas01mc@293 235 H();
mas01mc@293 236 H(Uns32T k, Uns32T m, Uns32T d, Uns32T N, Uns32T C, float w, float r);
mas01mc@474 237 virtual ~H();
mas01mc@292 238
mas01mc@293 239 float get_w(){return w;}
mas01mc@293 240 float get_radius(){return radius;}
mas01mc@293 241 Uns32T get_numRows(){return N;}
mas01mc@293 242 Uns32T get_numCols(){return C;}
mas01mc@293 243 Uns32T get_numFuns(){return k;}
mas01mc@293 244 Uns32T get_numTables(){return L;}
mas01mc@293 245 Uns32T get_dataDim(){return d;}
mas01mc@293 246 Uns32T get_maxp(){return maxp;}
mas01mc@292 247
mas01mc@292 248 Uns32T bucket_insert_point(bucket**);
mas01mc@292 249
mas01mc@293 250 // Interface to hash functions
mas01mc@293 251 void compute_hash_functions(vector<float>& v);
mas01mc@293 252 void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2);
mas01mc@293 253 Uns32T get_t1(){return t1;} // hash-key t1
mas01mc@293 254 Uns32T get_t2(){return t2;} // hash-key t2
mas01mc@292 255 };
mas01mc@292 256
mas01mc@293 257 // Typedef for point-reporting callback function. Used to collect points during LSH retrieval
mas01mc@292 258 typedef void (*ReporterCallbackPtr)(void* objPtr, Uns32T pointID, Uns32T queryIndex, float squaredDistance);
mas01mc@292 259
mas01mc@292 260 // Interface for indexing and retrieval
mas01mc@292 261 class G: public H{
mas01mc@292 262 private:
mas01mc@308 263 char* indexName;
mas01mc@308 264
mas01mc@293 265 // LSH serial data structure file handling
mas01mc@292 266 void get_lock(int fd, bool exclusive);
mas01mc@292 267 void release_lock(int fd);
mas01mc@292 268 int serial_create(char* filename, Uns32T FMT);
mas01mc@292 269 int serial_create(char* filename, float binWidth, Uns32T nTables, Uns32T nRows, Uns32T nCols, Uns32T k, Uns32T d, Uns32T FMT);
mas01mc@292 270 char* serial_mmap(int dbfid, Uns32T sz, Uns32T w, off_t offset = 0);
mas01mc@292 271 void serial_munmap(char* db, Uns32T N);
mas01mc@292 272 int serial_open(char* filename,int writeFlag);
mas01mc@292 273 void serial_close(int dbfid);
mas01mc@292 274
mas01mc@292 275 // Function to write hashfunctions to disk
mas01mc@292 276 int serialize_lsh_hashfunctions(int fid);
mas01mc@292 277
mas01mc@292 278 // Functions to write hashtables to disk in format1 (optimized for on-disk retrieval)
mas01mc@292 279 int serialize_lsh_hashtables_format1(int fid, int merge);
mas01mc@292 280 void serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* h, Uns32T& colCount);
mas01mc@292 281 void serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount);
mas01mc@292 282 void serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* h, Uns32T& colCount);
mas01mc@292 283 void serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount);
mas01mc@292 284 int serial_can_merge(Uns32T requestedFormat); // Test to see whether core and on-disk structures are compatible
mas01mc@292 285
mas01mc@292 286 // Functions to write hashtables to disk in format2 (optimized for in-core retrieval)
mas01mc@336 287 int serialize_lsh_hashtables_format2(FILE* dbFile, int merge);
mas01mc@336 288 void serial_write_hashtable_row_format2(FILE* dbFile, bucket* h, Uns32T& colCount);
mas01mc@336 289 void serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount);
mas01mc@340 290 Uns32T count_buckets_and_points_hashtable_row(bucket* bPtr);
mas01mc@340 291 Uns32T count_points_hashtable_row(bucket* bPtr);
mas01mc@292 292
mas01mc@292 293 // Functions to read serial header and hash functions (format1 and format2)
mas01mc@292 294 int unserialize_lsh_header(char* filename); // read lsh header from disk into core
mas01mc@292 295 void unserialize_lsh_functions(int fid); // read the lsh hash functions into core
mas01mc@292 296
mas01mc@292 297 // Functions to read hashtables in format1
mas01mc@292 298 void unserialize_lsh_hashtables_format1(int fid); // read FORMAT1 hash tables into core (disk format)
mas01mc@292 299 void unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b); // read lsh hash table row into core
mas01mc@292 300
mas01mc@292 301 // Functions to read hashtables in format2
mas01mc@340 302 void unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge = 0);
mas01mc@340 303 Uns32T unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token=0); // to dynamic linked list
mas01mc@340 304 Uns32T unserialize_hashtable_row_to_array(FILE* dbFile, bucket** b, Uns32T numElements); // to core array
mas01mc@292 305
mas01mc@292 306 // Helper functions
mas01mc@292 307 void serial_print_header(Uns32T requestedFormat);
mas01mc@292 308 float* get_serial_hashfunction_base(char* db);
mas01mc@292 309 SerialElementT* get_serial_hashtable_base(char* db);
mas01mc@292 310 Uns32T get_serial_hashtable_offset(); // Size of SerialHeader + HashFunctions
mas01mc@292 311 SerialHeaderT* serial_get_header(char* db);
mas01mc@292 312 SerialHeaderT* lshHeader;
mas01mc@292 313
mas01mc@292 314 // Core Retrieval/Inspections Functions
mas01mc@292 315 void bucket_chain_point(bucket* p, Uns32T qpos);
mas01mc@292 316 void sbucket_chain_point(sbucket* p, Uns32T qpos);
mas01mc@292 317 void dump_hashtable_row(bucket* p);
mas01mc@513 318 void dump_core_hashtable_array(Uns32T* p);
mas01mc@292 319
mas01mc@292 320 // Serial (Format 1) Retrieval/Inspection Functions
mas01mc@292 321 void serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos);
mas01mc@292 322 void serial_bucket_dump(SerialElementT* pe);
mas01mc@292 323
mas01mc@340 324 // Core ARRAY Retrieval Functions
mas01mc@340 325 void retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos);
mas01mc@340 326
mas01mc@340 327
mas01mc@293 328 // Callback Function for point reporting
mas01mc@293 329 void* calling_instance; // store calling object instance for member-function callback
mas01mc@324 330 ReporterCallbackPtr add_point_callback; // Pointer to the callback function
mas01mc@292 331
mas01mc@292 332 public:
mas01mc@292 333 G(char* lshFile, bool lshInCore = false); // unserialize constructor
mas01mc@292 334 G(float w, Uns32T k,Uns32T m, Uns32T d, Uns32T N, Uns32T C, float r); // core constructor
mas01mc@474 335 virtual ~G();
mas01mc@292 336
mas01mc@292 337 Uns32T insert_point(vector<float>&, Uns32T pointID);
mas01mc@292 338 void insert_point_set(vector<vector<float> >& vv, Uns32T basePointID);
mas01mc@292 339
mas01mc@292 340 // point retrieval from core
mas01mc@292 341 void retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 342 // point set retrieval from core
mas01mc@292 343 void retrieve_point_set(vector<vector<float> >& vv, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 344 // serial point set retrieval
mas01mc@292 345 void serial_retrieve_point_set(char* filename, vector<vector<float> >& vv, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 346 // serial point retrieval
mas01mc@292 347 void serial_retrieve_point(char* filename, vector<float>& vv, Uns32T qpos, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 348
mas01mc@292 349 void serialize(char* filename, Uns32T serialFormat = O2_SERIAL_FILEFORMAT1); // write hashfunctions and hashtables to disk
mas01mc@292 350
mas01mc@292 351 SerialHeaderT* get_lshHeader(){return lshHeader;}
mas01mc@292 352 void serial_dump_tables(char* filename);
mas01mc@292 353 float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; }
mas01mc@308 354 char* get_indexName(){return indexName;}
mas01mc@513 355 void dump_hashtables();
mas01mc@513 356
mas01mc@292 357 };
mas01mc@292 358
mas01mc@292 359 typedef class G LSH;
mas01mc@292 360
mas01mc@292 361
mas01mc@292 362
mas01mc@292 363 #endif