annotate lshlib.h @ 369:6564be3109c5 gcc-4.3-cleanups

gcc-4.3 warning cleanups for lshlib.cpp (I do not believe that any of these changes contain significant copyrightable "intellectual property". However, to the extent that they do, the changes are hereby released into the Public Domain, and may be therefore be used by anyone for any purpose without need for consideration of any kind.)
author mas01cr
date Wed, 12 Nov 2008 15:23:32 +0000
parents 223a5994a962
children f283448a40db
rev   line source
mas01mc@292 1 // lshlib.h - a library for locality sensitive hashtable insertion and retrieval
mas01mc@292 2 //
mas01mc@292 3 // Author: Michael Casey
mas01mc@292 4 // Copyright (c) 2008 Michael Casey, All Rights Reserved
mas01mc@292 5
mas01mc@292 6 /* GNU GENERAL PUBLIC LICENSE
mas01mc@292 7 Version 2, June 1991
mas01mc@292 8 See LICENSE.txt
mas01mc@292 9 */
mas01mc@292 10
mas01mc@292 11 #ifndef __LSHLIB_H
mas01mc@292 12 #define __LSHLIB_H
mas01mc@292 13
mas01mc@292 14 #include <vector>
mas01mc@292 15 #include <queue>
mas01mc@292 16 #include <stdio.h>
mas01mc@292 17 #include <stdlib.h>
mas01mc@292 18 #include <sys/types.h>
mas01mc@292 19 #include <sys/stat.h>
mas01mc@292 20 #include <sys/mman.h>
mas01mc@292 21 #include <fcntl.h>
mas01mc@292 22 #include <string.h>
mas01mc@292 23 #include <iostream>
mas01mc@292 24 #include <fstream>
mas01mc@292 25 #include <math.h>
mas01mc@292 26 #include <sys/time.h>
mas01mc@292 27 #include <assert.h>
mas01mc@292 28 #include <float.h>
mas01mc@292 29 #include <signal.h>
mas01mc@292 30 #include <time.h>
mas01mc@292 31 #include <limits.h>
mas01mc@292 32 #include <errno.h>
mas01mc@292 33 #ifdef MT19937
mas01mc@292 34 #include "mt19937/mt19937ar.h"
mas01mc@292 35 #endif
mas01mc@292 36
mas01mc@292 37 #define IntT int
mas01mc@292 38 #define LongUns64T long long unsigned
mas01mc@292 39 #define Uns32T unsigned
mas01mc@292 40 #define Int32T int
mas01mc@292 41 #define BooleanT int
mas01mc@292 42 #define TRUE 1
mas01mc@292 43 #define FALSE 0
mas01mc@292 44
mas01mc@292 45 // A big number (>> max # of points)
mas01mc@292 46 #define INDEX_START_EMPTY 1000000000U
mas01mc@292 47
mas01mc@292 48 // 4294967291 = 2^32-5
mas01mc@292 49 #define UH_PRIME_DEFAULT 4294967291U
mas01mc@292 50
mas01mc@292 51 // 2^29
mas01mc@292 52 #define MAX_HASH_RND 536870912U
mas01mc@292 53
mas01mc@292 54 // 2^32-1
mas01mc@292 55 #define TWO_TO_32_MINUS_1 4294967295U
mas01mc@292 56
mas01mc@292 57 #define O2_SERIAL_VERSION 1 // Sync with SVN version
mas01mc@292 58 #define O2_SERIAL_HEADER_SIZE sizeof(SerialHeaderT)
mas01mc@292 59 #define O2_SERIAL_ELEMENT_SIZE sizeof(SerialElementT)
mas01mc@292 60 #define O2_SERIAL_MAX_TABLES (200)
mas01mc@324 61 #define O2_SERIAL_MAX_ROWS (1000000000)
mas01mc@324 62 #define O2_SERIAL_MAX_COLS (1000000)
mas01mc@292 63 #define O2_SERIAL_MAX_DIM (2000)
mas01mc@292 64 #define O2_SERIAL_MAX_FUNS (100)
mas01mc@292 65 #define O2_SERIAL_MAX_BINWIDTH (200)
mas01mc@296 66 #define O2_SERIAL_MAXFILESIZE (4000000000UL)
mas01mc@292 67
mas01mc@292 68 // Flags for Serial Header
mas01mc@324 69 #define O2_SERIAL_FILEFORMAT1 (0x1U) // Optimize disk format for on-disk search
mas01mc@324 70 #define O2_SERIAL_FILEFORMAT2 (0x2U) // Optimize disk format for in-core search
mas01mc@324 71 #define O2_SERIAL_COREFORMAT1 (0x4U)
mas01mc@324 72 #define O2_SERIAL_COREFORMAT2 (0x8U)
mas01mc@292 73
mas01mc@292 74 // Flags for serialization fileformat2: use high 3 bits of Uns32T
mas01mc@324 75 #define O2_SERIAL_TOKEN_T1 (0xFFFFFFFCU)
mas01mc@306 76 #define O2_SERIAL_TOKEN_T2 (0xFFFFFFFDU)
mas01mc@306 77 #define O2_SERIAL_TOKEN_ENDTABLE (0xFFFFFFFEU)
mas01mc@292 78
mas01mc@324 79 #define O2_INDEX_MAXSTR (256)
mas01mc@309 80
mas01mc@292 81 unsigned align_up(unsigned x, unsigned w);
mas01mc@292 82
mas01mc@292 83 #define O2_SERIAL_FUNCTIONS_SIZE (align_up(sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * O2_SERIAL_MAX_DIM \
mas01mc@292 84 + sizeof(float) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS + \
mas01mc@292 85 + sizeof(Uns32T) * O2_SERIAL_MAX_TABLES * O2_SERIAL_MAX_FUNS * 2 \
mas01mc@292 86 + O2_SERIAL_HEADER_SIZE,get_page_logn()))
mas01mc@292 87
mas01mc@292 88 #define O2_SERIAL_MAX_LSH_SIZE (O2_SERIAL_ELEMENT_SIZE * O2_SERIAL_MAX_TABLES \
mas01mc@292 89 * O2_SERIAL_MAX_ROWS * O2_SERIAL_MAX_COLS + O2_SERIAL_FUNCTIONS_SIZE)
mas01mc@292 90
mas01mc@292 91 #define O2_SERIAL_MAGIC ('o'|'2'<<8|'l'<<16|'s'<<24)
mas01mc@292 92
mas01mc@340 93 #define WRITE_UNS32(VAL, TOKENSTR) if( fwrite(VAL, sizeof(Uns32T), 1, dbFile) != 1 ){\
mas01mc@340 94 fclose(dbFile);error("write error in serial_write_format2",TOKENSTR);}
mas01mc@340 95
mas01mc@292 96 using namespace std;
mas01mc@292 97
mas01mc@292 98 Uns32T get_page_logn();
mas01mc@292 99
mas01mc@292 100 // Disk table entry
mas01mc@292 101 typedef class SerialElement SerialElementT;
mas01mc@292 102 class SerialElement {
mas01mc@292 103 public:
mas01mc@292 104 Uns32T hashValue;
mas01mc@292 105 Uns32T pointID;
mas01mc@292 106
mas01mc@292 107 SerialElement(Uns32T h, Uns32T pID):
mas01mc@292 108 hashValue(h),
mas01mc@292 109 pointID(pID){}
mas01mc@292 110 };
mas01mc@292 111
mas01mc@292 112 // Disk header
mas01mc@292 113 typedef class SerialHeader SerialHeaderT;
mas01mc@292 114 class SerialHeader {
mas01mc@292 115 public:
mas01mc@292 116 Uns32T lshMagic; // unique identifier for file header
mas01mc@292 117 float binWidth; // hash-function bin width
mas01mc@292 118 Uns32T numTables; // number of hash tables in file
mas01mc@292 119 Uns32T numRows; // size of each hash table
mas01mc@292 120 Uns32T numCols; // max collisions in each hash table
mas01mc@292 121 Uns32T elementSize; // size of a hash bucket
mas01mc@292 122 Uns32T version; // version number of file format
mas01mc@292 123 Uns32T size; // total size of database (bytes)
mas01mc@292 124 Uns32T flags; // 32 bits of useful information
mas01mc@292 125 Uns32T dataDim; // vector dimensionality
mas01mc@292 126 Uns32T numFuns; // number of independent hash functions
mas01mc@292 127 float radius; // 32-bit floating point radius
mas01mc@292 128 Uns32T maxp; // number of unique IDs in the database
mas01mc@296 129 unsigned long long size_long; // long version of size
mas01mc@296 130 Uns32T pointCount; // number of points in the database
mas01mc@292 131
mas01mc@292 132 SerialHeader();
mas01mc@296 133 SerialHeader(float W, Uns32T L, Uns32T N, Uns32T C, Uns32T k, Uns32T d, float radius, Uns32T p, Uns32T FMT, Uns32T pointCount);
mas01mc@292 134
mas01mc@292 135 float get_binWidth(){return binWidth;}
mas01mc@292 136 Uns32T get_numTables(){return numTables;}
mas01mc@292 137 Uns32T get_numRows(){return numRows;}
mas01mc@292 138 Uns32T get_numCols(){return numCols;}
mas01mc@292 139 Uns32T get_elementSize(){return elementSize;}
mas01mc@292 140 Uns32T get_version(){return version;}
mas01mc@292 141 Uns32T get_flags(){return flags;}
mas01mc@296 142 unsigned long long get_size(){return size_long;}
mas01mc@292 143 Uns32T get_dataDim(){return dataDim;}
mas01mc@292 144 Uns32T get_numFuns(){return numFuns;}
mas01mc@292 145 Uns32T get_maxp(){return maxp;}
mas01mc@296 146 Uns32T get_pointCount(){return pointCount;}
mas01mc@292 147 };
mas01mc@292 148
mas01mc@292 149 #define IFLAG 0xFFFFFFFF
mas01mc@292 150
mas01mc@292 151 // Point-set collision bucket (sbucket).
mas01mc@292 152 // sbuckets form a collision chain that identifies PointIDs falling in the same locale.
mas01mc@292 153 // sbuckets are chained from a bucket containing the collision list's t2 identifier
mas01mc@292 154 class sbucket {
mas01mc@292 155 friend class bucket;
mas01mc@292 156 friend class H;
mas01mc@292 157 friend class G;
mas01mc@292 158
mas01mc@292 159 public:
mas01mc@292 160 class sbucket* snext;
mas01mc@292 161 unsigned int pointID;
mas01mc@292 162
mas01mc@292 163 sbucket(){
mas01mc@292 164 snext=0;
mas01mc@292 165 pointID=IFLAG;
mas01mc@292 166 }
mas01mc@292 167 ~sbucket(){delete snext;}
mas01mc@292 168 sbucket* get_snext(){return snext;}
mas01mc@292 169 };
mas01mc@292 170
mas01mc@292 171 // bucket structure for a linked list of locales that collide with the same hash value t1
mas01mc@292 172 // different buckets represent different locales, collisions within a locale are chained
mas01mc@292 173 // in sbuckets
mas01mc@292 174 class bucket {
mas01mc@292 175 friend class H;
mas01mc@292 176 friend class G;
mas01mc@292 177 bucket* next;
mas01mc@344 178 union {
mas01mc@344 179 sbucket* ptr;
mas01mc@344 180 Uns32T numBuckets;
mas01mc@344 181 } snext;
mas01mc@292 182 public:
mas01mc@292 183 unsigned int t2;
mas01mc@292 184 bucket(){
mas01mc@292 185 next=0;
mas01mc@344 186 snext.ptr=0;
mas01mc@292 187 t2=IFLAG;
mas01mc@292 188 }
mas01mc@344 189 ~bucket(){delete next;delete snext.ptr;}
mas01mc@292 190 bucket* get_next(){return next;}
mas01mc@292 191 };
mas01mc@292 192
mas01mc@292 193
mas01mc@292 194 // The hash_functions for locality-sensitive hashing
mas01mc@292 195 class H{
mas01mc@292 196 friend class G;
mas01mc@292 197 private:
mas01mc@293 198
mas01mc@293 199 float *** A; // m x k x d random projectors from R^d->R^k
mas01mc@293 200 float ** b; // m x k uniform additive constants
mas01mc@293 201
mas01mc@293 202 Uns32T ** g; // L x k random hash projections \in Z^k
mas01mc@292 203 Uns32T** r1; // random ints for hashing
mas01mc@292 204 Uns32T** r2; // random ints for hashing
mas01mc@293 205
mas01mc@293 206 bucket*** h; // The LSH hash tables
mas01mc@293 207
mas01mc@293 208 bool use_u_functions; // flag to optimize computation of hashes
mas01mc@293 209 vector<vector<Uns32T> > uu; // Storage for m patial hash evaluations ( g_j = [u_a,u_b] )
mas01mc@293 210
mas01mc@293 211 Uns32T maxp; // highest pointID stored in database
mas01mc@293 212 Uns32T bucketCount; // count of number of point buckets allocated
mas01mc@293 213 Uns32T pointCount; // count of number of points inserted
mas01mc@296 214 Uns32T collisionCount; // number of points collided in a hash-table row
mas01mc@293 215
mas01mc@292 216 Uns32T t1; // first hash table key
mas01mc@292 217 Uns32T t2; // second hash table key
mas01mc@292 218 Uns32T P; // hash table prime number
mas01mc@292 219
mas01mc@292 220 Uns32T N; // num rows per table
mas01mc@292 221 Uns32T C; // num collision per row
mas01mc@292 222 Uns32T k; // num projections per hash function
mas01mc@292 223 Uns32T m; // ~sqrt num hash tables
mas01mc@292 224 Uns32T L; // L = m*(m-1)/2, conversely, m = (1 + sqrt(1 + 8.0*L)) / 2.0
mas01mc@292 225 Uns32T d; // dimensions
mas01mc@292 226 Uns32T p; // current point
mas01mc@293 227 float w; // width of hash slots (relative to normalized feature space)
mas01mc@293 228 float radius;// scaling coefficient for data (1./radius)
mas01mc@292 229
mas01mc@293 230 void initialize_data_structures();
mas01mc@293 231 void initialize_lsh_functions();
mas01mc@293 232 void initialize_partial_functions();
mas01mc@293 233 void __bucket_insert_point(bucket*);
mas01mc@293 234 void __sbucket_insert_point(sbucket*);
mas01mc@340 235 bucket** get_pointer_to_bucket_linked_list(bucket* rowPtr);
mas01mc@293 236 Uns32T computeProductModDefaultPrime(Uns32T*,Uns32T*,IntT);
mas01mc@293 237 Uns32T randr();
mas01mc@293 238 float randn();
mas01mc@293 239 float ranf();
mas01mc@293 240 bucket** get_bucket(int j);
mas01mc@293 241 void error(const char* a, const char* b = "", const char *sysFunc = 0);
mas01mc@293 242
mas01mc@293 243 public:
mas01mc@293 244
mas01mc@293 245 H();
mas01mc@293 246 H(Uns32T k, Uns32T m, Uns32T d, Uns32T N, Uns32T C, float w, float r);
mas01mc@292 247 ~H();
mas01mc@292 248
mas01mc@293 249 float get_w(){return w;}
mas01mc@293 250 float get_radius(){return radius;}
mas01mc@293 251 Uns32T get_numRows(){return N;}
mas01mc@293 252 Uns32T get_numCols(){return C;}
mas01mc@293 253 Uns32T get_numFuns(){return k;}
mas01mc@293 254 Uns32T get_numTables(){return L;}
mas01mc@293 255 Uns32T get_dataDim(){return d;}
mas01mc@293 256 Uns32T get_maxp(){return maxp;}
mas01mc@292 257
mas01mc@292 258 Uns32T bucket_insert_point(bucket**);
mas01mc@292 259
mas01mc@293 260 // Interface to hash functions
mas01mc@293 261 void compute_hash_functions(vector<float>& v);
mas01mc@293 262 void generate_hash_keys(Uns32T* g, Uns32T* r1, Uns32T* r2);
mas01mc@293 263 Uns32T get_t1(){return t1;} // hash-key t1
mas01mc@293 264 Uns32T get_t2(){return t2;} // hash-key t2
mas01mc@292 265 };
mas01mc@292 266
mas01mc@293 267 // Typedef for point-reporting callback function. Used to collect points during LSH retrieval
mas01mc@292 268 typedef void (*ReporterCallbackPtr)(void* objPtr, Uns32T pointID, Uns32T queryIndex, float squaredDistance);
mas01mc@292 269
mas01mc@292 270 // Interface for indexing and retrieval
mas01mc@292 271 class G: public H{
mas01mc@292 272 private:
mas01mc@308 273 char* indexName;
mas01mc@308 274
mas01mc@293 275 // LSH serial data structure file handling
mas01mc@292 276 void get_lock(int fd, bool exclusive);
mas01mc@292 277 void release_lock(int fd);
mas01mc@292 278 int serial_create(char* filename, Uns32T FMT);
mas01mc@292 279 int serial_create(char* filename, float binWidth, Uns32T nTables, Uns32T nRows, Uns32T nCols, Uns32T k, Uns32T d, Uns32T FMT);
mas01mc@292 280 char* serial_mmap(int dbfid, Uns32T sz, Uns32T w, off_t offset = 0);
mas01mc@292 281 void serial_munmap(char* db, Uns32T N);
mas01mc@292 282 int serial_open(char* filename,int writeFlag);
mas01mc@292 283 void serial_close(int dbfid);
mas01mc@292 284
mas01mc@292 285 // Function to write hashfunctions to disk
mas01mc@292 286 int serialize_lsh_hashfunctions(int fid);
mas01mc@292 287
mas01mc@292 288 // Functions to write hashtables to disk in format1 (optimized for on-disk retrieval)
mas01mc@292 289 int serialize_lsh_hashtables_format1(int fid, int merge);
mas01mc@292 290 void serial_write_hashtable_row_format1(SerialElementT*& pe, bucket* h, Uns32T& colCount);
mas01mc@292 291 void serial_write_element_format1(SerialElementT*& pe, sbucket* sb, Uns32T t2, Uns32T& colCount);
mas01mc@292 292 void serial_merge_hashtable_row_format1(SerialElementT* pr, bucket* h, Uns32T& colCount);
mas01mc@292 293 void serial_merge_element_format1(SerialElementT* pe, sbucket* sb, Uns32T t2, Uns32T& colCount);
mas01mc@292 294 int serial_can_merge(Uns32T requestedFormat); // Test to see whether core and on-disk structures are compatible
mas01mc@292 295
mas01mc@292 296 // Functions to write hashtables to disk in format2 (optimized for in-core retrieval)
mas01mc@336 297 int serialize_lsh_hashtables_format2(FILE* dbFile, int merge);
mas01mc@336 298 void serial_write_hashtable_row_format2(FILE* dbFile, bucket* h, Uns32T& colCount);
mas01mc@336 299 void serial_write_element_format2(FILE* dbFile, sbucket* sb, Uns32T& colCount);
mas01mc@340 300 Uns32T count_buckets_and_points_hashtable_row(bucket* bPtr);
mas01mc@340 301 Uns32T count_points_hashtable_row(bucket* bPtr);
mas01mc@292 302
mas01mc@292 303 // Functions to read serial header and hash functions (format1 and format2)
mas01mc@292 304 int unserialize_lsh_header(char* filename); // read lsh header from disk into core
mas01mc@292 305 void unserialize_lsh_functions(int fid); // read the lsh hash functions into core
mas01mc@292 306
mas01mc@292 307 // Functions to read hashtables in format1
mas01mc@292 308 void unserialize_lsh_hashtables_format1(int fid); // read FORMAT1 hash tables into core (disk format)
mas01mc@292 309 void unserialize_hashtable_row_format1(SerialElementT* pe, bucket** b); // read lsh hash table row into core
mas01mc@292 310
mas01mc@292 311 // Functions to read hashtables in format2
mas01mc@340 312 void unserialize_lsh_hashtables_format2(FILE* dbFile, bool forMerge = 0);
mas01mc@340 313 Uns32T unserialize_hashtable_row_format2(FILE* dbFile, bucket** b, Uns32T token=0); // to dynamic linked list
mas01mc@340 314 Uns32T unserialize_hashtable_row_to_array(FILE* dbFile, bucket** b, Uns32T numElements); // to core array
mas01mc@292 315
mas01mc@292 316 // Helper functions
mas01mc@292 317 void serial_print_header(Uns32T requestedFormat);
mas01mc@292 318 float* get_serial_hashfunction_base(char* db);
mas01mc@292 319 SerialElementT* get_serial_hashtable_base(char* db);
mas01mc@292 320 Uns32T get_serial_hashtable_offset(); // Size of SerialHeader + HashFunctions
mas01mc@292 321 SerialHeaderT* serial_get_header(char* db);
mas01mc@292 322 SerialHeaderT* lshHeader;
mas01mc@292 323
mas01mc@292 324 // Core Retrieval/Inspections Functions
mas01mc@292 325 void bucket_chain_point(bucket* p, Uns32T qpos);
mas01mc@292 326 void sbucket_chain_point(sbucket* p, Uns32T qpos);
mas01mc@292 327 void dump_hashtable_row(bucket* p);
mas01mc@292 328
mas01mc@292 329 // Serial (Format 1) Retrieval/Inspection Functions
mas01mc@292 330 void serial_bucket_chain_point(SerialElementT* pe, Uns32T qpos);
mas01mc@292 331 void serial_bucket_dump(SerialElementT* pe);
mas01mc@292 332
mas01mc@340 333 // Core ARRAY Retrieval Functions
mas01mc@340 334 void retrieve_from_core_hashtable_array(Uns32T* p, Uns32T qpos);
mas01mc@340 335
mas01mc@340 336
mas01mc@293 337 // Callback Function for point reporting
mas01mc@293 338 void* calling_instance; // store calling object instance for member-function callback
mas01mc@324 339 ReporterCallbackPtr add_point_callback; // Pointer to the callback function
mas01mc@292 340
mas01mc@292 341 public:
mas01mc@292 342 G(char* lshFile, bool lshInCore = false); // unserialize constructor
mas01mc@292 343 G(float w, Uns32T k,Uns32T m, Uns32T d, Uns32T N, Uns32T C, float r); // core constructor
mas01mc@292 344 ~G();
mas01mc@292 345
mas01mc@292 346 Uns32T insert_point(vector<float>&, Uns32T pointID);
mas01mc@292 347 void insert_point_set(vector<vector<float> >& vv, Uns32T basePointID);
mas01mc@292 348
mas01mc@292 349 // point retrieval from core
mas01mc@292 350 void retrieve_point(vector<float>& v, Uns32T qpos, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 351 // point set retrieval from core
mas01mc@292 352 void retrieve_point_set(vector<vector<float> >& vv, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 353 // serial point set retrieval
mas01mc@292 354 void serial_retrieve_point_set(char* filename, vector<vector<float> >& vv, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 355 // serial point retrieval
mas01mc@292 356 void serial_retrieve_point(char* filename, vector<float>& vv, Uns32T qpos, ReporterCallbackPtr, void* me=NULL);
mas01mc@292 357
mas01mc@292 358 void serialize(char* filename, Uns32T serialFormat = O2_SERIAL_FILEFORMAT1); // write hashfunctions and hashtables to disk
mas01mc@292 359
mas01mc@292 360 SerialHeaderT* get_lshHeader(){return lshHeader;}
mas01mc@292 361 void serial_dump_tables(char* filename);
mas01mc@292 362 float get_mean_collision_rate(){ return (float) pointCount / bucketCount ; }
mas01mc@308 363 char* get_indexName(){return indexName;}
mas01mc@292 364 };
mas01mc@292 365
mas01mc@292 366 typedef class G LSH;
mas01mc@292 367
mas01mc@292 368
mas01mc@292 369
mas01mc@292 370 #endif