annotate audioDB.h @ 755:37c2b9cce23a multiprobeLSH

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents 06409b6e268f
children e18843dc0aea
rev   line source
mas01mc@292 1 #ifndef __AUDIODB_H_
mas01mc@292 2 #define __AUDIODB_H_
mas01mc@292 3
mas01cr@0 4 #include <stdio.h>
mas01cr@0 5 #include <stdlib.h>
mas01cr@0 6 #include <sys/types.h>
mas01cr@0 7 #include <sys/stat.h>
mas01cr@0 8 #include <sys/mman.h>
mas01cr@0 9 #include <fcntl.h>
mas01cr@0 10 #include <string.h>
mas01cr@0 11 #include <iostream>
mas01cr@0 12 #include <fstream>
mas01cr@302 13 #include <set>
mas01cr@498 14 #include <map>
mas01cr@302 15 #include <string>
mas01cr@0 16 #include <math.h>
mas01cr@0 17 #include <sys/time.h>
mas01cr@0 18 #include <assert.h>
mas01cr@62 19 #include <float.h>
mas01cr@104 20 #include <signal.h>
mas01cr@280 21 #include <gsl/gsl_rng.h>
mas01cr@0 22
mas01mc@292 23 // includes for LSH indexing
mas01cr@498 24 extern "C" {
mas01cr@498 25 #include "audioDB_API.h"
mas01cr@498 26 }
mas01cr@509 27 #include "audioDB-internals.h"
mas01mc@292 28 #include "ReporterBase.h"
mas01cr@498 29 #include "accumulator.h"
mas01mc@292 30 #include "lshlib.h"
mas01mc@292 31
mas01cr@0 32 // includes for web services
mas01cr@0 33 #include "soapH.h"
mas01cr@0 34 #include "cmdline.h"
mas01cr@0 35
// Maximum string length: alias for ADB_MAXSTR, which is defined outside this header.
mas01cr@509 36 #define MAXSTR ADB_MAXSTR
mas01cr@0 37
mas01cr@0 38 // Database PRIMARY commands
// Upper-case option strings, one per top-level action.
// NOTE(review): presumably compared against `command` through O2_ACTION — confirm at the call sites.
mas01cr@0 39 #define COM_CREATE "--NEW"
mas01cr@0 40 #define COM_INSERT "--INSERT"
mas01cr@0 41 #define COM_BATCHINSERT "--BATCHINSERT"
mas01cr@0 42 #define COM_QUERY "--QUERY"
mas01cr@0 43 #define COM_STATUS "--STATUS"
mas01cr@0 44 #define COM_L2NORM "--L2NORM"
mas01cr@193 45 #define COM_POWER "--POWER"
mas01cr@0 46 #define COM_DUMP "--DUMP"
mas01cr@0 47 #define COM_SERVER "--SERVER"
mas01mc@292 48 #define COM_INDEX "--INDEX"
mas01cr@280 49 #define COM_SAMPLE "--SAMPLE"
mas01mc@334 50 #define COM_LISZT "--LISZT"
mas01cr@0 51
mas01cr@0 52 // parameters
// Lower-case option strings that qualify a primary command.
// Note that COM_TRACKNN is spelled "--resultlength" and COM_QUERYKEY is spelled "--key".
mas01cr@0 53 #define COM_CLIENT "--client"
mas01cr@0 54 #define COM_DATABASE "--database"
mas01cr@0 55 #define COM_QTYPE "--qtype"
mas01cr@0 56 #define COM_SEQLEN "--sequencelength"
mas01cr@0 57 #define COM_SEQHOP "--sequencehop"
mas01cr@0 58 #define COM_POINTNN "--pointnn"
mas01mc@307 59 #define COM_RADIUS "--radius"
mas01mc@18 60 #define COM_TRACKNN "--resultlength"
mas01cr@0 61 #define COM_QPOINT "--qpoint"
mas01cr@0 62 #define COM_FEATURES "--features"
mas01cr@0 63 #define COM_QUERYKEY "--key"
mas01cr@0 64 #define COM_KEYLIST "--keyList"
mas01cr@0 65 #define COM_TIMES "--times"
mas01cr@193 66 #define COM_QUERYPOWER "--power"
mas01cr@193 67 #define COM_RELATIVE_THRESH "--relative-threshold"
mas01cr@193 68 #define COM_ABSOLUTE_THRESH "--absolute-threshold"
mas01mc@310 69 #define COM_EXHAUSTIVE "--exhaustive"
mas01mc@310 70 #define COM_LSH_EXACT "--lsh_exact"
mas01mc@471 71 #define COM_NO_UNIT_NORMING "--no_unit_norming"
mas01cr@0 72
// Default result counts; O2_AUDIODB_INITIALIZERS uses them for pointNN and trackNN.
mas01cr@0 73 #define O2_DEFAULT_POINTNN (10U)
mas01mc@18 74 #define O2_DEFAULT_TRACKNN (10U)
mas01cr@0 75
// Default table size in bytes; the 4GB variant is kept commented out.
mas01mc@248 76 //#define O2_DEFAULTDBSIZE (4000000000) // 4GB table size
mas01mc@7 77 #define O2_DEFAULTDBSIZE (2000000000) // 2GB table size
mas01cr@0 78
// Defaults used by O2_AUDIODB_INITIALIZERS for datasize, ntracks and datadim.
mas01cr@509 79 #define O2_DEFAULT_DATASIZE (1355U) /* in MB */
mas01cr@509 80 #define O2_DEFAULT_NTRACKS (20000U)
mas01cr@509 81 #define O2_DEFAULT_DATADIM (9U)
mas01mc@295 82
mas01mc@295 83 // LIMIT PARAMETERS
// O2_REALTYPE expands to `(double)` with the parentheses included, so it can be
// used as a cast prefix but not as a bare type name.
mas01mc@292 84 #define O2_REALTYPE (double)
mas01mc@324 85 #define O2_MAXFILES (1000000U)
// O2_MAXFILESTR and O2_FILETABLE_ENTRY_SIZE are both aliases of ADB_FILETABLE_ENTRY_SIZE.
mas01cr@509 86 #define O2_MAXFILESTR ADB_FILETABLE_ENTRY_SIZE
mas01cr@509 87 #define O2_FILETABLE_ENTRY_SIZE ADB_FILETABLE_ENTRY_SIZE
mas01cr@509 88 #define O2_TRACKTABLE_ENTRY_SIZE ADB_TRACKTABLE_ENTRY_SIZE
// dbTableHeaderT is not declared in this header; it must be visible wherever O2_HEADERSIZE is expanded.
mas01cr@0 89 #define O2_HEADERSIZE (sizeof(dbTableHeaderT))
mas01cr@0 90 #define O2_MEANNUMVECTORS (1000U)
mas01mc@464 91 #define O2_MAXDIM (20000U)
mas01mc@263 92 #define O2_MAXNN (1000000U)
mas01mc@292 93 #define O2_MAXSEQLEN (8000U) // maximum feature vectors in a sequence
mas01mc@324 94 #define O2_MAXTRACKS (1000000U) // maximum number of tracks
mas01mc@532 95
// Bytes needed for an O2_MAXSEQLEN x O2_MAXSEQLEN matrix of O2_REALTYPE values.
// O2_REALTYPE expands to a parenthesized cast, `(double)`, so `sizeof(O2_REALTYPE)`
// would be the ill-formed `sizeof((double))`. Applying the cast to 0 produces a
// well-formed expression of the same type. The whole expansion is parenthesized
// so it can be embedded safely in larger expressions.
#define O2_MAXDOTPRODUCTMEMORY (sizeof(O2_REALTYPE 0)*O2_MAXSEQLEN*O2_MAXSEQLEN) // 512MB
mas01mc@324 97 #define O2_SERIAL_MAX_TRACKBATCH (1000000)
// Each threshold is one more than the corresponding O2_DEFAULT_* value.
mas01mc@324 98 #define O2_LARGE_ADB_SIZE (O2_DEFAULT_DATASIZE+1) // datasize at which features are kept externally (in Mbytes)
mas01mc@324 99 #define O2_LARGE_ADB_NTRACKS (O2_DEFAULT_NTRACKS+1) // ntracks at which features are kept externally
// 1000U * 1000000U, evaluated in unsigned arithmetic.
mas01mc@324 100 #define O2_MAX_VECTORS ( O2_MEANNUMVECTORS * O2_MAXTRACKS )
mas01cr@0 101
mas01cr@0 102 // Flags
// All but O2_FLAG_MINMAX are aliases of ADB_HEADER_FLAG_* constants defined outside this header.
// O2_FLAG_LARGE_ADB maps to ADB_HEADER_FLAG_REFERENCES.
mas01cr@509 103 #define O2_FLAG_L2NORM ADB_HEADER_FLAG_L2NORM
mas01cr@0 104 #define O2_FLAG_MINMAX (0x2U)
mas01cr@509 105 #define O2_FLAG_POWER ADB_HEADER_FLAG_POWER
mas01cr@509 106 #define O2_FLAG_TIMES ADB_HEADER_FLAG_TIMES
mas01cr@509 107 #define O2_FLAG_LARGE_ADB ADB_HEADER_FLAG_REFERENCES
// Yields the string "on" when x is non-zero and "off" otherwise.
// The argument is parenthesized so that an argument which is itself a
// conditional expression is tested as a whole.
#define DISPLAY_FLAG(x) ((x)?"on":"off")
mas01cr@0 109
mas01cr@105 110 // Query types
// Each value is a distinct single bit (0x4 .. 0x40).
// O2_POINT_QUERY is the initial value of queryType in O2_AUDIODB_INITIALIZERS.
mas01cr@105 111 #define O2_POINT_QUERY (0x4U)
mas01cr@105 112 #define O2_SEQUENCE_QUERY (0x8U)
mas01cr@105 113 #define O2_TRACK_QUERY (0x10U)
mas01mc@248 114 #define O2_N_SEQUENCE_QUERY (0x20U)
mas01mc@263 115 #define O2_ONE_TO_ONE_N_SEQUENCE_QUERY (0x40U)
mas01mc@248 116
mas01cr@0 117 // Error Codes
mas01cr@0 118 #define O2_ERR_KEYNOTFOUND (0xFFFFFF00)
mas01cr@0 119
mas01cr@0 120 // Macros
// True when the identifier `command` in the expanding scope compares equal to
// string a under strcmp. `command` must not be null, since strcmp is called on it directly.
mas01cr@0 121 #define O2_ACTION(a) (strcmp(command,a)==0)
mas01cr@0 122
// Round x up to the next multiple of 2^w; w is a bit count, not a byte count.
// The mask is computed from the int literal 1, so w must be smaller than the
// width of int. w is evaluated twice and x once.
// w is parenthesized so that arguments with operators of lower precedence than
// `<<` (for example `a|b` or `c ? a : b`) shift by the intended amount.
#define ALIGN_UP(x,w) (((x) + ((1<<(w))-1)) & ~((1<<(w))-1))
// Round x down to a multiple of 2^w; same constraints on w as ALIGN_UP.
#define ALIGN_DOWN(x,w) ((x) & ~((1<<(w))-1))

// Same rounding, with the granularity taken from getpagesize() at run time.
#define ALIGN_PAGE_UP(x) (((x) + (getpagesize()-1)) & ~(getpagesize()-1))
#define ALIGN_PAGE_DOWN(x) ((x) & ~(getpagesize()-1))
mas01cr@196 128
// Yields x when it is non-null and the empty string literal otherwise; x is evaluated twice when non-null.
mas01cr@166 129 #define ENSURE_STRING(x) ((x) ? (x) : "")
mas01cr@166 130
// Map `length` bytes of the file open on `dbfid`, starting at file offset
// `start`, and store the mapping in `var` converted to `type`.
// Must be expanded in a scope that provides `dbfid`, `forWrite` and `error(...)`.
// The mapping is MAP_SHARED, and writable only when `forWrite` is true.
// On failure error() is called with the stringified name of `var`.
// NOTE(review): `var` is still assigned after error(), so this relies on
// error() not returning normally — confirm.
// The temporary has a reserved-looking name so that a caller variable called
// `tmp` is not captured, and failure is tested against MAP_FAILED.
#define CHECKED_MMAP(type, var, start, length) \
  { void *o2_mmap_result_ = mmap(0, (length), (PROT_READ | (forWrite ? PROT_WRITE : 0)), MAP_SHARED, dbfid, (start)); \
    if(o2_mmap_result_ == MAP_FAILED) { \
      error("mmap error for db table", #var, "mmap"); \
    } \
    var = (type) o2_mmap_result_; \
  }
mas01cr@239 138
// read() exactly `count` bytes from `fd` into `buf`.
// Calls error("read error", "", "read") when read() returns -1, and
// error("short read", "") when fewer bytes than requested were transferred.
// Must be expanded in a scope that provides `error(...)`. `count` is evaluated once.
// The temporaries have reserved-looking names so that caller arguments called
// `tmp` or `tmpcount` are not captured by the expansion.
#define CHECKED_READ(fd, buf, count) \
  { size_t o2_read_want_ = (count); \
    ssize_t o2_read_got_ = read((fd), (buf), o2_read_want_); \
    if(o2_read_got_ == -1) { \
      error("read error", "", "read"); \
    } else if((size_t) o2_read_got_ != o2_read_want_) { \
      error("short read", ""); \
    } \
  }
mas01cr@370 148
// write() exactly `count` bytes from `buf` to `fd`.
// Calls error("write error", "", "write") when write() returns -1, and
// error("short write", "") when fewer bytes than requested were transferred.
// Must be expanded in a scope that provides `error(...)`. `count` is evaluated once.
// The temporaries have reserved-looking names so that caller arguments called
// `tmp` or `tmpcount` are not captured by the expansion.
#define CHECKED_WRITE(fd, buf, count) \
  { size_t o2_write_want_ = (count); \
    ssize_t o2_write_got_ = write((fd), (buf), o2_write_want_); \
    if(o2_write_got_ == -1) { \
      error("write error", "", "write"); \
    } else if((size_t) o2_write_got_ != o2_write_want_) { \
      error("short write", ""); \
    } \
  }
mas01cr@370 158
// printf-style message to stderr, emitted (and flushed) only when the
// identifier `verbosity` in the expanding scope is greater than vv.
// The format arguments are not evaluated when the message is suppressed.
// The expansion is a brace-enclosed block, so an `else` written after the macro
// binds to the caller's own `if`, not to the internal verbosity test. The
// macro still works with or without a trailing semicolon at the call site.
#define VERB_LOG(vv, ...) \
  { if(verbosity > (vv)) { \
      fprintf(stderr, __VA_ARGS__); \
      fflush(stderr); \
    } \
  }
mas01cr@0 164
// Copy string STR into entry number dbH->numFiles of TABLE, where entries are
// O2_FILETABLE_ENTRY_SIZE bytes each. Must be expanded in a scope that provides `dbH`.
// The copy is bounded by the entry size: an over-long STR is truncated instead
// of spilling into the following entry. The entry is always NUL-terminated,
// and strncpy zero-fills the unused tail. STR is evaluated once.
// NOTE(review): the comment previously attached here said "We will only use
// this in a 32-bit address space / So map the off_t down to 32-bits first";
// nothing in this macro narrows an off_t — confirm whether that note still applies.
#define INSERT_FILETABLE_STRING(TABLE, STR) \
  { char *o2_entry_ = (TABLE) + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE; \
    strncpy(o2_entry_, (STR), O2_FILETABLE_ENTRY_SIZE - 1); \
    o2_entry_[O2_FILETABLE_ENTRY_SIZE - 1] = '\0'; \
  }
mas01mc@324 169
// Delete the object (or array) PTR points to, then reset PTR to 0.
// Each expansion is a single brace-enclosed block, so a guarded use such as
// `if(cond) SAFE_DELETE(p);` deletes and resets together under the condition.
// PTR is evaluated twice and must be an assignable pointer expression.
#define SAFE_DELETE(PTR) { delete (PTR); (PTR)=0; }
#define SAFE_DELETE_ARRAY(PTR) { delete[] (PTR); (PTR)=0; }
mas01mc@324 172
// Declarations only; both globals are defined in another translation unit.
// NOTE(review): presumably the server-side counterparts of audioDB::adb_root and
// audioDB::adb_feature_root — confirm where they are set.
mas01mc@324 173 extern char* SERVER_ADB_ROOT;
mas01mc@324 174 extern char* SERVER_ADB_FEATURE_ROOT;
mas01mc@308 175
// audioDB: holds the parsed command-line options, open file descriptors and
// table pointers for one database, and declares one method per command
// (create, insert, batchinsert, query, status, sample, l2norm, power_flag,
// dump, liszt) plus LSH indexing and web-service entry points.
// Only declarations live here; the definitions are outside this header.
// Data-member declaration order is significant: members are initialized in
// this order, and O2_AUDIODB_INITIALIZERS lists its initializers in the same order.
mas01mc@308 176 class audioDB{
mas01cr@0 177 private:
// Parsed arguments; gengetopt_args_info is not declared in this header
// (presumably it comes from cmdline.h — confirm).
mas01cr@0 178 gengetopt_args_info args_info;
mas01cr@0 179 unsigned dim;
// File names, keys and input streams supplied for the current command.
mas01cr@0 180 const char *dbName;
mas01cr@0 181 const char *inFile;
mas01cr@0 182 const char *hostport;
mas01cr@0 183 const char *key;
mas01mc@18 184 const char* trackFileName;
mas01cr@239 185 std::ifstream *trackFile;
// `command` is the string O2_ACTION compares against.
mas01cr@0 186 const char *command;
mas01cr@131 187 const char *output;
mas01cr@0 188 const char *timesFileName;
mas01cr@239 189 std::ifstream *timesFile;
mas01cr@193 190 const char *powerFileName;
mas01cr@239 191 std::ifstream *powerFile;
mas01mc@324 192 const char* adb_root;
mas01mc@324 193 const char* adb_feature_root;
mas01mc@324 194
// File descriptors. `dbfid` and `forWrite` are the names CHECKED_MMAP expects in scope.
mas01cr@193 195 int powerfd;
mas01cr@0 196 int dbfid;
mas01mc@292 197 int lshfid;
mas01cr@196 198 bool forWrite;
mas01cr@0 199 int infid;
mas01cr@0 200 struct stat statbuf;
// `dbH` is the header pointer INSERT_FILETABLE_STRING reads numFiles from.
mas01cr@509 201 struct adbheader *dbH;
mas01cr@498 202 struct adb *adb;
mas01cr@284 203
// GSL random number generator handle (see initRNG / random_track).
mas01cr@284 204 gsl_rng *rng;
mas01cr@0 205
// Pointers to the database tables.
// NOTE(review): presumably these point into mmap'd regions of the database file — confirm in initTables.
mas01mc@324 206 char* fileTable;
mas01mc@18 207 unsigned* trackTable;
mas01cr@0 208 double* l2normTable;
mas01cr@196 209 double* timesTable;
mas01cr@193 210 double* powerTable;
mas01cr@0 211
mas01mc@324 212 char* featureFileNameTable;
mas01mc@324 213 char* timesFileNameTable;
mas01mc@324 214 char* powerFileNameTable;
mas01mc@324 215
// Lengths of the tables above (units are not stated in this header).
mas01cr@196 216 size_t fileTableLength;
mas01cr@196 217 size_t trackTableLength;
mas01cr@196 218 size_t timesTableLength;
mas01cr@196 219 size_t powerTableLength;
mas01cr@196 220 size_t l2normTableLength;
mas01cr@196 221
mas01cr@0 222 // Flags and parameters
mas01cr@0 223 unsigned verbosity; // how much do we want to know?
mas01cr@256 224
mas01cr@280 225 unsigned nsamples;
mas01cr@280 226
mas01cr@256 227 //off_t size; // given size (for creation)
mas01cr@256 228 unsigned datasize; // size in MB
mas01cr@256 229 unsigned ntracks;
mas01cr@256 230 unsigned datadim;
mas01cr@256 231
mas01cr@0 232 unsigned queryType; // point queries default
mas01cr@0 233 unsigned pointNN; // how many point NNs ?
mas01mc@18 234 unsigned trackNN; // how many track NNs ?
mas01cr@0 235 unsigned sequenceLength;
mas01cr@0 236 unsigned sequenceHop;
mas01cr@239 237 bool normalizedDistance;
mas01mc@292 238 bool no_unit_norming;
mas01cr@0 239 unsigned queryPoint;
mas01cr@0 240 unsigned usingQueryPoint;
mas01cr@0 241 unsigned usingTimes;
mas01cr@193 242 unsigned usingPower;
mas01cr@0 243 unsigned isClient;
mas01cr@0 244 unsigned isServer;
mas01cr@0 245 unsigned port;
mas01cr@0 246 double timesTol;
mas01mc@17 247 double radius;
mas01mc@292 248 bool query_from_key;
mas01mc@292 249 Uns32T query_from_key_index;
mas01cr@193 250 bool use_absolute_threshold;
mas01cr@193 251 double absolute_threshold;
mas01cr@193 252 bool use_relative_threshold;
mas01cr@193 253 double relative_threshold;
mas01mc@334 254
mas01mc@292 255 ReporterBase* reporter; // track/point reporter
mas01mc@292 256
mas01mc@334 257 // LISZT parameters
mas01mc@334 258 unsigned lisztOffset;
mas01mc@334 259 unsigned lisztLength;
mas01mc@334 260
mas01cr@0 261 // private methods
// error(): `b` defaults to "" and `sysFunc` to 0; the CHECKED_* macros call it
// with two or three string arguments.
mas01cr@32 262 void error(const char* a, const char* b = "", const char *sysFunc = 0);
mas01cr@193 263
mas01cr@498 264 void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata);
mas01cr@284 265 void initRNG();
mas01cr@196 266 void initDBHeader(const char *dbName);
mas01cr@498 267 void initInputFile(const char *inFile);
mas01mc@292 268 void initTables(const char* dbName, const char* inFile = 0);
mas01mc@292 269 void initTablesFromKey(const char* dbName, const Uns32T queryIndex);
mas01mc@324 270 void prefix_name(char** const name, const char* prefix);
mas01mc@324 271
mas01cr@0 272 public:
// Constructors take an argument vector; three overloads additionally take a
// web-service response object (query, status or liszt).
mas01cr@370 273 audioDB(const unsigned argc, const char *argv[]);
mas01cr@508 274 audioDB(const unsigned argc, const char *argv[], struct soap *soap, adb__queryResponse *adbQueryResponse);
mas01cr@370 275 audioDB(const unsigned argc, const char *argv[], adb__statusResponse *adbStatusResponse);
mas01cr@370 276 audioDB(const unsigned argc, const char *argv[], adb__lisztResponse *adbLisztResponse);
mas01mc@334 277
mas01cr@97 278 void cleanup();
mas01cr@0 279 ~audioDB();
mas01cr@370 280 int processArgs(const unsigned argc, const char* argv[]);
mas01cr@30 281 void get_lock(int fd, bool exclusive);
mas01cr@30 282 void release_lock(int fd);
// One method per primary command; the response-pointer parameters default to 0.
mas01cr@0 283 void create(const char* dbName);
mas01cr@0 284 void insert(const char* dbName, const char* inFile);
mas01cr@0 285 void batchinsert(const char* dbName, const char* inFile);
mas01cr@508 286 void query(const char* dbName, const char* inFile, struct soap *soap=0, adb__queryResponse *adbQueryResponse=0);
mas01cr@133 287 void status(const char* dbName, adb__statusResponse *adbStatusResponse=0);
mas01ik@355 288
mas01cr@284 289 unsigned random_track(unsigned *propTable, unsigned total);
mas01cr@280 290 void sample(const char *dbName);
mas01cr@0 291 void l2norm(const char* dbName);
mas01cr@193 292 void power_flag(const char *dbName);
mas01cr@0 293 void dump(const char* dbName);
mas01mc@334 294 void liszt(const char* dbName, unsigned offset, unsigned numLines, adb__lisztResponse* adbLisztResponse=0);
mas01cr@0 295
mas01mc@292 296 // LSH indexing parameters and data structures
// NOTE(review): these data members fall under the `public:` section above.
mas01mc@292 297 LSH* lsh;
mas01mc@292 298 bool lsh_in_core; // load LSH tables for query into core (true) or keep on disk (false)
mas01mc@292 299 bool lsh_use_u_functions;
mas01mc@292 300 bool lsh_exact; // flag to indicate use exact evaluation of points returned by LSH
mas01mc@308 301 bool WS_load_index; // flag to indicate that we want to make a Web Services index memory resident
mas01mc@292 302 double lsh_param_w; // Width of LSH hash-function bins
mas01mc@292 303 Uns32T lsh_param_k; // Number of independent hash functions
mas01mc@292 304 Uns32T lsh_param_m; // Combinatorial parameter for m(m-1)/2 hash tables
mas01mc@292 305 Uns32T lsh_param_N; // Number of rows per hash table
mas01mc@292 306 Uns32T lsh_param_b; // Batch size, in number of tracks, per indexing iteration
mas01mc@292 307 Uns32T lsh_param_ncols; // Maximum number of collision in a hash-table row
mas01mc@292 308
mas01mc@292 309 // LSH indexing and retrieval methods
// index_insert_shingles names `vector` without std::, so a using-declaration
// must already be in effect (presumably from lshlib.h — confirm).
mas01mc@292 310 void index_index_db(const char* dbName);
mas01mc@292 311 void index_initialize(double**,double**,double**,double**,unsigned int*);
mas01mc@292 312 void index_insert_tracks(Uns32T start_track, Uns32T end_track, double** fvpp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp);
mas01mc@292 313 int index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp);
mas01mc@292 314 Uns32T index_insert_shingles(vector<vector<float> >*, Uns32T trackID, double* spp);
mas01cr@498 315 void insertPowerData(unsigned n, int powerfd, double *powerdata);
mas01mc@324 316 void init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp);
mas01mc@324 317
mas01mc@292 318 // Web Services
// ws_status and ws_liszt take a non-const hostport; the ws_query variants take a const one.
mas01cr@0 319 void startServer();
mas01ik@355 320
mas01mc@308 321 void ws_status(const char*dbName, char* hostport);
mas01mc@308 322 void ws_query(const char*dbName, const char *featureFileName, const char* hostport);
mas01mc@328 323 void ws_query_by_key(const char*dbName, const char *trackKey, const char* featureFileName, const char* hostport);
mas01mc@334 324 void ws_liszt(const char* dbName, char* hostport);
mas01mc@334 325
mas01cr@0 326 };
mas01mc@17 327
// Member-initializer list for class audioDB, written in the same order as the
// data members are declared. Intended to follow the `:` of an audioDB
// constructor; the constructor definitions are not in this header.
// Every scalar and pointer member receives a defined starting value, including
// `hostport` and `lsh_param_w`, which previously had no entry and so started
// out indeterminate. The struct members `args_info` and `statbuf` are not listed.
// query_from_key_index starts at (uint32_t) -1, i.e. the maximum 32-bit value.
#define O2_AUDIODB_INITIALIZERS \
  dim(0), \
  dbName(0), \
  inFile(0), \
  hostport(0), \
  key(0), \
  trackFileName(0), \
  trackFile(0), \
  command(0), \
  output(0), \
  timesFileName(0), \
  timesFile(0), \
  powerFileName(0), \
  powerFile(0), \
  adb_root(0), \
  adb_feature_root(0), \
  powerfd(0), \
  dbfid(0), \
  lshfid(0), \
  forWrite(false), \
  infid(0), \
  dbH(0), \
  adb(0), \
  rng(0), \
  fileTable(0), \
  trackTable(0), \
  l2normTable(0), \
  timesTable(0), \
  powerTable(0), \
  featureFileNameTable(0), \
  timesFileNameTable(0), \
  powerFileNameTable(0), \
  fileTableLength(0), \
  trackTableLength(0), \
  timesTableLength(0), \
  powerTableLength(0), \
  l2normTableLength(0), \
  verbosity(1), \
  nsamples(2000), \
  datasize(O2_DEFAULT_DATASIZE), \
  ntracks(O2_DEFAULT_NTRACKS), \
  datadim(O2_DEFAULT_DATADIM), \
  queryType(O2_POINT_QUERY), \
  pointNN(O2_DEFAULT_POINTNN), \
  trackNN(O2_DEFAULT_TRACKNN), \
  sequenceLength(16), \
  sequenceHop(1), \
  normalizedDistance(true), \
  no_unit_norming(false), \
  queryPoint(0), \
  usingQueryPoint(0), \
  usingTimes(0), \
  usingPower(0), \
  isClient(0), \
  isServer(0), \
  port(0), \
  timesTol(0.1), \
  radius(0), \
  query_from_key(false), \
  query_from_key_index((uint32_t) -1), \
  use_absolute_threshold(false), \
  absolute_threshold(0.0), \
  use_relative_threshold(false), \
  relative_threshold(0.0), \
  reporter(0), \
  lisztOffset(0), \
  lisztLength(0), \
  lsh(0), \
  lsh_in_core(false), \
  lsh_use_u_functions(false), \
  lsh_exact(false), \
  WS_load_index(false), \
  lsh_param_w(0.0), \
  lsh_param_k(0), \
  lsh_param_m(0), \
  lsh_param_N(0), \
  lsh_param_b(0), \
  lsh_param_ncols(0)
mas01mc@292 404 #endif