mas01mc@292: #ifndef __AUDIODB_H_ mas01mc@292: #define __AUDIODB_H_ mas01mc@292: mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@302: #include mas01cr@302: #include mas01cr@0: #include mas01cr@0: #include mas01cr@0: #include mas01cr@62: #include mas01cr@104: #include mas01cr@280: #include mas01cr@0: mas01mc@292: // includes for LSH indexing mas01mc@292: #include "ReporterBase.h" mas01mc@292: #include "lshlib.h" mas01mc@292: mas01cr@0: // includes for web services mas01cr@0: #include "soapH.h" mas01cr@0: #include "cmdline.h" mas01cr@0: mas01cr@0: #define MAXSTR 512 mas01cr@0: mas01cr@0: // Database PRIMARY commands mas01cr@0: #define COM_CREATE "--NEW" mas01cr@0: #define COM_INSERT "--INSERT" mas01cr@0: #define COM_BATCHINSERT "--BATCHINSERT" mas01cr@0: #define COM_QUERY "--QUERY" mas01cr@0: #define COM_STATUS "--STATUS" mas01cr@0: #define COM_L2NORM "--L2NORM" mas01cr@193: #define COM_POWER "--POWER" mas01cr@0: #define COM_DUMP "--DUMP" mas01cr@0: #define COM_SERVER "--SERVER" mas01mc@292: #define COM_INDEX "--INDEX" mas01cr@280: #define COM_SAMPLE "--SAMPLE" mas01cr@0: mas01cr@0: // parameters mas01cr@0: #define COM_CLIENT "--client" mas01cr@0: #define COM_DATABASE "--database" mas01cr@0: #define COM_QTYPE "--qtype" mas01cr@0: #define COM_SEQLEN "--sequencelength" mas01cr@0: #define COM_SEQHOP "--sequencehop" mas01cr@0: #define COM_POINTNN "--pointnn" mas01mc@307: #define COM_RADIUS "--radius" mas01mc@18: #define COM_TRACKNN "--resultlength" mas01cr@0: #define COM_QPOINT "--qpoint" mas01cr@0: #define COM_FEATURES "--features" mas01cr@0: #define COM_QUERYKEY "--key" mas01cr@0: #define COM_KEYLIST "--keyList" mas01cr@0: #define COM_TIMES "--times" mas01cr@193: #define COM_QUERYPOWER "--power" mas01cr@193: #define COM_RELATIVE_THRESH "--relative-threshold" mas01cr@193: #define COM_ABSOLUTE_THRESH "--absolute-threshold" 
mas01mc@310: #define COM_EXHAUSTIVE "--exhaustive" mas01mc@310: #define COM_LSH_EXACT "--lsh_exact" mas01cr@0: mas01mc@314: // Because LSH returns NN with P(1)<1 we want to return exact mas01mc@314: // points above this boundary. mas01mc@314: // Because we work in Radius^2 units, mas01mc@314: // The sqrt of this number is the multiplier on the radius mas01mc@314: mas01mc@314: #define O2_LSH_EXACT_MULT 9 mas01mc@314: mas01cr@108: #define O2_OLD_MAGIC ('O'|'2'<<8|'D'<<16|'B'<<24) mas01cr@108: #define O2_MAGIC ('o'|'2'<<8|'d'<<16|'b'<<24) mas01cr@210: #define O2_FORMAT_VERSION (4U) mas01cr@0: mas01cr@0: #define O2_DEFAULT_POINTNN (10U) mas01mc@18: #define O2_DEFAULT_TRACKNN (10U) mas01cr@0: mas01mc@248: //#define O2_DEFAULTDBSIZE (4000000000) // 4GB table size mas01mc@7: #define O2_DEFAULTDBSIZE (2000000000) // 2GB table size mas01cr@0: mas01mc@295: // Bit masks for packing (trackID,pointID) into 32-bit unsigned int mas01mc@319: // This can be controlled at compile time mas01mc@319: #define O2_DEFAULT_LSH_N_POINT_BITS 14 mas01mc@319: mas01mc@319: // Override the default point bit width for large database support mas01mc@319: #ifndef LSH_N_POINT_BITS mas01mc@323: #define LSH_N_POINT_BITS O2_DEFAULT_LSH_N_POINT_BITS mas01mc@319: #endif mas01mc@295: mas01mc@295: // LIMIT PARAMETERS mas01cr@256: #define O2_DEFAULT_DATASIZE (1355U) // in MB mas01cr@256: #define O2_DEFAULT_NTRACKS (20000U) mas01cr@256: #define O2_DEFAULT_DATADIM (9U) mas01mc@292: #define O2_REALTYPE (double) mas01mc@317: #define O2_MAXFILES (1000000U) mas01cr@0: #define O2_MAXFILESTR (256U) mas01cr@256: #define O2_FILETABLE_ENTRY_SIZE (O2_MAXFILESTR) mas01cr@256: #define O2_TRACKTABLE_ENTRY_SIZE (sizeof(unsigned)) mas01cr@0: #define O2_HEADERSIZE (sizeof(dbTableHeaderT)) mas01cr@0: #define O2_MEANNUMVECTORS (1000U) mas01mc@292: #define O2_MAXDIM (2000U) mas01mc@263: #define O2_MAXNN (1000000U) mas01mc@292: #define O2_MAXSEQLEN (8000U) // maximum feature vectors in a sequence mas01mc@316: #define 
O2_MAXTRACKS (1000000U) // maximum number of tracks mas01mc@319: #define O2_MAXTRACKLEN (1< vv) { \ mas01cr@239: fprintf(stderr, __VA_ARGS__); \ mas01cr@239: fflush(stderr); \ mas01cr@239: } mas01cr@0: // INSERT_FILETABLE_STRING(TABLE, STR): copy the NUL-terminated string STR into entry number dbH->numFiles of TABLE, where entries are O2_FILETABLE_ENTRY_SIZE bytes wide. The copy is truncated to fit one entry and the entry is NUL-padded and always NUL-terminated, so an over-long STR cannot spill into the following entry. STR is evaluated once. Expands to a braced statement block, so it works with or without a trailing semicolon at the call site. mas01mc@316: // We will only use this in a 32-bit address space mas01mc@316: // So map the off_t down to 32-bits first mas01mc@321: #define INSERT_FILETABLE_STRING(TABLE, STR) \ mas01mc@321: { char* o2_ft_entry = (TABLE) + dbH->numFiles*O2_FILETABLE_ENTRY_SIZE; strncpy(o2_ft_entry, (STR), O2_FILETABLE_ENTRY_SIZE - 1); o2_ft_entry[O2_FILETABLE_ENTRY_SIZE - 1] = '\0'; } mas01mc@316: // SAFE_DELETE / SAFE_DELETE_ARRAY: release PTR with delete / delete[] and reset it to 0. Braced so that both statements stay together when the macro is the body of an unbraced if/for; PTR is parenthesized and is evaluated twice. mas01mc@320: #define SAFE_DELETE(PTR) { delete (PTR); (PTR)=0; } mas01mc@320: #define SAFE_DELETE_ARRAY(PTR) { delete[] (PTR); (PTR)=0; } mas01mc@320: mas01mc@308: extern LSH* SERVER_LSH_INDEX_SINGLETON; mas01mc@322: extern char* SERVER_ADB_ROOT; mas01mc@322: extern char* SERVER_ADB_FEATURE_ROOT; mas01mc@308: mas01cr@210: typedef struct dbTableHeader { mas01cr@114: uint32_t magic; mas01cr@114: uint32_t version; mas01cr@114: uint32_t numFiles; mas01cr@114: uint32_t dim; mas01cr@114: uint32_t flags; mas01cr@210: uint32_t headerSize; mas01cr@196: off_t length; mas01cr@196: off_t fileTableOffset; mas01cr@196: off_t trackTableOffset; mas01cr@196: off_t dataOffset; mas01cr@196: off_t l2normTableOffset; mas01cr@196: off_t timesTableOffset; mas01cr@196: off_t powerTableOffset; mas01cr@196: off_t dbSize; mas01cr@0: } dbTableHeaderT, *dbTableHeaderPtr; mas01cr@0: mas01mc@292: mas01mc@292: class PointPair{ mas01mc@292: public: mas01mc@292: Uns32T trackID; mas01mc@292: Uns32T qpos; mas01mc@292: Uns32T spos; mas01mc@292: PointPair(Uns32T a, Uns32T b, Uns32T c); mas01mc@292: }; mas01mc@292: mas01mc@292: bool operator<(const PointPair& a, const PointPair& b); mas01cr@0: mas01mc@308: class audioDB{ mas01cr@0: private: mas01cr@0: gengetopt_args_info args_info; mas01cr@0: unsigned dim; mas01cr@0: const char *dbName; mas01cr@0: const char *inFile; mas01cr@0: const char *hostport; mas01cr@0: const char *key; mas01mc@18: const char* trackFileName; mas01cr@239: std::ifstream *trackFile; mas01cr@0: const char *command; mas01cr@131: 
const char *output; mas01cr@0: const char *timesFileName; mas01cr@239: std::ifstream *timesFile; mas01cr@193: const char *powerFileName; mas01cr@239: std::ifstream *powerFile; mas01mc@321: const char* adb_root; mas01mc@321: const char* adb_feature_root; mas01mc@321: mas01cr@193: int powerfd; mas01cr@0: int dbfid; mas01mc@292: int lshfid; mas01cr@196: bool forWrite; mas01cr@0: int infid; mas01cr@0: char* db; mas01cr@0: char* indata; mas01cr@0: struct stat statbuf; mas01cr@0: dbTableHeaderPtr dbH; mas01cr@284: mas01cr@284: gsl_rng *rng; mas01cr@0: mas01mc@316: char* fileTable; mas01mc@18: unsigned* trackTable; mas01mc@316: off_t* trackOffsetTable; mas01cr@0: double* dataBuf; mas01cr@0: double* inBuf; mas01cr@0: double* l2normTable; mas01cr@196: double* timesTable; mas01cr@193: double* powerTable; mas01cr@0: mas01mc@318: char* featureFileNameTable; mas01mc@318: char* timesFileNameTable; mas01mc@318: char* powerFileNameTable; mas01mc@318: mas01cr@196: size_t fileTableLength; mas01cr@196: size_t trackTableLength; mas01cr@196: off_t dataBufLength; mas01cr@196: size_t timesTableLength; mas01cr@196: size_t powerTableLength; mas01cr@196: size_t l2normTableLength; mas01cr@196: mas01cr@0: // Flags and parameters mas01cr@0: unsigned verbosity; // how much do we want to know? mas01cr@256: mas01cr@280: unsigned nsamples; mas01cr@280: mas01cr@256: //off_t size; // given size (for creation) mas01cr@256: unsigned datasize; // size in MB mas01cr@256: unsigned ntracks; mas01cr@256: unsigned datadim; mas01cr@256: mas01cr@0: unsigned queryType; // point queries default mas01cr@0: unsigned pointNN; // how many point NNs ? mas01mc@18: unsigned trackNN; // how many track NNs ? 
mas01cr@0: unsigned sequenceLength; mas01cr@0: unsigned sequenceHop; mas01cr@239: bool normalizedDistance; mas01mc@292: bool no_unit_norming; mas01cr@0: unsigned queryPoint; mas01cr@0: unsigned usingQueryPoint; mas01cr@0: unsigned usingTimes; mas01cr@193: unsigned usingPower; mas01cr@0: unsigned isClient; mas01cr@0: unsigned isServer; mas01cr@0: unsigned port; mas01cr@0: double timesTol; mas01mc@17: double radius; mas01mc@292: bool query_from_key; mas01mc@292: Uns32T query_from_key_index; mas01cr@193: bool use_absolute_threshold; mas01cr@193: double absolute_threshold; mas01cr@193: bool use_relative_threshold; mas01cr@193: double relative_threshold; mas01cr@193: mas01mc@292: ReporterBase* reporter; // track/point reporter mas01mc@292: priority_queue, std::less >* exact_evaluation_queue; mas01mc@292: mas01cr@0: // Timers mas01cr@0: struct timeval tv1; mas01cr@0: struct timeval tv2; mas01cr@0: mas01cr@0: // private methods mas01cr@32: void error(const char* a, const char* b = "", const char *sysFunc = 0); mas01cr@193: void sequence_sum(double *buffer, int length, int seqlen); mas01cr@193: void sequence_sqrt(double *buffer, int length, int seqlen); mas01cr@193: void sequence_average(double *buffer, int length, int seqlen); mas01cr@193: mas01cr@239: void initialize_arrays(int track, unsigned int numVectors, double *query, double *data_buffer, double **D, double **DD); mas01cr@239: void delete_arrays(int track, unsigned int numVectors, double **D, double **DD); mas01mc@319: void read_data(int trkfid, int track, double **data_buffer_p, size_t *data_buffer_size_p); mas01cr@239: void set_up_query(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned int *nvp); mas01mc@292: void set_up_query_from_key(double **qp, double **vqp, double **qnp, double **vqnp, double **qpp, double **vqpp, double *mqdp, unsigned *nvp, Uns32T queryIndex); mas01cr@239: void set_up_db(double **snp, double **vsnp, double **spp, double **vspp, 
double **mddp, unsigned int *dvp); mas01mc@292: void query_loop(const char* dbName, Uns32T queryIndex); mas01mc@292: void query_loop_points(double* query, double* qnPtr, double* qpPtr, double meanQdur, Uns32T numVectors); mas01mc@292: double dot_product_points(double* q, double* p, Uns32T L); mas01cr@284: void initRNG(); mas01cr@196: void initDBHeader(const char *dbName); mas01mc@316: void initInputFile(const char *inFile, bool loadData = true); mas01mc@292: void initTables(const char* dbName, const char* inFile = 0); mas01mc@292: void initTablesFromKey(const char* dbName, const Uns32T queryIndex); mas01cr@0: void unitNorm(double* X, unsigned d, unsigned n, double* qNorm); mas01cr@0: void unitNormAndInsertL2(double* X, unsigned dim, unsigned n, unsigned append); mas01cr@239: void insertTimeStamps(unsigned n, std::ifstream* timesFile, double* timesdata); mas01cr@193: void insertPowerData(unsigned n, int powerfd, double *powerdata); mas01cr@0: unsigned getKeyPos(char* key); mas01mc@321: void prefix_name(char** const name, const char* prefix); mas01mc@321: mas01cr@0: public: mas01cr@76: audioDB(const unsigned argc, char* const argv[]); mas01cr@133: audioDB(const unsigned argc, char* const argv[], adb__queryResponse *adbQueryResponse); mas01cr@133: audioDB(const unsigned argc, char* const argv[], adb__statusResponse *adbStatusResponse); mas01cr@97: void cleanup(); mas01cr@0: ~audioDB(); mas01cr@0: int processArgs(const unsigned argc, char* const argv[]); mas01cr@30: void get_lock(int fd, bool exclusive); mas01cr@30: void release_lock(int fd); mas01cr@0: void create(const char* dbName); mas01cr@251: bool enough_per_file_space_free(); mas01cr@196: bool enough_data_space_free(off_t size); mas01cr@196: void insert_data_vectors(off_t offset, void *buffer, size_t size); mas01cr@0: void insert(const char* dbName, const char* inFile); mas01cr@0: void batchinsert(const char* dbName, const char* inFile); mas01mc@316: void batchinsert_large_adb(const char* dbName, const char* 
inFile); mas01cr@133: void query(const char* dbName, const char* inFile, adb__queryResponse *adbQueryResponse=0); mas01cr@133: void status(const char* dbName, adb__statusResponse *adbStatusResponse=0); mas01cr@284: unsigned random_track(unsigned *propTable, unsigned total); mas01cr@280: void sample(const char *dbName); mas01cr@0: void l2norm(const char* dbName); mas01cr@193: void power_flag(const char *dbName); mas01cr@193: bool powers_acceptable(double p1, double p2); mas01cr@0: void dump(const char* dbName); mas01cr@0: mas01mc@292: // LSH indexing parameters and data structures mas01mc@292: LSH* lsh; mas01mc@292: bool lsh_in_core; // load LSH tables for query into core (true) or keep on disk (false) mas01mc@292: bool lsh_use_u_functions; mas01mc@292: bool lsh_exact; // flag to indicate use exact evaluation of points returned by LSH mas01mc@308: bool WS_load_index; // flag to indicate that we want to make a Web Services index memory resident mas01mc@292: double lsh_param_w; // Width of LSH hash-function bins mas01mc@292: Uns32T lsh_param_k; // Number of independent hash functions mas01mc@292: Uns32T lsh_param_m; // Combinatorial parameter for m(m-1)/2 hash tables mas01mc@292: Uns32T lsh_param_N; // Number of rows per hash table mas01mc@292: Uns32T lsh_param_b; // Batch size, in number of tracks, per indexing iteration mas01mc@292: Uns32T lsh_param_ncols; // Maximum number of collision in a hash-table row mas01mc@319: Uns32T lsh_n_point_bits; // How many bits to use to encode point ID within a track mas01mc@319: mas01mc@292: mas01mc@292: // LSH vector<> containers for one in-core copy of a set of feature vectors mas01mc@292: vector::iterator vi; // feature vector iterator mas01mc@292: vector > *vv; // one-track's worth data mas01mc@292: mas01mc@292: // LSH indexing and retrieval methods mas01mc@292: void index_index_db(const char* dbName); mas01mc@292: void index_initialize(double**,double**,double**,double**,unsigned int*); mas01mc@292: void 
index_insert_tracks(Uns32T start_track, Uns32T end_track, double** fvpp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp); mas01mc@292: int index_insert_track(Uns32T trackID, double** fvpp, double** snpp, double** sppp); mas01mc@292: Uns32T index_insert_shingles(vector >*, Uns32T trackID, double* spp); mas01mc@292: void index_make_shingle(vector >*, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen); mas01mc@292: int index_norm_shingles(vector >*, double* snp, double* spp); mas01mc@292: int index_query_loop(const char* dbName, Uns32T queryIndex); mas01mc@292: vector >* index_initialize_shingles(Uns32T sz); mas01mc@292: int index_init_query(const char* dbName); mas01mc@292: int index_exists(const char* dbName, double radius, Uns32T sequenceLength); mas01mc@292: char* index_get_name(const char*dbName, double radius, Uns32T sequenceLength); mas01mc@292: static void index_add_point_approximate(void* instance, Uns32T pointID, Uns32T qpos, float dist); // static point reporter callback method mas01mc@292: static void index_add_point_exact(void* instance, Uns32T pointID, Uns32T qpos, float dist); // static point reporter callback method mas01mc@319: static Uns32T index_to_trackID(Uns32T lshID, Uns32T nPntBits); // Convert lsh point index to audioDB trackID mas01mc@319: static Uns32T index_to_trackPos(Uns32T lshID, Uns32T nPntBits); // Convert lsh point index to audioDB trackPos (spos) mas01mc@319: static Uns32T index_from_trackInfo(Uns32T trackID, Uns32T pntID, Uns32T nPntBits); // Convert audioDB trackID and trackPos to an lsh point index mas01mc@292: void initialize_exact_evalutation_queue(); mas01mc@292: void index_insert_exact_evaluation_queue(Uns32T trackID, Uns32T qpos, Uns32T spos); mas01mc@308: LSH* index_allocate(char* indexName, bool load_hashTables); mas01mc@319: void init_track_aux_data(Uns32T trackID, double* fvp, double** sNormpp,double** snPtrp, double** sPowerp, double** spPtrp); mas01mc@319: mas01mc@292: // Web Services mas01cr@0: 
void startServer(); mas01mc@308: void ws_status(const char*dbName, char* hostport); mas01mc@308: void ws_query(const char*dbName, const char *featureFileName, const char* hostport); mas01mc@308: void ws_query_by_key(const char*dbName, const char *trackKey, const char* hostport); mas01cr@0: mas01cr@0: }; mas01mc@17: /* Member initializers for class audioDB, written in member declaration order. hostport, inBuf and lsh_param_w are listed too, so that no pointer member or LSH parameter starts out with an indeterminate value. NOTE(review): presumably expanded in the audioDB constructors' initializer lists -- confirm at the use sites that none of them initializes these three members again. */ mas01mc@292: #define O2_AUDIODB_INITIALIZERS \ mas01mc@292: dim(0), \ mas01mc@292: dbName(0), \ mas01mc@292: inFile(0), hostport(0), \ mas01mc@292: key(0), \ mas01mc@292: trackFileName(0), \ mas01mc@292: trackFile(0), \ mas01mc@292: command(0), \ mas01mc@292: output(0), \ mas01mc@292: timesFileName(0), \ mas01mc@292: timesFile(0), \ mas01mc@292: powerFileName(0), \ mas01mc@292: powerFile(0), \ mas01mc@321: adb_root(0), \ mas01mc@321: adb_feature_root(0), \ mas01mc@321: powerfd(0), \ mas01mc@292: dbfid(0), \ mas01mc@292: lshfid(0), \ mas01mc@292: forWrite(false), \ mas01mc@292: infid(0), \ mas01mc@292: db(0), \ mas01mc@292: indata(0), \ mas01mc@292: dbH(0), \ mas01mc@292: rng(0), \ mas01mc@292: fileTable(0), \ mas01mc@292: trackTable(0), \ mas01mc@292: trackOffsetTable(0), \ mas01mc@292: dataBuf(0), inBuf(0), \ mas01mc@292: l2normTable(0), \ mas01mc@292: timesTable(0), \ mas01mc@314: powerTable(0), \ mas01mc@318: featureFileNameTable(0), \ mas01mc@318: timesFileNameTable(0), \ mas01mc@318: powerFileNameTable(0), \ mas01mc@292: fileTableLength(0), \ mas01mc@292: trackTableLength(0), \ mas01mc@292: dataBufLength(0), \ mas01mc@292: timesTableLength(0), \ mas01mc@292: powerTableLength(0), \ mas01mc@292: l2normTableLength(0), \ mas01mc@292: verbosity(1), \ mas01mc@292: nsamples(2000), \ mas01mc@292: datasize(O2_DEFAULT_DATASIZE), \ mas01mc@292: ntracks(O2_DEFAULT_NTRACKS), \ mas01mc@292: datadim(O2_DEFAULT_DATADIM), \ mas01mc@292: queryType(O2_POINT_QUERY), \ mas01mc@292: pointNN(O2_DEFAULT_POINTNN), \ mas01mc@292: trackNN(O2_DEFAULT_TRACKNN), \ mas01mc@292: sequenceLength(16), \ mas01mc@292: sequenceHop(1), \ mas01mc@292: normalizedDistance(true), \ mas01mc@292: 
no_unit_norming(false), \ mas01mc@292: queryPoint(0), \ mas01mc@292: usingQueryPoint(0), \ mas01mc@292: usingTimes(0), \ mas01mc@292: usingPower(0), \ mas01mc@292: isClient(0), \ mas01mc@292: isServer(0), \ mas01mc@292: port(0), \ mas01mc@292: timesTol(0.1), \ mas01mc@292: radius(0), \ mas01mc@292: query_from_key(false), \ mas01mc@292: query_from_key_index(O2_ERR_KEYNOTFOUND), \ mas01mc@292: use_absolute_threshold(false), \ mas01mc@292: absolute_threshold(0.0), \ mas01mc@292: use_relative_threshold(false), \ mas01mc@292: relative_threshold(0.0), \ mas01mc@292: reporter(0), \ mas01mc@292: exact_evaluation_queue(0), \ mas01mc@292: lsh(0), \ mas01mc@292: lsh_in_core(false), \ mas01mc@292: lsh_use_u_functions(false), \ mas01mc@292: lsh_exact(false), \ mas01mc@308: WS_load_index(false), lsh_param_w(0.0), \ mas01mc@292: lsh_param_k(0), \ mas01mc@292: lsh_param_m(0), \ mas01mc@292: lsh_param_N(0), \ mas01mc@292: lsh_param_b(0), \ mas01mc@292: lsh_param_ncols(0), \ mas01mc@319: lsh_n_point_bits(0), \ mas01mc@292: vv(0) mas01mc@292: #endif