annotate index-utils.cpp @ 755:37c2b9cce23a multiprobeLSH

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents cc2b97d020b1
children 9119f2fa3efe
rev   line source
mas01cr@509 1 extern "C" {
mas01cr@509 2 #include "audioDB_API.h"
mas01cr@509 3 }
mas01cr@509 4 #include "audioDB-internals.h"
mas01cr@509 5
mas01cr@509 6 /*
mas01cr@509 7 * Routines which are common to both indexed query and index creation:
mas01cr@509 8 * we put them in their own file for build logistics.
mas01cr@509 9 */
mas01cr@509 10
mas01cr@509 11 /* FIXME: there are several things wrong with this: the memory
mas01cr@509 12 * discipline isn't ideal, the radius printing is a bit lame, the name
mas01cr@509 13 * getting will succeed or fail depending on whether the path was
mas01cr@509 14 * relative or absolute -- but most importantly encoding all that
mas01cr@509 15 * information in a filename is going to lose: it's impossible to
mas01cr@509 16 * maintain backwards-compatibility. Instead we should probably store
mas01cr@509 17 * the index metadata inside the audiodb instance. */
mas01cr@509 18 char *audiodb_index_get_name(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@509 19 char *indexName;
mas01cr@509 20 if(strlen(dbName) > (ADB_MAXSTR - 32)) {
mas01cr@509 21 return NULL;
mas01cr@509 22 }
mas01cr@509 23 indexName = new char[ADB_MAXSTR];
mas01cr@509 24 strncpy(indexName, dbName, ADB_MAXSTR);
mas01cr@509 25 sprintf(indexName+strlen(dbName), ".lsh.%019.9f.%d", radius, sequenceLength);
mas01cr@509 26 return indexName;
mas01cr@509 27 }
mas01cr@509 28
mas01cr@509 29 bool audiodb_index_exists(const char *dbName, double radius, Uns32T sequenceLength) {
mas01cr@509 30 char *indexName = audiodb_index_get_name(dbName, radius, sequenceLength);
mas01cr@509 31 if(!indexName) {
mas01cr@509 32 return false;
mas01cr@509 33 }
mas01cr@509 34 struct stat st;
mas01cr@509 35 if(stat(indexName, &st)) {
mas01cr@509 36 delete [] indexName;
mas01cr@509 37 return false;
mas01cr@509 38 }
mas01cr@509 39 /* FIXME: other stat checks here? */
mas01cr@509 40 /* FIXME: is there any better way to check whether we can open a
mas01cr@509 41 * file for reading than by opening a file for reading? */
mas01cr@509 42 int fd = open(indexName, O_RDONLY);
mas01cr@509 43 delete [] indexName;
mas01cr@509 44 if(fd < 0) {
mas01cr@509 45 return false;
mas01cr@509 46 } else {
mas01cr@509 47 close(fd);
mas01cr@509 48 return true;
mas01cr@509 49 }
mas01cr@509 50 }
mas01cr@509 51
mas01cr@509 52 /* FIXME: the indexName arg should be "const char *", but the LSH
mas01cr@509 53 * library doesn't like that.
mas01cr@509 54 */
mas01cr@509 55 LSH *audiodb_index_allocate(adb_t *adb, char *indexName, bool load_tables) {
mas01cr@509 56 LSH *lsh;
mas01cr@509 57 if(adb->cached_lsh) {
mas01cr@509 58 if(!strncmp(adb->cached_lsh->get_indexName(), indexName, ADB_MAXSTR)) {
mas01cr@509 59 return adb->cached_lsh;
mas01cr@509 60 } else {
mas01cr@509 61 delete adb->cached_lsh;
mas01cr@509 62 }
mas01cr@509 63 }
mas01cr@509 64 lsh = new LSH(indexName, load_tables);
mas01cr@509 65 if(load_tables) {
mas01cr@509 66 adb->cached_lsh = lsh;
mas01cr@509 67 }
mas01cr@509 68 return lsh;
mas01cr@509 69 }
mas01cr@509 70
mas01cr@509 71 vector<vector<float> > *audiodb_index_initialize_shingles(Uns32T sz, Uns32T dim, Uns32T seqLen) {
mas01cr@509 72 std::vector<std::vector<float> > *vv = new vector<vector<float> >(sz);
mas01cr@509 73 for(Uns32T i=0 ; i < sz ; i++) {
mas01cr@509 74 (*vv)[i]=vector<float>(dim * seqLen);
mas01cr@509 75 }
mas01cr@509 76 return vv;
mas01cr@509 77 }
mas01cr@509 78
/*
 * Release a shingle collection allocated with new, such as the one
 * returned by audiodb_index_initialize_shingles().  A null pointer is
 * accepted and ignored.
 */
void audiodb_index_delete_shingles(vector<vector<float> > *vv) {
  if(vv) {
    delete vv;
  }
}
mas01cr@509 82
mas01cr@509 83 void audiodb_index_make_shingle(vector<vector<float> >* vv, Uns32T idx, double* fvp, Uns32T dim, Uns32T seqLen){
mas01cr@509 84 assert(idx<(*vv).size());
mas01cr@509 85 vector<float>::iterator ve = (*vv)[idx].end();
mas01cr@509 86 vector<float>::iterator vi = (*vv)[idx].begin();
mas01cr@509 87 // First feature vector in shingle
mas01cr@509 88 if(idx == 0) {
mas01cr@509 89 while(vi!=ve) {
mas01cr@509 90 *vi++ = (float)(*fvp++);
mas01cr@509 91 }
mas01cr@509 92 } else {
mas01cr@509 93 // Not first feature vector in shingle
mas01cr@509 94 vector<float>::iterator ui=(*vv)[idx-1].begin() + dim;
mas01cr@509 95 // Previous seqLen-1 dim-vectors
mas01cr@509 96 while(vi!=ve-dim) {
mas01cr@509 97 *vi++ = *ui++;
mas01cr@509 98 }
mas01cr@509 99 // Move data pointer to next feature vector
mas01cr@509 100 fvp += ( seqLen + idx - 1 ) * dim ;
mas01cr@509 101 // New d-vector
mas01cr@509 102 while(vi!=ve) {
mas01cr@509 103 *vi++ = (float)(*fvp++);
mas01cr@509 104 }
mas01cr@509 105 }
mas01cr@509 106 }
mas01cr@509 107
mas01cr@509 108 // in-place norming, no deletions. If using power, return number of
mas01cr@509 109 // shingles above power threshold.
mas01cr@509 110 int audiodb_index_norm_shingles(vector<vector<float> >* vv, double* snp, double* spp, Uns32T dim, Uns32T seqLen, double radius, bool normed_vectors, bool use_pthreshold, float pthreshold) {
mas01cr@509 111 int z = 0; // number of above-threshold shingles
mas01cr@509 112 float l2norm;
mas01cr@509 113 double power;
mas01cr@509 114 float oneOverRadius = 1./(float)sqrt(radius); // Passed radius is really radius^2
mas01cr@509 115 float oneOverSqrtl2NormDivRad = oneOverRadius;
mas01cr@509 116 Uns32T shingleSize = seqLen * dim;
mas01cr@509 117
mas01cr@509 118 if(!spp) {
mas01cr@509 119 return -1;
mas01cr@509 120 }
mas01cr@509 121 for(Uns32T a=0; a<(*vv).size(); a++){
mas01cr@509 122 l2norm = (float)(*snp++);
mas01cr@509 123 if(normed_vectors)
mas01cr@509 124 oneOverSqrtl2NormDivRad = (1./l2norm)*oneOverRadius;
mas01cr@509 125
mas01cr@509 126 for(Uns32T b=0; b < shingleSize ; b++)
mas01cr@509 127 (*vv)[a][b]*=oneOverSqrtl2NormDivRad;
mas01cr@509 128
mas01cr@509 129 power = *spp++;
mas01cr@509 130 if(use_pthreshold){
mas01cr@509 131 if (power >= pthreshold)
mas01cr@509 132 z++;
mas01cr@509 133 }
mas01cr@509 134 else
mas01cr@509 135 z++;
mas01cr@509 136 }
mas01cr@509 137 return z;
mas01cr@509 138 }