annotate multiprobe.cpp @ 524:469b50a3dd84 multiprobeLSH

Fixed a bug in LSH hashtable writing to disk that doesn't always sort the t2 entries into strict weak ordering. Now it does. Lots of debugging informational code inserted.
author mas01mc
date Wed, 28 Jan 2009 16:02:17 +0000
parents dad3d252462a
children
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@518 4 * Given a vector of LSH boundary distances for a query,
mas01mc@518 5 * perform lookup by probing nearby hash-function locations
mas01mc@516 6 *
mas01mc@516 7 * Implementation using C++ STL
mas01mc@516 8 *
mas01mc@516 9 * Reference:
mas01mc@516 10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 12 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 13 *
mas01mc@516 14 *
mas01mc@516 15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 16 * License: GNU Public License 2.0
mas01mc@516 17 *
mas01mc@516 18 */
mas01mc@516 19
mas01mc@516 20 #include "multiprobe.h"
mas01mc@516 21
mas01mc@519 22 //#define _TEST_MP_LSH
mas01mc@516 23
mas01mc@519 24 bool operator> (const min_heap_element& a, const min_heap_element& b){
mas01mc@519 25 return a.score > b.score;
mas01mc@519 26 }
mas01mc@519 27
mas01mc@519 28 bool operator< (const min_heap_element& a, const min_heap_element& b){
mas01mc@519 29 return a.score < b.score;
mas01mc@519 30 }
mas01mc@519 31
mas01mc@519 32 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@519 33 return a.first > b.first;
mas01mc@519 34 }
mas01mc@519 35
mas01mc@519 36 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@519 37 return a.first < b.first;
mas01mc@519 38 }
mas01mc@519 39
mas01mc@519 40 MinHeapElement::MinHeapElement(perturbation_set a, float s):
mas01mc@518 41 perturbs(a),
mas01mc@518 42 score(s)
mas01mc@518 43 {
mas01mc@518 44
mas01mc@518 45 }
mas01mc@518 46
mas01mc@518 47 MinHeapElement::~MinHeapElement(){;}
mas01mc@516 48
mas01mc@517 49 MultiProbe::MultiProbe():
mas01mc@517 50 minHeap(0),
mas01mc@517 51 outSets(0),
mas01mc@517 52 distFuns(0),
mas01mc@517 53 numHashBoundaries(0)
mas01mc@517 54 {
mas01mc@518 55
mas01mc@516 56 }
mas01mc@516 57
mas01mc@516 58 MultiProbe::~MultiProbe(){
mas01mc@518 59 cleanup();
mas01mc@518 60 }
mas01mc@518 61
mas01mc@518 62 void MultiProbe::initialize(){
mas01mc@518 63 minHeap = new min_heap_of_perturbation_set();
mas01mc@518 64 outSets = new min_heap_of_perturbation_set();
mas01mc@518 65 }
mas01mc@518 66
mas01mc@518 67 void MultiProbe::cleanup(){
mas01mc@516 68 delete minHeap;
mas01mc@518 69 minHeap = 0;
mas01mc@516 70 delete outSets;
mas01mc@518 71 outSets = 0;
mas01mc@517 72 delete distFuns;
mas01mc@518 73 distFuns = 0;
mas01mc@516 74 }
mas01mc@516 75
mas01mc@519 76 size_t MultiProbe::size(){
mas01mc@518 77 return outSets->size();
mas01mc@518 78 }
mas01mc@518 79
mas01mc@519 80 bool MultiProbe::empty(){
mas01mc@518 81 return !outSets->size();
mas01mc@518 82 }
mas01mc@518 83
mas01mc@518 84
mas01mc@519 85 void MultiProbe::generatePerturbationSets(vector<float>& x, unsigned T){
mas01mc@519 86 cleanup(); // Make re-entrant
mas01mc@519 87 initialize();
mas01mc@519 88 makeSortedDistFuns(x);
mas01mc@519 89 algorithm1(T);
mas01mc@519 90 }
mas01mc@519 91
mas01mc@519 92 // overloading to support efficient array use without initial copy
mas01mc@519 93 void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){
mas01mc@519 94 cleanup(); // Make re-entrant
mas01mc@519 95 initialize();
mas01mc@519 96 makeSortedDistFuns(x, N);
mas01mc@519 97 algorithm1(T);
mas01mc@519 98 }
mas01mc@519 99
mas01mc@516 100 // Generate the optimal T perturbation sets for current query
mas01mc@516 101 // pre-conditions:
mas01mc@516 102 // an LSH structure was initialized and passed to constructor
mas01mc@516 103 // a query vector was passed to lsh->compute_hash_functions()
mas01mc@516 104 // the query-to-boundary distances are stored in x[hashFunIndex]
mas01mc@516 105 //
mas01mc@516 106 // post-conditions:
mas01mc@516 107 // generates an ordered list of perturbation sets (stored as an array of sets)
mas01mc@516 108 // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
mas01mc@516 109 // data structures are cleared and reset to zeros thereby making them re-entrant
mas01mc@516 110 //
mas01mc@519 111 void MultiProbe::algorithm1(unsigned T){
mas01mc@516 112 perturbation_set ai,as,ae;
mas01mc@519 113 float ai_score;
mas01mc@518 114 ai.insert(0); // Initialize for this query
mas01mc@518 115 minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
mas01mc@516 116
mas01mc@518 117 min_heap_element mhe = minHeap->top();
mas01mc@518 118
mas01mc@522 119 if(T>distFuns->size())
mas01mc@522 120 T = distFuns->size();
mas01mc@519 121 for(unsigned i = 0 ; i != T ; i++ ){
mas01mc@516 122 do{
mas01mc@518 123 mhe = minHeap->top();
mas01mc@518 124 ai = mhe.perturbs;
mas01mc@518 125 ai_score = mhe.score;
mas01mc@516 126 minHeap->pop();
mas01mc@518 127 as=ai;
mas01mc@517 128 shift(as);
mas01mc@518 129 minHeap->push(min_heap_element(as, score(as)));
mas01mc@518 130 ae=ai;
mas01mc@517 131 expand(ae);
mas01mc@518 132 minHeap->push(min_heap_element(ae, score(ae)));
mas01mc@516 133 }while(!valid(ai));
mas01mc@518 134 outSets->push(mhe); // Ordered list of perturbation sets
mas01mc@516 135 }
mas01mc@516 136 }
mas01mc@516 137
mas01mc@518 138 void MultiProbe::dump(perturbation_set a){
mas01mc@517 139 perturbation_set::iterator it = a.begin();
mas01mc@518 140 while(it != a.end()){
mas01mc@518 141 cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " "
mas01mc@518 142 << (*distFuns)[*it].first << *it << ", ";
mas01mc@518 143 it++;
mas01mc@518 144 }
mas01mc@518 145 cout << "(" << score(a) << ")";
mas01mc@518 146 cout << endl;
mas01mc@517 147 }
mas01mc@517 148
mas01mc@517 149 // Given the set a, add 1 to last element of the set
mas01mc@518 150 inline perturbation_set& MultiProbe::shift(perturbation_set& a){
mas01mc@517 151 perturbation_set::iterator it = a.end();
mas01mc@517 152 int val = *(--it) + 1;
mas01mc@517 153 a.erase(it);
mas01mc@517 154 a.insert(it,val);
mas01mc@519 155 return a;
mas01mc@517 156 }
mas01mc@517 157
mas01mc@517 158 // Given the set a, add a new element one greater than the max
mas01mc@518 159 inline perturbation_set& MultiProbe::expand(perturbation_set& a){
mas01mc@517 160 perturbation_set::reverse_iterator ri = a.rbegin();
mas01mc@517 161 a.insert(*ri+1);
mas01mc@519 162 return a;
mas01mc@517 163 }
mas01mc@516 164
mas01mc@516 165 // Take the list of distances (x) assuming list len is 2M and
mas01mc@516 166 // delta = (-1)^i, i = { 0 .. 2M-1 }
mas01mc@519 167 void MultiProbe::makeSortedDistFuns(vector<float>& x){
mas01mc@517 168 numHashBoundaries = x.size(); // x.size() == 2M
mas01mc@517 169 delete distFuns;
mas01mc@517 170 distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
mas01mc@519 171 for(unsigned i = 0; i != numHashBoundaries ; i++ )
mas01mc@520 172 (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
mas01mc@519 173 // SORT
mas01mc@519 174 sort( distFuns->begin(), distFuns->end() );
mas01mc@519 175 }
mas01mc@519 176
mas01mc@519 177 // Float array version of above
mas01mc@519 178 void MultiProbe::makeSortedDistFuns(float* x, unsigned N){
mas01mc@519 179 numHashBoundaries = N; // x.size() == 2M
mas01mc@519 180 delete distFuns;
mas01mc@519 181 distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
mas01mc@519 182 for(unsigned i = 0; i != numHashBoundaries ; i++ )
mas01mc@520 183 (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
mas01mc@517 184 // SORT
mas01mc@517 185 sort( distFuns->begin(), distFuns->end() );
mas01mc@516 186 }
mas01mc@516 187
mas01mc@517 188 // For a given perturbation set, the score is the
mas01mc@517 189 // sum of squares of corresponding distances in x
mas01mc@519 190 float MultiProbe::score(perturbation_set& a){
mas01mc@517 191 //assert(!a.empty());
mas01mc@519 192 float score = 0.0, tmp;
mas01mc@517 193 perturbation_set::iterator it;
mas01mc@517 194 it = a.begin();
mas01mc@517 195 do{
mas01mc@518 196 tmp = (*distFuns)[*it].first;
mas01mc@517 197 score += tmp*tmp;
mas01mc@518 198 }while( ++it != a.end() );
mas01mc@517 199 return score;
mas01mc@516 200 }
mas01mc@516 201
mas01mc@517 202 // A valid set must have at most one
mas01mc@517 203 // of the two elements {j, 2M + 1 - j} for every j
mas01mc@517 204 //
mas01mc@517 205 // A perturbation set containing an element > 2M is invalid
mas01mc@516 206 bool MultiProbe::valid(perturbation_set& a){
mas01mc@517 207 int j;
mas01mc@517 208 perturbation_set::iterator it = a.begin();
mas01mc@517 209 while( it != a.end() ){
mas01mc@518 210 j = *it;
mas01mc@518 211 it++;
mas01mc@520 212 if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries - j - 1 ) != a.end() ) )
mas01mc@517 213 return false;
mas01mc@517 214 }
mas01mc@517 215 return true;
mas01mc@516 216 }
mas01mc@516 217
mas01mc@519 218 int MultiProbe::getIndex(perturbation_set::iterator it){
mas01mc@519 219 return (*distFuns)[*it].second.first;
mas01mc@519 220 }
mas01mc@519 221
mas01mc@519 222 int MultiProbe::getBoundary(perturbation_set::iterator it){
mas01mc@519 223 return (*distFuns)[*it].second.second;
mas01mc@519 224 }
mas01mc@519 225
mas01mc@518 226 // copy return next perturbation_set
mas01mc@518 227 perturbation_set MultiProbe::getNextPerturbationSet(){
mas01mc@518 228 perturbation_set s = outSets->top().perturbs;
mas01mc@518 229 outSets->pop();
mas01mc@518 230 return s;
mas01mc@518 231 }
mas01mc@518 232
mas01mc@518 233 // Test routine: generate 100 random boundary distance pairs
mas01mc@518 234 // call generatePerturbationSets on these distances
mas01mc@518 235 // dump output for inspection
mas01mc@516 236 #ifdef _TEST_MP_LSH
mas01mc@516 237 int main(const int argc, const char* argv[]){
mas01mc@518 238 int N_SAMPS = 100; // Number of random samples
mas01mc@518 239 int W = 4; // simulated hash-bucket size
mas01mc@518 240 int N_ITER = 100; // How many re-entrant iterations
mas01mc@519 241 unsigned T = 10; // Number of multi-probe sets to generate
mas01mc@518 242
mas01mc@518 243 MultiProbe mp= MultiProbe();
mas01mc@519 244 vector<float> x(N_SAMPS);
mas01mc@518 245
mas01mc@518 246 srand((unsigned)time(0));
mas01mc@518 247
mas01mc@518 248 // Test re-entrance on single instance
mas01mc@518 249 for(int j = 0; j< N_ITER ; j++){
mas01mc@518 250 cout << "********** ITERATION " << j << " **********" << endl;
mas01mc@518 251 cout.flush();
mas01mc@518 252 for (int i = 0 ; i != x.size()/2 ; i++ ){
mas01mc@518 253 x[2*i] = W*(rand()/(RAND_MAX+1.0));
mas01mc@518 254 x[2*i+1] = W - x[2*i];
mas01mc@518 255 }
mas01mc@518 256 // Generate multi-probe sets
mas01mc@518 257 mp.generatePerturbationSets(x, T);
mas01mc@518 258 // Output contents of multi-probe sets
mas01mc@518 259 while(!mp.empty())
mas01mc@518 260 mp.dump(mp.getNextPerturbationSet());
mas01mc@518 261 }
mas01mc@516 262 }
mas01mc@516 263 #endif