annotate multiprobe.cpp @ 755:37c2b9cce23a multiprobeLSH

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents dad3d252462a
children
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@518 4 * Given a vector of LSH boundary distances for a query,
mas01mc@518 5 * perform lookup by probing nearby hash-function locations
mas01mc@516 6 *
mas01mc@516 7 * Implementation using C++ STL
mas01mc@516 8 *
mas01mc@516 9 * Reference:
mas01mc@516 10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 12 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 13 *
mas01mc@516 14 *
mas01mc@516 15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 16 * License: GNU Public License 2.0
mas01mc@516 17 *
mas01mc@516 18 */
mas01mc@516 19
mas01mc@516 20 #include "multiprobe.h"
mas01mc@516 21
mas01mc@519 22 //#define _TEST_MP_LSH
mas01mc@516 23
mas01mc@519 24 bool operator> (const min_heap_element& a, const min_heap_element& b){
mas01mc@519 25 return a.score > b.score;
mas01mc@519 26 }
mas01mc@519 27
mas01mc@519 28 bool operator< (const min_heap_element& a, const min_heap_element& b){
mas01mc@519 29 return a.score < b.score;
mas01mc@519 30 }
mas01mc@519 31
mas01mc@519 32 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@519 33 return a.first > b.first;
mas01mc@519 34 }
mas01mc@519 35
mas01mc@519 36 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@519 37 return a.first < b.first;
mas01mc@519 38 }
mas01mc@519 39
mas01mc@519 40 MinHeapElement::MinHeapElement(perturbation_set a, float s):
mas01mc@518 41 perturbs(a),
mas01mc@518 42 score(s)
mas01mc@518 43 {
mas01mc@518 44
mas01mc@518 45 }
mas01mc@518 46
mas01mc@518 47 MinHeapElement::~MinHeapElement(){;}
mas01mc@516 48
mas01mc@517 49 MultiProbe::MultiProbe():
mas01mc@517 50 minHeap(0),
mas01mc@517 51 outSets(0),
mas01mc@517 52 distFuns(0),
mas01mc@517 53 numHashBoundaries(0)
mas01mc@517 54 {
mas01mc@518 55
mas01mc@516 56 }
mas01mc@516 57
mas01mc@516 58 MultiProbe::~MultiProbe(){
mas01mc@518 59 cleanup();
mas01mc@518 60 }
mas01mc@518 61
mas01mc@518 62 void MultiProbe::initialize(){
mas01mc@518 63 minHeap = new min_heap_of_perturbation_set();
mas01mc@518 64 outSets = new min_heap_of_perturbation_set();
mas01mc@518 65 }
mas01mc@518 66
mas01mc@518 67 void MultiProbe::cleanup(){
mas01mc@516 68 delete minHeap;
mas01mc@518 69 minHeap = 0;
mas01mc@516 70 delete outSets;
mas01mc@518 71 outSets = 0;
mas01mc@517 72 delete distFuns;
mas01mc@518 73 distFuns = 0;
mas01mc@516 74 }
mas01mc@516 75
mas01mc@519 76 size_t MultiProbe::size(){
mas01mc@518 77 return outSets->size();
mas01mc@518 78 }
mas01mc@518 79
mas01mc@519 80 bool MultiProbe::empty(){
mas01mc@518 81 return !outSets->size();
mas01mc@518 82 }
mas01mc@518 83
mas01mc@518 84
mas01mc@519 85 void MultiProbe::generatePerturbationSets(vector<float>& x, unsigned T){
mas01mc@519 86 cleanup(); // Make re-entrant
mas01mc@519 87 initialize();
mas01mc@519 88 makeSortedDistFuns(x);
mas01mc@519 89 algorithm1(T);
mas01mc@519 90 }
mas01mc@519 91
mas01mc@519 92 // overloading to support efficient array use without initial copy
mas01mc@519 93 void MultiProbe::generatePerturbationSets(float* x, unsigned N, unsigned T){
mas01mc@519 94 cleanup(); // Make re-entrant
mas01mc@519 95 initialize();
mas01mc@519 96 makeSortedDistFuns(x, N);
mas01mc@519 97 algorithm1(T);
mas01mc@519 98 }
mas01mc@519 99
mas01mc@516 100 // Generate the optimal T perturbation sets for current query
mas01mc@516 101 // pre-conditions:
mas01mc@516 102 // an LSH structure was initialized and passed to constructor
mas01mc@516 103 // a query vector was passed to lsh->compute_hash_functions()
mas01mc@516 104 // the query-to-boundary distances are stored in x[hashFunIndex]
mas01mc@516 105 //
mas01mc@516 106 // post-conditions:
mas01mc@516 107 // generates an ordered list of perturbation sets (stored as an array of sets)
mas01mc@516 108 // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
mas01mc@516 109 // data structures are cleared and reset to zeros thereby making them re-entrant
mas01mc@516 110 //
mas01mc@519 111 void MultiProbe::algorithm1(unsigned T){
mas01mc@516 112 perturbation_set ai,as,ae;
mas01mc@519 113 float ai_score;
mas01mc@518 114 ai.insert(0); // Initialize for this query
mas01mc@518 115 minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
mas01mc@516 116
mas01mc@518 117 min_heap_element mhe = minHeap->top();
mas01mc@518 118
mas01mc@522 119 if(T>distFuns->size())
mas01mc@522 120 T = distFuns->size();
mas01mc@519 121 for(unsigned i = 0 ; i != T ; i++ ){
mas01mc@516 122 do{
mas01mc@518 123 mhe = minHeap->top();
mas01mc@518 124 ai = mhe.perturbs;
mas01mc@518 125 ai_score = mhe.score;
mas01mc@516 126 minHeap->pop();
mas01mc@518 127 as=ai;
mas01mc@517 128 shift(as);
mas01mc@518 129 minHeap->push(min_heap_element(as, score(as)));
mas01mc@518 130 ae=ai;
mas01mc@517 131 expand(ae);
mas01mc@518 132 minHeap->push(min_heap_element(ae, score(ae)));
mas01mc@516 133 }while(!valid(ai));
mas01mc@518 134 outSets->push(mhe); // Ordered list of perturbation sets
mas01mc@516 135 }
mas01mc@516 136 }
mas01mc@516 137
mas01mc@518 138 void MultiProbe::dump(perturbation_set a){
mas01mc@517 139 perturbation_set::iterator it = a.begin();
mas01mc@518 140 while(it != a.end()){
mas01mc@518 141 cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " "
mas01mc@518 142 << (*distFuns)[*it].first << *it << ", ";
mas01mc@518 143 it++;
mas01mc@518 144 }
mas01mc@518 145 cout << "(" << score(a) << ")";
mas01mc@518 146 cout << endl;
mas01mc@517 147 }
mas01mc@517 148
mas01mc@517 149 // Given the set a, add 1 to last element of the set
mas01mc@518 150 inline perturbation_set& MultiProbe::shift(perturbation_set& a){
mas01mc@517 151 perturbation_set::iterator it = a.end();
mas01mc@517 152 int val = *(--it) + 1;
mas01mc@517 153 a.erase(it);
mas01mc@517 154 a.insert(it,val);
mas01mc@519 155 return a;
mas01mc@517 156 }
mas01mc@517 157
mas01mc@517 158 // Given the set a, add a new element one greater than the max
mas01mc@518 159 inline perturbation_set& MultiProbe::expand(perturbation_set& a){
mas01mc@517 160 perturbation_set::reverse_iterator ri = a.rbegin();
mas01mc@517 161 a.insert(*ri+1);
mas01mc@519 162 return a;
mas01mc@517 163 }
mas01mc@516 164
mas01mc@516 165 // Take the list of distances (x) assuming list len is 2M and
mas01mc@516 166 // delta = (-1)^i, i = { 0 .. 2M-1 }
mas01mc@519 167 void MultiProbe::makeSortedDistFuns(vector<float>& x){
mas01mc@517 168 numHashBoundaries = x.size(); // x.size() == 2M
mas01mc@517 169 delete distFuns;
mas01mc@517 170 distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
mas01mc@519 171 for(unsigned i = 0; i != numHashBoundaries ; i++ )
mas01mc@520 172 (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
mas01mc@519 173 // SORT
mas01mc@519 174 sort( distFuns->begin(), distFuns->end() );
mas01mc@519 175 }
mas01mc@519 176
mas01mc@519 177 // Float array version of above
mas01mc@519 178 void MultiProbe::makeSortedDistFuns(float* x, unsigned N){
mas01mc@519 179 numHashBoundaries = N; // x.size() == 2M
mas01mc@519 180 delete distFuns;
mas01mc@519 181 distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
mas01mc@519 182 for(unsigned i = 0; i != numHashBoundaries ; i++ )
mas01mc@520 183 (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
mas01mc@517 184 // SORT
mas01mc@517 185 sort( distFuns->begin(), distFuns->end() );
mas01mc@516 186 }
mas01mc@516 187
mas01mc@517 188 // For a given perturbation set, the score is the
mas01mc@517 189 // sum of squares of corresponding distances in x
mas01mc@519 190 float MultiProbe::score(perturbation_set& a){
mas01mc@517 191 //assert(!a.empty());
mas01mc@519 192 float score = 0.0, tmp;
mas01mc@517 193 perturbation_set::iterator it;
mas01mc@517 194 it = a.begin();
mas01mc@517 195 do{
mas01mc@518 196 tmp = (*distFuns)[*it].first;
mas01mc@517 197 score += tmp*tmp;
mas01mc@518 198 }while( ++it != a.end() );
mas01mc@517 199 return score;
mas01mc@516 200 }
mas01mc@516 201
mas01mc@517 202 // A valid set must have at most one
mas01mc@517 203 // of the two elements {j, 2M + 1 - j} for every j
mas01mc@517 204 //
mas01mc@517 205 // A perturbation set containing an element > 2M is invalid
mas01mc@516 206 bool MultiProbe::valid(perturbation_set& a){
mas01mc@517 207 int j;
mas01mc@517 208 perturbation_set::iterator it = a.begin();
mas01mc@517 209 while( it != a.end() ){
mas01mc@518 210 j = *it;
mas01mc@518 211 it++;
mas01mc@520 212 if( ( (unsigned)j > numHashBoundaries ) || ( a.find( numHashBoundaries - j - 1 ) != a.end() ) )
mas01mc@517 213 return false;
mas01mc@517 214 }
mas01mc@517 215 return true;
mas01mc@516 216 }
mas01mc@516 217
mas01mc@519 218 int MultiProbe::getIndex(perturbation_set::iterator it){
mas01mc@519 219 return (*distFuns)[*it].second.first;
mas01mc@519 220 }
mas01mc@519 221
mas01mc@519 222 int MultiProbe::getBoundary(perturbation_set::iterator it){
mas01mc@519 223 return (*distFuns)[*it].second.second;
mas01mc@519 224 }
mas01mc@519 225
mas01mc@518 226 // copy return next perturbation_set
mas01mc@518 227 perturbation_set MultiProbe::getNextPerturbationSet(){
mas01mc@518 228 perturbation_set s = outSets->top().perturbs;
mas01mc@518 229 outSets->pop();
mas01mc@518 230 return s;
mas01mc@518 231 }
mas01mc@518 232
mas01mc@518 233 // Test routine: generate 100 random boundary distance pairs
mas01mc@518 234 // call generatePerturbationSets on these distances
mas01mc@518 235 // dump output for inspection
mas01mc@516 236 #ifdef _TEST_MP_LSH
mas01mc@516 237 int main(const int argc, const char* argv[]){
mas01mc@518 238 int N_SAMPS = 100; // Number of random samples
mas01mc@518 239 int W = 4; // simulated hash-bucket size
mas01mc@518 240 int N_ITER = 100; // How many re-entrant iterations
mas01mc@519 241 unsigned T = 10; // Number of multi-probe sets to generate
mas01mc@518 242
mas01mc@518 243 MultiProbe mp= MultiProbe();
mas01mc@519 244 vector<float> x(N_SAMPS);
mas01mc@518 245
mas01mc@518 246 srand((unsigned)time(0));
mas01mc@518 247
mas01mc@518 248 // Test re-entrance on single instance
mas01mc@518 249 for(int j = 0; j< N_ITER ; j++){
mas01mc@518 250 cout << "********** ITERATION " << j << " **********" << endl;
mas01mc@518 251 cout.flush();
mas01mc@518 252 for (int i = 0 ; i != x.size()/2 ; i++ ){
mas01mc@518 253 x[2*i] = W*(rand()/(RAND_MAX+1.0));
mas01mc@518 254 x[2*i+1] = W - x[2*i];
mas01mc@518 255 }
mas01mc@518 256 // Generate multi-probe sets
mas01mc@518 257 mp.generatePerturbationSets(x, T);
mas01mc@518 258 // Output contents of multi-probe sets
mas01mc@518 259 while(!mp.empty())
mas01mc@518 260 mp.dump(mp.getNextPerturbationSet());
mas01mc@518 261 }
mas01mc@516 262 }
mas01mc@516 263 #endif