annotate multiprobe.cpp @ 518:ca1ee92c359c multiprobeLSH

Completed initial implementation of LSH MultiProbe class. Now needs to be joined to lshlib.cpp retrieve() method to perform multi-probe queries.
author mas01mc
date Mon, 26 Jan 2009 02:50:44 +0000
parents 807c8be7dd45
children fdcd436d7cbd
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@518 4 * Given a vector of LSH boundary distances for a query,
mas01mc@518 5 * perform lookup by probing nearby hash-function locations
mas01mc@516 6 *
mas01mc@516 7 * Implementation using C++ STL
mas01mc@516 8 *
mas01mc@516 9 * Reference:
mas01mc@516 10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 12 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 13 *
mas01mc@516 14 *
mas01mc@516 15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 16 * License: GNU Public License 2.0
mas01mc@516 17 *
mas01mc@516 18 */
mas01mc@516 19
mas01mc@516 20 #include "multiprobe.h"
mas01mc@516 21
mas01mc@518 22 #define _TEST_MP_LSH
mas01mc@516 23
mas01mc@518 24 MinHeapElement::MinHeapElement(perturbation_set a, double s):
mas01mc@518 25 perturbs(a),
mas01mc@518 26 score(s)
mas01mc@518 27 {
mas01mc@518 28
mas01mc@518 29 }
mas01mc@518 30
mas01mc@518 31 MinHeapElement::~MinHeapElement(){;}
mas01mc@516 32
mas01mc@517 33 MultiProbe::MultiProbe():
mas01mc@517 34 minHeap(0),
mas01mc@517 35 outSets(0),
mas01mc@517 36 distFuns(0),
mas01mc@517 37 numHashBoundaries(0)
mas01mc@517 38 {
mas01mc@518 39
mas01mc@516 40 }
mas01mc@516 41
mas01mc@516 42 MultiProbe::~MultiProbe(){
mas01mc@518 43 cleanup();
mas01mc@518 44 }
mas01mc@518 45
mas01mc@518 46 void MultiProbe::initialize(){
mas01mc@518 47 minHeap = new min_heap_of_perturbation_set();
mas01mc@518 48 outSets = new min_heap_of_perturbation_set();
mas01mc@518 49 }
mas01mc@518 50
mas01mc@518 51 void MultiProbe::cleanup(){
mas01mc@516 52 delete minHeap;
mas01mc@518 53 minHeap = 0;
mas01mc@516 54 delete outSets;
mas01mc@518 55 outSets = 0;
mas01mc@517 56 delete distFuns;
mas01mc@518 57 distFuns = 0;
mas01mc@516 58 }
mas01mc@516 59
mas01mc@518 60 inline size_t MultiProbe::size(){
mas01mc@518 61 return outSets->size();
mas01mc@518 62 }
mas01mc@518 63
mas01mc@518 64 inline bool MultiProbe::empty(){
mas01mc@518 65 return !outSets->size();
mas01mc@518 66 }
mas01mc@518 67
mas01mc@518 68
mas01mc@516 69 // Generate the optimal T perturbation sets for current query
mas01mc@516 70 // pre-conditions:
mas01mc@516 71 // an LSH structure was initialized and passed to constructor
mas01mc@516 72 // a query vector was passed to lsh->compute_hash_functions()
mas01mc@516 73 // the query-to-boundary distances are stored in x[hashFunIndex]
mas01mc@516 74 //
mas01mc@516 75 // post-conditions:
mas01mc@516 76 // generates an ordered list of perturbation sets (stored as an array of sets)
mas01mc@516 77 // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
mas01mc@516 78 // data structures are cleared and reset to zeros thereby making them re-entrant
mas01mc@516 79 //
mas01mc@518 80 void MultiProbe::generatePerturbationSets(vector<double>& x, int T){
mas01mc@516 81 perturbation_set ai,as,ae;
mas01mc@518 82 double ai_score;
mas01mc@516 83
mas01mc@518 84 cleanup(); // Make re-entrant
mas01mc@518 85 initialize();
mas01mc@517 86 makeSortedDistFuns(x);
mas01mc@516 87
mas01mc@518 88 ai.insert(0); // Initialize for this query
mas01mc@518 89 minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
mas01mc@516 90
mas01mc@518 91 min_heap_element mhe = minHeap->top();
mas01mc@518 92
mas01mc@518 93 for(int i = 0 ; i != T ; i++ ){
mas01mc@516 94 do{
mas01mc@518 95 if(minHeap->empty())
mas01mc@518 96 break;
mas01mc@518 97 mhe = minHeap->top();
mas01mc@518 98 ai = mhe.perturbs;
mas01mc@518 99 ai_score = mhe.score;
mas01mc@516 100 minHeap->pop();
mas01mc@518 101 as=ai;
mas01mc@517 102 shift(as);
mas01mc@518 103 minHeap->push(min_heap_element(as, score(as)));
mas01mc@518 104 ae=ai;
mas01mc@517 105 expand(ae);
mas01mc@518 106 minHeap->push(min_heap_element(ae, score(ae)));
mas01mc@516 107 }while(!valid(ai));
mas01mc@518 108 outSets->push(mhe); // Ordered list of perturbation sets
mas01mc@516 109 }
mas01mc@516 110 }
mas01mc@516 111
mas01mc@518 112 void MultiProbe::dump(perturbation_set a){
mas01mc@517 113 perturbation_set::iterator it = a.begin();
mas01mc@518 114 while(it != a.end()){
mas01mc@518 115 cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " "
mas01mc@518 116 << (*distFuns)[*it].first << *it << ", ";
mas01mc@518 117 it++;
mas01mc@518 118 }
mas01mc@518 119 cout << "(" << score(a) << ")";
mas01mc@518 120 cout << endl;
mas01mc@517 121 }
mas01mc@517 122
mas01mc@517 123 // Given the set a, add 1 to last element of the set
mas01mc@518 124 inline perturbation_set& MultiProbe::shift(perturbation_set& a){
mas01mc@517 125 perturbation_set::iterator it = a.end();
mas01mc@517 126 int val = *(--it) + 1;
mas01mc@517 127 a.erase(it);
mas01mc@517 128 a.insert(it,val);
mas01mc@517 129 }
mas01mc@517 130
mas01mc@517 131 // Given the set a, add a new element one greater than the max
mas01mc@518 132 inline perturbation_set& MultiProbe::expand(perturbation_set& a){
mas01mc@517 133 perturbation_set::reverse_iterator ri = a.rbegin();
mas01mc@517 134 a.insert(*ri+1);
mas01mc@517 135 }
mas01mc@516 136
mas01mc@516 137 // Take the list of distances (x) assuming list len is 2M and
mas01mc@516 138 // delta = (-1)^i, i = { 0 .. 2M-1 }
mas01mc@517 139 void MultiProbe::makeSortedDistFuns(vector<double>& x){
mas01mc@517 140 numHashBoundaries = x.size(); // x.size() == 2M
mas01mc@517 141 delete distFuns;
mas01mc@517 142 distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
mas01mc@518 143 for(int i = 0; i != numHashBoundaries ; i++ )
mas01mc@518 144 (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
mas01mc@517 145 // SORT
mas01mc@517 146 sort( distFuns->begin(), distFuns->end() );
mas01mc@516 147 }
mas01mc@516 148
mas01mc@517 149 // For a given perturbation set, the score is the
mas01mc@517 150 // sum of squares of corresponding distances in x
mas01mc@516 151 double MultiProbe::score(perturbation_set& a){
mas01mc@517 152 //assert(!a.empty());
mas01mc@517 153 double score = 0.0, tmp;
mas01mc@517 154 perturbation_set::iterator it;
mas01mc@517 155 it = a.begin();
mas01mc@517 156 do{
mas01mc@518 157 tmp = (*distFuns)[*it].first;
mas01mc@517 158 score += tmp*tmp;
mas01mc@518 159 }while( ++it != a.end() );
mas01mc@517 160 return score;
mas01mc@516 161 }
mas01mc@516 162
mas01mc@517 163 // A valid set must have at most one
mas01mc@517 164 // of the two elements {j, 2M + 1 - j} for every j
mas01mc@517 165 //
mas01mc@517 166 // A perturbation set containing an element > 2M is invalid
mas01mc@516 167 bool MultiProbe::valid(perturbation_set& a){
mas01mc@517 168 int j;
mas01mc@517 169 perturbation_set::iterator it = a.begin();
mas01mc@517 170 while( it != a.end() ){
mas01mc@518 171 j = *it;
mas01mc@518 172 it++;
mas01mc@517 173 if( ( j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) )
mas01mc@517 174 return false;
mas01mc@517 175 }
mas01mc@517 176 return true;
mas01mc@516 177 }
mas01mc@516 178
mas01mc@518 179 // copy return next perturbation_set
mas01mc@518 180 perturbation_set MultiProbe::getNextPerturbationSet(){
mas01mc@518 181 perturbation_set s = outSets->top().perturbs;
mas01mc@518 182 outSets->pop();
mas01mc@518 183 return s;
mas01mc@518 184 }
mas01mc@518 185
mas01mc@518 186 // Test routine: generate 100 random boundary distance pairs
mas01mc@518 187 // call generatePerturbationSets on these distances
mas01mc@518 188 // dump output for inspection
mas01mc@516 189 #ifdef _TEST_MP_LSH
mas01mc@516 190 int main(const int argc, const char* argv[]){
mas01mc@518 191 int N_SAMPS = 100; // Number of random samples
mas01mc@518 192 int W = 4; // simulated hash-bucket size
mas01mc@518 193 int N_ITER = 100; // How many re-entrant iterations
mas01mc@518 194 int T = 10; // Number of multi-probe sets to generate
mas01mc@518 195
mas01mc@518 196 MultiProbe mp= MultiProbe();
mas01mc@518 197 vector<double> x(N_SAMPS);
mas01mc@518 198
mas01mc@518 199 srand((unsigned)time(0));
mas01mc@518 200
mas01mc@518 201 // Test re-entrance on single instance
mas01mc@518 202 for(int j = 0; j< N_ITER ; j++){
mas01mc@518 203 cout << "********** ITERATION " << j << " **********" << endl;
mas01mc@518 204 cout.flush();
mas01mc@518 205 for (int i = 0 ; i != x.size()/2 ; i++ ){
mas01mc@518 206 x[2*i] = W*(rand()/(RAND_MAX+1.0));
mas01mc@518 207 x[2*i+1] = W - x[2*i];
mas01mc@518 208 }
mas01mc@518 209 // Generate multi-probe sets
mas01mc@518 210 mp.generatePerturbationSets(x, T);
mas01mc@518 211 // Output contents of multi-probe sets
mas01mc@518 212 while(!mp.empty())
mas01mc@518 213 mp.dump(mp.getNextPerturbationSet());
mas01mc@518 214 }
mas01mc@516 215 }
mas01mc@516 216 #endif