mas01mc@516
|
1 /*
|
mas01mc@516
|
2 * MultiProbe C++ class
|
mas01mc@516
|
3 *
|
mas01mc@518
|
4 * Given a vector of LSH boundary distances for a query,
|
mas01mc@518
|
5 * perform lookup by probing nearby hash-function locations
|
mas01mc@516
|
6 *
|
mas01mc@516
|
7 * Implementation using C++ STL
|
mas01mc@516
|
8 *
|
mas01mc@516
|
9 * Reference:
|
mas01mc@516
|
10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
12 * Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
13 *
|
mas01mc@516
|
14 *
|
mas01mc@516
|
15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
|
mas01mc@516
|
16 * License: GNU Public License 2.0
|
mas01mc@516
|
17 *
|
mas01mc@516
|
18 */
|
mas01mc@516
|
19
|
mas01mc@516
|
20 #include "multiprobe.h"
|
mas01mc@516
|
21
|
mas01mc@518
|
22 #define _TEST_MP_LSH
|
mas01mc@516
|
23
|
mas01mc@518
|
24 MinHeapElement::MinHeapElement(perturbation_set a, double s):
|
mas01mc@518
|
25 perturbs(a),
|
mas01mc@518
|
26 score(s)
|
mas01mc@518
|
27 {
|
mas01mc@518
|
28
|
mas01mc@518
|
29 }
|
mas01mc@518
|
30
|
mas01mc@518
|
31 MinHeapElement::~MinHeapElement(){;}
|
mas01mc@516
|
32
|
mas01mc@517
|
33 MultiProbe::MultiProbe():
|
mas01mc@517
|
34 minHeap(0),
|
mas01mc@517
|
35 outSets(0),
|
mas01mc@517
|
36 distFuns(0),
|
mas01mc@517
|
37 numHashBoundaries(0)
|
mas01mc@517
|
38 {
|
mas01mc@518
|
39
|
mas01mc@516
|
40 }
|
mas01mc@516
|
41
|
mas01mc@516
|
42 MultiProbe::~MultiProbe(){
|
mas01mc@518
|
43 cleanup();
|
mas01mc@518
|
44 }
|
mas01mc@518
|
45
|
mas01mc@518
|
46 void MultiProbe::initialize(){
|
mas01mc@518
|
47 minHeap = new min_heap_of_perturbation_set();
|
mas01mc@518
|
48 outSets = new min_heap_of_perturbation_set();
|
mas01mc@518
|
49 }
|
mas01mc@518
|
50
|
mas01mc@518
|
51 void MultiProbe::cleanup(){
|
mas01mc@516
|
52 delete minHeap;
|
mas01mc@518
|
53 minHeap = 0;
|
mas01mc@516
|
54 delete outSets;
|
mas01mc@518
|
55 outSets = 0;
|
mas01mc@517
|
56 delete distFuns;
|
mas01mc@518
|
57 distFuns = 0;
|
mas01mc@516
|
58 }
|
mas01mc@516
|
59
|
mas01mc@518
|
60 inline size_t MultiProbe::size(){
|
mas01mc@518
|
61 return outSets->size();
|
mas01mc@518
|
62 }
|
mas01mc@518
|
63
|
mas01mc@518
|
64 inline bool MultiProbe::empty(){
|
mas01mc@518
|
65 return !outSets->size();
|
mas01mc@518
|
66 }
|
mas01mc@518
|
67
|
mas01mc@518
|
68
|
mas01mc@516
|
69 // Generate the optimal T perturbation sets for current query
|
mas01mc@516
|
70 // pre-conditions:
|
mas01mc@516
|
71 // an LSH structure was initialized and passed to constructor
|
mas01mc@516
|
72 // a query vector was passed to lsh->compute_hash_functions()
|
mas01mc@516
|
73 // the query-to-boundary distances are stored in x[hashFunIndex]
|
mas01mc@516
|
74 //
|
mas01mc@516
|
75 // post-conditions:
|
mas01mc@516
|
76 // generates an ordered list of perturbation sets (stored as an array of sets)
|
mas01mc@516
|
77 // these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j
|
mas01mc@516
|
78 // data structures are cleared and reset to zeros thereby making them re-entrant
|
mas01mc@516
|
79 //
|
mas01mc@518
|
80 void MultiProbe::generatePerturbationSets(vector<double>& x, int T){
|
mas01mc@516
|
81 perturbation_set ai,as,ae;
|
mas01mc@518
|
82 double ai_score;
|
mas01mc@516
|
83
|
mas01mc@518
|
84 cleanup(); // Make re-entrant
|
mas01mc@518
|
85 initialize();
|
mas01mc@517
|
86 makeSortedDistFuns(x);
|
mas01mc@516
|
87
|
mas01mc@518
|
88 ai.insert(0); // Initialize for this query
|
mas01mc@518
|
89 minHeap->push(min_heap_element(ai, score(ai))); // unique instance stored in mhe
|
mas01mc@516
|
90
|
mas01mc@518
|
91 min_heap_element mhe = minHeap->top();
|
mas01mc@518
|
92
|
mas01mc@518
|
93 for(int i = 0 ; i != T ; i++ ){
|
mas01mc@516
|
94 do{
|
mas01mc@518
|
95 if(minHeap->empty())
|
mas01mc@518
|
96 break;
|
mas01mc@518
|
97 mhe = minHeap->top();
|
mas01mc@518
|
98 ai = mhe.perturbs;
|
mas01mc@518
|
99 ai_score = mhe.score;
|
mas01mc@516
|
100 minHeap->pop();
|
mas01mc@518
|
101 as=ai;
|
mas01mc@517
|
102 shift(as);
|
mas01mc@518
|
103 minHeap->push(min_heap_element(as, score(as)));
|
mas01mc@518
|
104 ae=ai;
|
mas01mc@517
|
105 expand(ae);
|
mas01mc@518
|
106 minHeap->push(min_heap_element(ae, score(ae)));
|
mas01mc@516
|
107 }while(!valid(ai));
|
mas01mc@518
|
108 outSets->push(mhe); // Ordered list of perturbation sets
|
mas01mc@516
|
109 }
|
mas01mc@516
|
110 }
|
mas01mc@516
|
111
|
mas01mc@518
|
112 void MultiProbe::dump(perturbation_set a){
|
mas01mc@517
|
113 perturbation_set::iterator it = a.begin();
|
mas01mc@518
|
114 while(it != a.end()){
|
mas01mc@518
|
115 cout << "[" << (*distFuns)[*it].second.first << "," << (*distFuns)[*it].second.second << "]" << " "
|
mas01mc@518
|
116 << (*distFuns)[*it].first << *it << ", ";
|
mas01mc@518
|
117 it++;
|
mas01mc@518
|
118 }
|
mas01mc@518
|
119 cout << "(" << score(a) << ")";
|
mas01mc@518
|
120 cout << endl;
|
mas01mc@517
|
121 }
|
mas01mc@517
|
122
|
mas01mc@517
|
123 // Given the set a, add 1 to last element of the set
|
mas01mc@518
|
124 inline perturbation_set& MultiProbe::shift(perturbation_set& a){
|
mas01mc@517
|
125 perturbation_set::iterator it = a.end();
|
mas01mc@517
|
126 int val = *(--it) + 1;
|
mas01mc@517
|
127 a.erase(it);
|
mas01mc@517
|
128 a.insert(it,val);
|
mas01mc@517
|
129 }
|
mas01mc@517
|
130
|
mas01mc@517
|
131 // Given the set a, add a new element one greater than the max
|
mas01mc@518
|
132 inline perturbation_set& MultiProbe::expand(perturbation_set& a){
|
mas01mc@517
|
133 perturbation_set::reverse_iterator ri = a.rbegin();
|
mas01mc@517
|
134 a.insert(*ri+1);
|
mas01mc@517
|
135 }
|
mas01mc@516
|
136
|
mas01mc@516
|
137 // Take the list of distances (x) assuming list len is 2M and
|
mas01mc@516
|
138 // delta = (-1)^i, i = { 0 .. 2M-1 }
|
mas01mc@517
|
139 void MultiProbe::makeSortedDistFuns(vector<double>& x){
|
mas01mc@517
|
140 numHashBoundaries = x.size(); // x.size() == 2M
|
mas01mc@517
|
141 delete distFuns;
|
mas01mc@517
|
142 distFuns = new std::vector<sorted_distance_functions>(numHashBoundaries);
|
mas01mc@518
|
143 for(int i = 0; i != numHashBoundaries ; i++ )
|
mas01mc@518
|
144 (*distFuns)[i] = make_pair(x[i], make_pair(i, i%2?1:-1));
|
mas01mc@517
|
145 // SORT
|
mas01mc@517
|
146 sort( distFuns->begin(), distFuns->end() );
|
mas01mc@516
|
147 }
|
mas01mc@516
|
148
|
mas01mc@517
|
149 // For a given perturbation set, the score is the
|
mas01mc@517
|
150 // sum of squares of corresponding distances in x
|
mas01mc@516
|
151 double MultiProbe::score(perturbation_set& a){
|
mas01mc@517
|
152 //assert(!a.empty());
|
mas01mc@517
|
153 double score = 0.0, tmp;
|
mas01mc@517
|
154 perturbation_set::iterator it;
|
mas01mc@517
|
155 it = a.begin();
|
mas01mc@517
|
156 do{
|
mas01mc@518
|
157 tmp = (*distFuns)[*it].first;
|
mas01mc@517
|
158 score += tmp*tmp;
|
mas01mc@518
|
159 }while( ++it != a.end() );
|
mas01mc@517
|
160 return score;
|
mas01mc@516
|
161 }
|
mas01mc@516
|
162
|
mas01mc@517
|
163 // A valid set must have at most one
|
mas01mc@517
|
164 // of the two elements {j, 2M + 1 - j} for every j
|
mas01mc@517
|
165 //
|
mas01mc@517
|
166 // A perturbation set containing an element > 2M is invalid
|
mas01mc@516
|
167 bool MultiProbe::valid(perturbation_set& a){
|
mas01mc@517
|
168 int j;
|
mas01mc@517
|
169 perturbation_set::iterator it = a.begin();
|
mas01mc@517
|
170 while( it != a.end() ){
|
mas01mc@518
|
171 j = *it;
|
mas01mc@518
|
172 it++;
|
mas01mc@517
|
173 if( ( j > numHashBoundaries ) || ( a.find( numHashBoundaries + 1 - j ) != a.end() ) )
|
mas01mc@517
|
174 return false;
|
mas01mc@517
|
175 }
|
mas01mc@517
|
176 return true;
|
mas01mc@516
|
177 }
|
mas01mc@516
|
178
|
mas01mc@518
|
179 // copy return next perturbation_set
|
mas01mc@518
|
180 perturbation_set MultiProbe::getNextPerturbationSet(){
|
mas01mc@518
|
181 perturbation_set s = outSets->top().perturbs;
|
mas01mc@518
|
182 outSets->pop();
|
mas01mc@518
|
183 return s;
|
mas01mc@518
|
184 }
|
mas01mc@518
|
185
|
mas01mc@518
|
186 // Test routine: generate 100 random boundary distance pairs
|
mas01mc@518
|
187 // call generatePerturbationSets on these distances
|
mas01mc@518
|
188 // dump output for inspection
|
mas01mc@516
|
189 #ifdef _TEST_MP_LSH
|
mas01mc@516
|
190 int main(const int argc, const char* argv[]){
|
mas01mc@518
|
191 int N_SAMPS = 100; // Number of random samples
|
mas01mc@518
|
192 int W = 4; // simulated hash-bucket size
|
mas01mc@518
|
193 int N_ITER = 100; // How many re-entrant iterations
|
mas01mc@518
|
194 int T = 10; // Number of multi-probe sets to generate
|
mas01mc@518
|
195
|
mas01mc@518
|
196 MultiProbe mp= MultiProbe();
|
mas01mc@518
|
197 vector<double> x(N_SAMPS);
|
mas01mc@518
|
198
|
mas01mc@518
|
199 srand((unsigned)time(0));
|
mas01mc@518
|
200
|
mas01mc@518
|
201 // Test re-entrance on single instance
|
mas01mc@518
|
202 for(int j = 0; j< N_ITER ; j++){
|
mas01mc@518
|
203 cout << "********** ITERATION " << j << " **********" << endl;
|
mas01mc@518
|
204 cout.flush();
|
mas01mc@518
|
205 for (int i = 0 ; i != x.size()/2 ; i++ ){
|
mas01mc@518
|
206 x[2*i] = W*(rand()/(RAND_MAX+1.0));
|
mas01mc@518
|
207 x[2*i+1] = W - x[2*i];
|
mas01mc@518
|
208 }
|
mas01mc@518
|
209 // Generate multi-probe sets
|
mas01mc@518
|
210 mp.generatePerturbationSets(x, T);
|
mas01mc@518
|
211 // Output contents of multi-probe sets
|
mas01mc@518
|
212 while(!mp.empty())
|
mas01mc@518
|
213 mp.dump(mp.getNextPerturbationSet());
|
mas01mc@518
|
214 }
|
mas01mc@516
|
215 }
|
mas01mc@516
|
216 #endif
|