mas01mc@516
|
1 /*
|
mas01mc@516
|
2 * MultiProbe C++ class
|
mas01mc@516
|
3 *
|
mas01mc@518
|
4 * Given a vector of LSH boundary distances for a query,
|
mas01mc@518
|
5 * perform lookup by probing nearby hash-function locations
|
mas01mc@516
|
6 *
|
mas01mc@516
|
7 * Implementation using C++ STL
|
mas01mc@516
|
8 *
|
mas01mc@516
|
9 * Reference:
|
mas01mc@516
|
10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
12 * Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
13 *
|
mas01mc@516
|
14 *
|
mas01mc@516
|
15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
|
mas01mc@516
|
16 * License: GNU Public License 2.0
|
mas01mc@516
|
17 *
|
mas01mc@516
|
18 */
|
mas01mc@516
|
19
|
mas01mc@518
|
20 #ifndef __LSH_MULTIPROBE_
|
mas01mc@518
|
21 #define __LSH_MULTIPROBE_
|
mas01mc@518
|
22
|
mas01mc@516
|
23 #include <functional>
|
mas01mc@516
|
24 #include <queue>
|
mas01mc@516
|
25 #include <vector>
|
mas01mc@516
|
26 #include <set>
|
mas01mc@517
|
27 #include <algorithm>
|
mas01mc@517
|
28 #include <iostream>
|
mas01mc@516
|
29
|
mas01mc@518
|
30 using namespace std;
|
mas01mc@516
|
31
|
mas01mc@516
|
32 typedef set<int > perturbation_set ;
|
mas01mc@518
|
33
|
mas01mc@518
|
34 typedef class MinHeapElement{
|
mas01mc@518
|
35 public:
|
mas01mc@518
|
36 perturbation_set perturbs;
|
mas01mc@518
|
37 double score;
|
mas01mc@518
|
38 MinHeapElement(perturbation_set a, double s);
|
mas01mc@518
|
39 virtual ~MinHeapElement();
|
mas01mc@518
|
40 } min_heap_element;
|
mas01mc@518
|
41
|
mas01mc@518
|
42 typedef priority_queue<min_heap_element,
|
mas01mc@518
|
43 vector<min_heap_element>,
|
mas01mc@518
|
44 greater<min_heap_element>
|
mas01mc@518
|
45 > min_heap_of_perturbation_set ;
|
mas01mc@518
|
46
|
mas01mc@516
|
47 typedef pair<double, pair<int, int> > sorted_distance_functions ;
|
mas01mc@516
|
48
|
mas01mc@518
|
49 bool operator> (const min_heap_element& a, const min_heap_element& b){
|
mas01mc@518
|
50 return a.score > b.score;
|
mas01mc@516
|
51 }
|
mas01mc@516
|
52
|
mas01mc@518
|
53 bool operator< (const min_heap_element& a, const min_heap_element& b){
|
mas01mc@518
|
54 return a.score < b.score;
|
mas01mc@516
|
55 }
|
mas01mc@516
|
56
|
mas01mc@516
|
57 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
|
mas01mc@517
|
58 return a.first > b.first;
|
mas01mc@516
|
59 }
|
mas01mc@516
|
60
|
mas01mc@516
|
61 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
|
mas01mc@517
|
62 return a.first < b.first;
|
mas01mc@516
|
63 }
|
mas01mc@516
|
64
|
mas01mc@516
|
65 class MultiProbe{
|
mas01mc@516
|
66 protected:
|
mas01mc@516
|
67 min_heap_of_perturbation_set* minHeap;
|
mas01mc@518
|
68 min_heap_of_perturbation_set* outSets;
|
mas01mc@518
|
69 vector<sorted_distance_functions>* distFuns;
|
mas01mc@517
|
70 unsigned numHashBoundaries;
|
mas01mc@516
|
71
|
mas01mc@518
|
72 // data structure initialization and cleanup
|
mas01mc@518
|
73 void initialize();
|
mas01mc@518
|
74 void cleanup();
|
mas01mc@518
|
75
|
mas01mc@516
|
76 // perturbation set operations
|
mas01mc@517
|
77 perturbation_set& shift(perturbation_set&);
|
mas01mc@517
|
78 perturbation_set& expand(perturbation_set&);
|
mas01mc@516
|
79 double score(perturbation_set&);
|
mas01mc@516
|
80 bool valid(perturbation_set&);
|
mas01mc@518
|
81 void makeSortedDistFuns(vector<double> &);
|
mas01mc@518
|
82
|
mas01mc@518
|
83 public:
|
mas01mc@518
|
84 MultiProbe();
|
mas01mc@518
|
85 ~MultiProbe();
|
mas01mc@516
|
86
|
mas01mc@516
|
87 // generation of perturbation sets
|
mas01mc@518
|
88 void generatePerturbationSets(vector<double>& vectorOfBounaryDistances, int numSetsToGenerate);
|
mas01mc@518
|
89 perturbation_set getNextPerturbationSet();
|
mas01mc@518
|
90 void dump(perturbation_set);
|
mas01mc@518
|
91 size_t size(); // Number of perturbation sets are in the output queue
|
mas01mc@518
|
92 bool empty(); // predicate for empty MultiProbe set
|
mas01mc@516
|
93 };
|
mas01mc@516
|
94
|
mas01mc@518
|
95
|
mas01mc@516
|
96 /* NOTES:
|
mas01mc@516
|
97
|
mas01mc@516
|
98 Reference:
|
mas01mc@516
|
99 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
100 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
101 Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
102
|
mas01mc@516
|
103 i = 1..M (number of hash functions used)
|
mas01mc@516
|
104 f_i(q) = a_i * q + b_i // projection to real line
|
mas01mc@516
|
105 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
|
mas01mc@516
|
106 delta \in {-1, +1}
|
mas01mc@516
|
107 x_i( delta ) = distance of query to left or right boundary
|
mas01mc@516
|
108 Delta = [delta_1 delta_2 ... delta_M]
|
mas01mc@516
|
109 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
|
mas01mc@516
|
110
|
mas01mc@516
|
111 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
|
mas01mc@516
|
112 p_j = (i, delta) if z_j == x_i(delta)
|
mas01mc@516
|
113
|
mas01mc@516
|
114 A_k is index into p_j
|
mas01mc@516
|
115
|
mas01mc@516
|
116 Multi-probe algorithm (after Lv et al. 2007)
|
mas01mc@516
|
117 ---------------------------------------------
|
mas01mc@516
|
118
|
mas01mc@516
|
119 A0 = {1}
|
mas01mc@516
|
120 minHeap.insert(A0, score(A0))
|
mas01mc@516
|
121
|
mas01mc@516
|
122 for i = 1 to T do // T = number of perturbation sets
|
mas01mc@516
|
123 repeat
|
mas01mc@516
|
124 Ai = minHeap.extractMin()
|
mas01mc@516
|
125 As = shift(Ai)
|
mas01mc@516
|
126 minHeap.insert(As, score(As))
|
mas01mc@516
|
127 Ae = expand(Ai)
|
mas01mc@516
|
128 minHeap.insert(Ae, score(Ae))
|
mas01mc@516
|
129 until valid(Ai)
|
mas01mc@516
|
130 output Ai
|
mas01mc@516
|
131 end for
|
mas01mc@516
|
132
|
mas01mc@516
|
133 */
|
mas01mc@518
|
134
|
mas01mc@518
|
135 #endif // __LSH_MULTIPROBE_
|