mas01mc@516
|
1 /*
|
mas01mc@516
|
2 * MultiProbe C++ class
|
mas01mc@516
|
3 *
|
mas01mc@516
|
4 * Given a LSH structure, perform lookup by probing nearby hash-function locations
|
mas01mc@516
|
5 *
|
mas01mc@516
|
6 * Implementation using C++ STL
|
mas01mc@516
|
7 *
|
mas01mc@516
|
8 * Reference:
|
mas01mc@516
|
9 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
10 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
11 * Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
12 *
|
mas01mc@516
|
13 *
|
mas01mc@516
|
14 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
|
mas01mc@516
|
15 * License: GNU Public License 2.0
|
mas01mc@516
|
16 *
|
mas01mc@516
|
17 */
|
mas01mc@516
|
18
|
mas01mc@516
|
19 #include <functional>
|
mas01mc@516
|
20 #include <queue>
|
mas01mc@516
|
21 #include <vector>
|
mas01mc@516
|
22 #include <set>
|
mas01mc@516
|
23
|
mas01mc@516
|
24 using namespace std ;
|
mas01mc@516
|
25
|
mas01mc@516
|
26 typedef set<int > perturbation_set ;
|
mas01mc@516
|
27 typedef vector<perturbation_set > vector_of_perturbation_set ;
|
mas01mc@516
|
28 typedef pair<perturbation_set, float > min_heap_element ;
|
mas01mc@516
|
29 typedef vector<min_heap_element > vector_of_min_heap_element ;
|
mas01mc@516
|
30 typedef priority_queue<
|
mas01mc@516
|
31 min_heap_element,
|
mas01mc@516
|
32 vector_of_min_heap_element,
|
mas01mc@516
|
33 greater<min_heap_element>
|
mas01mc@516
|
34 > min_heap_of_perturbation_set ;
|
mas01mc@516
|
35 typedef pair<double, pair<int, int> > sorted_distance_functions ;
|
mas01mc@516
|
36
|
mas01mc@516
|
37 bool operator>(const min_heap_element& a, const min_heap_element& b){
|
mas01mc@516
|
38 return a.second > b.second;
|
mas01mc@516
|
39 }
|
mas01mc@516
|
40
|
mas01mc@516
|
41 bool operator<(const min_heap_element& a, const min_heap_element& b){
|
mas01mc@516
|
42 return a.second < b.second;
|
mas01mc@516
|
43 }
|
mas01mc@516
|
44
|
mas01mc@516
|
45 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
|
mas01mc@516
|
46 return a.second > b.second;
|
mas01mc@516
|
47 }
|
mas01mc@516
|
48
|
mas01mc@516
|
49 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
|
mas01mc@516
|
50 return a.second < b.second;
|
mas01mc@516
|
51 }
|
mas01mc@516
|
52
|
mas01mc@516
|
53 class MultiProbe{
|
mas01mc@516
|
54 protected:
|
mas01mc@516
|
55 min_heap_of_perturbation_set* minHeap;
|
mas01mc@516
|
56 vector_of_perturbation_set* outSets;
|
mas01mc@516
|
57 sorted_distance_functions distFuns;
|
mas01mc@516
|
58 public:
|
mas01mc@516
|
59 MultiProbe();
|
mas01mc@516
|
60 ~MultiProbe();
|
mas01mc@516
|
61
|
mas01mc@516
|
62 // perturbation set operations
|
mas01mc@516
|
63 perturbation_set& shift(perturbation_set);
|
mas01mc@516
|
64 perturbation_set& expand(perturbation_set);
|
mas01mc@516
|
65 double score(perturbation_set&);
|
mas01mc@516
|
66 bool valid(perturbation_set&);
|
mas01mc@516
|
67
|
mas01mc@516
|
68 // generation of perturbation sets
|
mas01mc@516
|
69 void generatePerturbationSets(int, vector<int>&);
|
mas01mc@516
|
70 sorted_distance_functions& makeSortedDistFuns(vector<int> &);
|
mas01mc@516
|
71 };
|
mas01mc@516
|
72
|
mas01mc@516
|
73 /* NOTES:
|
mas01mc@516
|
74
|
mas01mc@516
|
75 Reference:
|
mas01mc@516
|
76 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
77 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
78 Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
79
|
mas01mc@516
|
80 i = 1..M (number of hash functions used)
|
mas01mc@516
|
81 f_i(q) = a_i * q + b_i // projection to real line
|
mas01mc@516
|
82 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
|
mas01mc@516
|
83 delta \in {-1, +1}
|
mas01mc@516
|
84 x_i( delta ) = distance of query to left or right boundary
|
mas01mc@516
|
85 Delta = [delta_1 delta_2 ... delta_M]
|
mas01mc@516
|
86 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
|
mas01mc@516
|
87
|
mas01mc@516
|
88 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
|
mas01mc@516
|
89 p_j = (i, delta) if z_j == x_i(delta)
|
mas01mc@516
|
90
|
mas01mc@516
|
91 A_k is index into p_j
|
mas01mc@516
|
92
|
mas01mc@516
|
93 Multi-probe algorithm (after Lv et al. 2007)
|
mas01mc@516
|
94 ---------------------------------------------
|
mas01mc@516
|
95
|
mas01mc@516
|
96 A0 = {1}
|
mas01mc@516
|
97 minHeap.insert(A0, score(A0))
|
mas01mc@516
|
98
|
mas01mc@516
|
99 for i = 1 to T do // T = number of perturbation sets
|
mas01mc@516
|
100 repeat
|
mas01mc@516
|
101 Ai = minHeap.extractMin()
|
mas01mc@516
|
102 As = shift(Ai)
|
mas01mc@516
|
103 minHeap.insert(As, score(As))
|
mas01mc@516
|
104 Ae = expand(Ai)
|
mas01mc@516
|
105 minHeap.insert(Ae, score(Ae))
|
mas01mc@516
|
106 until valid(Ai)
|
mas01mc@516
|
107 output Ai
|
mas01mc@516
|
108 end for
|
mas01mc@516
|
109
|
mas01mc@516
|
110 */
|