mas01mc@516
|
1 /*
|
mas01mc@516
|
2 * MultiProbe C++ class
|
mas01mc@516
|
3 *
|
mas01mc@516
|
4 * Given a LSH structure, perform lookup by probing nearby hash-function locations
|
mas01mc@516
|
5 *
|
mas01mc@516
|
6 * Implementation using C++ STL
|
mas01mc@516
|
7 *
|
mas01mc@516
|
8 * Reference:
|
mas01mc@516
|
9 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
10 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
11 * Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
12 *
|
mas01mc@516
|
13 *
|
mas01mc@516
|
14 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
|
mas01mc@516
|
15 * License: GNU Public License 2.0
|
mas01mc@516
|
16 *
|
mas01mc@516
|
17 */
|
mas01mc@516
|
18
|
mas01mc@516
|
19 #include <functional>
|
mas01mc@516
|
20 #include <queue>
|
mas01mc@516
|
21 #include <vector>
|
mas01mc@516
|
22 #include <set>
|
mas01mc@517
|
23 #include <algorithm>
|
mas01mc@517
|
24 #include <iostream>
|
mas01mc@516
|
25
|
mas01mc@516
|
26 using namespace std ;
|
mas01mc@516
|
27
|
mas01mc@516
|
28 typedef set<int > perturbation_set ;
|
mas01mc@516
|
29 typedef vector<perturbation_set > vector_of_perturbation_set ;
|
mas01mc@516
|
30 typedef pair<perturbation_set, float > min_heap_element ;
|
mas01mc@516
|
31 typedef vector<min_heap_element > vector_of_min_heap_element ;
|
mas01mc@516
|
32 typedef priority_queue<
|
mas01mc@516
|
33 min_heap_element,
|
mas01mc@516
|
34 vector_of_min_heap_element,
|
mas01mc@516
|
35 greater<min_heap_element>
|
mas01mc@516
|
36 > min_heap_of_perturbation_set ;
|
mas01mc@516
|
37 typedef pair<double, pair<int, int> > sorted_distance_functions ;
|
mas01mc@517
|
38 typedef vector<sorted_distance_functions> vector_of_sorted_distance_functions;
|
mas01mc@516
|
39
|
mas01mc@516
|
40 bool operator>(const min_heap_element& a, const min_heap_element& b){
|
mas01mc@516
|
41 return a.second > b.second;
|
mas01mc@516
|
42 }
|
mas01mc@516
|
43
|
mas01mc@516
|
44 bool operator<(const min_heap_element& a, const min_heap_element& b){
|
mas01mc@516
|
45 return a.second < b.second;
|
mas01mc@516
|
46 }
|
mas01mc@516
|
47
|
mas01mc@516
|
48 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
|
mas01mc@517
|
49 return a.first > b.first;
|
mas01mc@516
|
50 }
|
mas01mc@516
|
51
|
mas01mc@516
|
52 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
|
mas01mc@517
|
53 return a.first < b.first;
|
mas01mc@516
|
54 }
|
mas01mc@516
|
55
|
mas01mc@516
|
56 class MultiProbe{
|
mas01mc@516
|
57 protected:
|
mas01mc@516
|
58 min_heap_of_perturbation_set* minHeap;
|
mas01mc@516
|
59 vector_of_perturbation_set* outSets;
|
mas01mc@517
|
60 vector_of_sorted_distance_functions* distFuns;
|
mas01mc@517
|
61 unsigned numHashBoundaries;
|
mas01mc@516
|
62 public:
|
mas01mc@516
|
63 MultiProbe();
|
mas01mc@516
|
64 ~MultiProbe();
|
mas01mc@516
|
65
|
mas01mc@516
|
66 // perturbation set operations
|
mas01mc@517
|
67 perturbation_set& shift(perturbation_set&);
|
mas01mc@517
|
68 perturbation_set& expand(perturbation_set&);
|
mas01mc@516
|
69 double score(perturbation_set&);
|
mas01mc@516
|
70 bool valid(perturbation_set&);
|
mas01mc@516
|
71
|
mas01mc@516
|
72 // generation of perturbation sets
|
mas01mc@517
|
73 void generatePerturbationSets(int, vector<double>&);
|
mas01mc@517
|
74 void makeSortedDistFuns(vector<double> &);
|
mas01mc@517
|
75 void dump(perturbation_set& );
|
mas01mc@516
|
76 };
|
mas01mc@516
|
77
|
mas01mc@516
|
78 /* NOTES:
|
mas01mc@516
|
79
|
mas01mc@516
|
80 Reference:
|
mas01mc@516
|
81 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@516
|
82 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@516
|
83 Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@516
|
84
|
mas01mc@516
|
85 i = 1..M (number of hash functions used)
|
mas01mc@516
|
86 f_i(q) = a_i * q + b_i // projection to real line
|
mas01mc@516
|
87 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
|
mas01mc@516
|
88 delta \in {-1, +1}
|
mas01mc@516
|
89 x_i( delta ) = distance of query to left or right boundary
|
mas01mc@516
|
90 Delta = [delta_1 delta_2 ... delta_M]
|
mas01mc@516
|
91 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
|
mas01mc@516
|
92
|
mas01mc@516
|
93 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
|
mas01mc@516
|
94 p_j = (i, delta) if z_j == x_i(delta)
|
mas01mc@516
|
95
|
mas01mc@516
|
96 A_k is index into p_j
|
mas01mc@516
|
97
|
mas01mc@516
|
98 Multi-probe algorithm (after Lv et al. 2007)
|
mas01mc@516
|
99 ---------------------------------------------
|
mas01mc@516
|
100
|
mas01mc@516
|
101 A0 = {1}
|
mas01mc@516
|
102 minHeap.insert(A0, score(A0))
|
mas01mc@516
|
103
|
mas01mc@516
|
104 for i = 1 to T do // T = number of perturbation sets
|
mas01mc@516
|
105 repeat
|
mas01mc@516
|
106 Ai = minHeap.extractMin()
|
mas01mc@516
|
107 As = shift(Ai)
|
mas01mc@516
|
108 minHeap.insert(As, score(As))
|
mas01mc@516
|
109 Ae = expand(Ai)
|
mas01mc@516
|
110 minHeap.insert(Ae, score(Ae))
|
mas01mc@516
|
111 until valid(Ai)
|
mas01mc@516
|
112 output Ai
|
mas01mc@516
|
113 end for
|
mas01mc@516
|
114
|
mas01mc@516
|
115 */
|