mas01mc@764
|
1 /*
|
mas01mc@764
|
2 * MultiProbe C++ class
|
mas01mc@764
|
3 *
|
mas01mc@764
|
4 * Given a vector of LSH boundary distances for a query,
|
mas01mc@764
|
5 * perform lookup by probing nearby hash-function locations
|
mas01mc@764
|
6 *
|
mas01mc@764
|
7 * Implementation using C++ STL
|
mas01mc@764
|
8 *
|
mas01mc@764
|
9 * Reference:
|
mas01mc@764
|
10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@764
|
11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@764
|
12 * Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@764
|
13 *
|
mas01mc@764
|
14 *
|
mas01mc@764
|
15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
|
mas01mc@764
|
16 * License: GNU Public License 2.0
|
mas01mc@764
|
17 *
|
mas01mc@764
|
18 */
|
mas01mc@764
|
19
|
mas01mc@764
|
20 #ifndef __LSH_MULTIPROBE_
|
mas01mc@764
|
21 #define __LSH_MULTIPROBE_
|
mas01mc@764
|
22
|
mas01mc@764
|
23 #include <functional>
|
mas01mc@764
|
24 #include <queue>
|
mas01mc@764
|
25 #include <vector>
|
mas01mc@764
|
26 #include <set>
|
mas01mc@764
|
27 #include <algorithm>
|
mas01mc@764
|
28 #include <iostream>
|
mas01mc@764
|
29
|
mas01mc@764
|
30 using namespace std;
|
mas01mc@764
|
31
|
mas01mc@764
|
32 typedef set<int > perturbation_set ;
|
mas01mc@764
|
33
|
mas01mc@764
|
34 typedef class MinHeapElement{
|
mas01mc@764
|
35 public:
|
mas01mc@764
|
36 perturbation_set perturbs;
|
mas01mc@764
|
37 float score;
|
mas01mc@764
|
38 MinHeapElement(perturbation_set a, float s);
|
mas01mc@764
|
39 virtual ~MinHeapElement();
|
mas01mc@764
|
40 } min_heap_element;
|
mas01mc@764
|
41
|
mas01mc@764
|
42 typedef priority_queue<min_heap_element,
|
mas01mc@764
|
43 vector<min_heap_element>,
|
mas01mc@764
|
44 greater<min_heap_element>
|
mas01mc@764
|
45 > min_heap_of_perturbation_set ;
|
mas01mc@764
|
46
|
mas01mc@764
|
47 typedef pair<float, pair<int, int> > sorted_distance_functions ;
|
mas01mc@764
|
48
|
mas01mc@764
|
49 class MultiProbe{
|
mas01mc@764
|
50 protected:
|
mas01mc@764
|
51 min_heap_of_perturbation_set* minHeap;
|
mas01mc@764
|
52 min_heap_of_perturbation_set* outSets;
|
mas01mc@764
|
53 vector<sorted_distance_functions>* distFuns;
|
mas01mc@764
|
54 unsigned numHashBoundaries;
|
mas01mc@764
|
55
|
mas01mc@764
|
56 // data structure initialization and cleanup
|
mas01mc@764
|
57 void initialize();
|
mas01mc@764
|
58 void cleanup();
|
mas01mc@764
|
59
|
mas01mc@764
|
60 // perturbation set operations
|
mas01mc@764
|
61 perturbation_set& shift(perturbation_set&);
|
mas01mc@764
|
62 perturbation_set& expand(perturbation_set&);
|
mas01mc@764
|
63 float score(perturbation_set&);
|
mas01mc@764
|
64 bool valid(perturbation_set&);
|
mas01mc@764
|
65 void makeSortedDistFuns(vector<float> &);
|
mas01mc@764
|
66 void makeSortedDistFuns(float* x, unsigned N);
|
mas01mc@764
|
67
|
mas01mc@764
|
68 // perturbation set generation algorithm
|
mas01mc@764
|
69 void algorithm1(unsigned T);
|
mas01mc@764
|
70
|
mas01mc@764
|
71 public:
|
mas01mc@764
|
72 MultiProbe();
|
mas01mc@764
|
73 ~MultiProbe();
|
mas01mc@764
|
74
|
mas01mc@764
|
75 // generation of perturbation sets
|
mas01mc@764
|
76 void generatePerturbationSets(vector<float>& vectorOfBounaryDistances, unsigned numSetsToGenerate);
|
mas01mc@764
|
77 void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate);
|
mas01mc@764
|
78 perturbation_set getNextPerturbationSet();
|
mas01mc@764
|
79 void dump(perturbation_set);
|
mas01mc@764
|
80 size_t size(); // Number of perturbation sets are in the output queue
|
mas01mc@764
|
81 bool empty(); // predicate for empty MultiProbe set
|
mas01mc@764
|
82 int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry
|
mas01mc@764
|
83 int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry
|
mas01mc@764
|
84 };
|
mas01mc@764
|
85
|
mas01mc@764
|
86
|
mas01mc@764
|
87 /* NOTES:
|
mas01mc@764
|
88
|
mas01mc@764
|
89 Reference:
|
mas01mc@764
|
90 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
|
mas01mc@764
|
91 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
|
mas01mc@764
|
92 Search", Proc. Intl. Conf. VLDB, 2007
|
mas01mc@764
|
93
|
mas01mc@764
|
94 i = 1..M (number of hash functions used)
|
mas01mc@764
|
95 f_i(q) = a_i * q + b_i // projection to real line
|
mas01mc@764
|
96 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
|
mas01mc@764
|
97 delta \in {-1, +1}
|
mas01mc@764
|
98 x_i( delta ) = distance of query to left or right boundary
|
mas01mc@764
|
99 Delta = [delta_1 delta_2 ... delta_M]
|
mas01mc@764
|
100 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
|
mas01mc@764
|
101
|
mas01mc@764
|
102 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
|
mas01mc@764
|
103 p_j = (i, delta) if z_j == x_i(delta)
|
mas01mc@764
|
104
|
mas01mc@764
|
105 A_k is index into p_j
|
mas01mc@764
|
106
|
mas01mc@764
|
107 Multi-probe algorithm (after Lv et al. 2007)
|
mas01mc@764
|
108 ---------------------------------------------
|
mas01mc@764
|
109
|
mas01mc@764
|
110 A0 = {1}
|
mas01mc@764
|
111 minHeap.insert(A0, score(A0))
|
mas01mc@764
|
112
|
mas01mc@764
|
113 for i = 1 to T do // T = number of perturbation sets
|
mas01mc@764
|
114 repeat
|
mas01mc@764
|
115 Ai = minHeap.extractMin()
|
mas01mc@764
|
116 As = shift(Ai)
|
mas01mc@764
|
117 minHeap.insert(As, score(As))
|
mas01mc@764
|
118 Ae = expand(Ai)
|
mas01mc@764
|
119 minHeap.insert(Ae, score(Ae))
|
mas01mc@764
|
120 until valid(Ai)
|
mas01mc@764
|
121 output Ai
|
mas01mc@764
|
122 end for
|
mas01mc@764
|
123
|
mas01mc@764
|
124 */
|
mas01mc@764
|
125
|
mas01mc@764
|
126 #endif // __LSH_MULTIPROBE_
|