annotate multiprobe.h @ 516:2a7bad47a4a7 multiprobeLSH

New branch to implement multiprobe LSH. Copy of trunk:802. Added multiprobe.{cpp,h} source files.
author mas01mc
date Sat, 24 Jan 2009 19:51:46 +0000
parents
children 807c8be7dd45
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@516 4 * Given a LSH structure, perform lookup by probing nearby hash-function locations
mas01mc@516 5 *
mas01mc@516 6 * Implementation using C++ STL
mas01mc@516 7 *
mas01mc@516 8 * Reference:
mas01mc@516 9 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 10 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 11 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 12 *
mas01mc@516 13 *
mas01mc@516 14 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 15 * License: GNU Public License 2.0
mas01mc@516 16 *
mas01mc@516 17 */
mas01mc@516 18
mas01mc@516 19 #include <functional>
mas01mc@516 20 #include <queue>
mas01mc@516 21 #include <vector>
mas01mc@516 22 #include <set>
mas01mc@516 23
mas01mc@516 24 using namespace std ;
mas01mc@516 25
mas01mc@516 26 typedef set<int > perturbation_set ;
mas01mc@516 27 typedef vector<perturbation_set > vector_of_perturbation_set ;
mas01mc@516 28 typedef pair<perturbation_set, float > min_heap_element ;
mas01mc@516 29 typedef vector<min_heap_element > vector_of_min_heap_element ;
mas01mc@516 30 typedef priority_queue<
mas01mc@516 31 min_heap_element,
mas01mc@516 32 vector_of_min_heap_element,
mas01mc@516 33 greater<min_heap_element>
mas01mc@516 34 > min_heap_of_perturbation_set ;
mas01mc@516 35 typedef pair<double, pair<int, int> > sorted_distance_functions ;
mas01mc@516 36
mas01mc@516 37 bool operator>(const min_heap_element& a, const min_heap_element& b){
mas01mc@516 38 return a.second > b.second;
mas01mc@516 39 }
mas01mc@516 40
mas01mc@516 41 bool operator<(const min_heap_element& a, const min_heap_element& b){
mas01mc@516 42 return a.second < b.second;
mas01mc@516 43 }
mas01mc@516 44
mas01mc@516 45 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@516 46 return a.second > b.second;
mas01mc@516 47 }
mas01mc@516 48
mas01mc@516 49 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@516 50 return a.second < b.second;
mas01mc@516 51 }
mas01mc@516 52
mas01mc@516 53 class MultiProbe{
mas01mc@516 54 protected:
mas01mc@516 55 min_heap_of_perturbation_set* minHeap;
mas01mc@516 56 vector_of_perturbation_set* outSets;
mas01mc@516 57 sorted_distance_functions distFuns;
mas01mc@516 58 public:
mas01mc@516 59 MultiProbe();
mas01mc@516 60 ~MultiProbe();
mas01mc@516 61
mas01mc@516 62 // perturbation set operations
mas01mc@516 63 perturbation_set& shift(perturbation_set);
mas01mc@516 64 perturbation_set& expand(perturbation_set);
mas01mc@516 65 double score(perturbation_set&);
mas01mc@516 66 bool valid(perturbation_set&);
mas01mc@516 67
mas01mc@516 68 // generation of perturbation sets
mas01mc@516 69 void generatePerturbationSets(int, vector<int>&);
mas01mc@516 70 sorted_distance_functions& makeSortedDistFuns(vector<int> &);
mas01mc@516 71 };
mas01mc@516 72
mas01mc@516 73 /* NOTES:
mas01mc@516 74
mas01mc@516 75 Reference:
mas01mc@516 76 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 77 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 78 Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 79
mas01mc@516 80 i = 1..M (number of hash functions used)
mas01mc@516 81 f_i(q) = a_i * q + b_i // projection to real line
mas01mc@516 82 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
mas01mc@516 83 delta \in {-1, +1}
mas01mc@516 84 x_i( delta ) = distance of query to left or right boundary
mas01mc@516 85 Delta = [delta_1 delta_2 ... delta_M]
mas01mc@516 86 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
mas01mc@516 87
mas01mc@516 88 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
mas01mc@516 89 p_j = (i, delta) if z_j == x_i(delta)
mas01mc@516 90
mas01mc@516 91 A_k is index into p_j
mas01mc@516 92
mas01mc@516 93 Multi-probe algorithm (after Lv et al. 2007)
mas01mc@516 94 ---------------------------------------------
mas01mc@516 95
mas01mc@516 96 A0 = {1}
mas01mc@516 97 minHeap.insert(A0, score(A0))
mas01mc@516 98
mas01mc@516 99 for i = 1 to T do // T = number of perturbation sets
mas01mc@516 100 repeat
mas01mc@516 101 Ai = minHeap.extractMin()
mas01mc@516 102 As = shift(Ai)
mas01mc@516 103 minHeap.insert(As, score(As))
mas01mc@516 104 Ae = expand(Ai)
mas01mc@516 105 minHeap.insert(Ae, score(Ae))
mas01mc@516 106 until valid(Ai)
mas01mc@516 107 output Ai
mas01mc@516 108 end for
mas01mc@516 109
mas01mc@516 110 */