annotate multiprobe.h @ 517:807c8be7dd45 multiprobeLSH

Completed multiprobe framework for LSH. Requires testing.
author mas01mc
date Sun, 25 Jan 2009 06:10:38 +0000
parents 2a7bad47a4a7
children ca1ee92c359c
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@516 4 * Given a LSH structure, perform lookup by probing nearby hash-function locations
mas01mc@516 5 *
mas01mc@516 6 * Implementation using C++ STL
mas01mc@516 7 *
mas01mc@516 8 * Reference:
mas01mc@516 9 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 10 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 11 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 12 *
mas01mc@516 13 *
mas01mc@516 14 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 15 * License: GNU Public License 2.0
mas01mc@516 16 *
mas01mc@516 17 */
mas01mc@516 18
mas01mc@516 19 #include <functional>
mas01mc@516 20 #include <queue>
mas01mc@516 21 #include <vector>
mas01mc@516 22 #include <set>
mas01mc@517 23 #include <algorithm>
mas01mc@517 24 #include <iostream>
mas01mc@516 25
mas01mc@516 26 using namespace std ;
mas01mc@516 27
mas01mc@516 28 typedef set<int > perturbation_set ;
mas01mc@516 29 typedef vector<perturbation_set > vector_of_perturbation_set ;
mas01mc@516 30 typedef pair<perturbation_set, float > min_heap_element ;
mas01mc@516 31 typedef vector<min_heap_element > vector_of_min_heap_element ;
mas01mc@516 32 typedef priority_queue<
mas01mc@516 33 min_heap_element,
mas01mc@516 34 vector_of_min_heap_element,
mas01mc@516 35 greater<min_heap_element>
mas01mc@516 36 > min_heap_of_perturbation_set ;
mas01mc@516 37 typedef pair<double, pair<int, int> > sorted_distance_functions ;
mas01mc@517 38 typedef vector<sorted_distance_functions> vector_of_sorted_distance_functions;
mas01mc@516 39
mas01mc@516 40 bool operator>(const min_heap_element& a, const min_heap_element& b){
mas01mc@516 41 return a.second > b.second;
mas01mc@516 42 }
mas01mc@516 43
mas01mc@516 44 bool operator<(const min_heap_element& a, const min_heap_element& b){
mas01mc@516 45 return a.second < b.second;
mas01mc@516 46 }
mas01mc@516 47
mas01mc@516 48 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@517 49 return a.first > b.first;
mas01mc@516 50 }
mas01mc@516 51
mas01mc@516 52 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@517 53 return a.first < b.first;
mas01mc@516 54 }
mas01mc@516 55
mas01mc@516 56 class MultiProbe{
mas01mc@516 57 protected:
mas01mc@516 58 min_heap_of_perturbation_set* minHeap;
mas01mc@516 59 vector_of_perturbation_set* outSets;
mas01mc@517 60 vector_of_sorted_distance_functions* distFuns;
mas01mc@517 61 unsigned numHashBoundaries;
mas01mc@516 62 public:
mas01mc@516 63 MultiProbe();
mas01mc@516 64 ~MultiProbe();
mas01mc@516 65
mas01mc@516 66 // perturbation set operations
mas01mc@517 67 perturbation_set& shift(perturbation_set&);
mas01mc@517 68 perturbation_set& expand(perturbation_set&);
mas01mc@516 69 double score(perturbation_set&);
mas01mc@516 70 bool valid(perturbation_set&);
mas01mc@516 71
mas01mc@516 72 // generation of perturbation sets
mas01mc@517 73 void generatePerturbationSets(int, vector<double>&);
mas01mc@517 74 void makeSortedDistFuns(vector<double> &);
mas01mc@517 75 void dump(perturbation_set& );
mas01mc@516 76 };
mas01mc@516 77
mas01mc@516 78 /* NOTES:
mas01mc@516 79
mas01mc@516 80 Reference:
mas01mc@516 81 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 82 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 83 Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 84
mas01mc@516 85 i = 1..M (number of hash functions used)
mas01mc@516 86 f_i(q) = a_i * q + b_i // projection to real line
mas01mc@516 87 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
mas01mc@516 88 delta \in {-1, +1}
mas01mc@516 89 x_i( delta ) = distance of query to left or right boundary
mas01mc@516 90 Delta = [delta_1 delta_2 ... delta_M]
mas01mc@516 91 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
mas01mc@516 92
mas01mc@516 93 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
mas01mc@516 94 p_j = (i, delta) if z_j == x_i(delta)
mas01mc@516 95
mas01mc@516 96 A_k is index into p_j
mas01mc@516 97
mas01mc@516 98 Multi-probe algorithm (after Lv et al. 2007)
mas01mc@516 99 ---------------------------------------------
mas01mc@516 100
mas01mc@516 101 A0 = {1}
mas01mc@516 102 minHeap.insert(A0, score(A0))
mas01mc@516 103
mas01mc@516 104 for i = 1 to T do // T = number of perturbation sets
mas01mc@516 105 repeat
mas01mc@516 106 Ai = minHeap.extractMin()
mas01mc@516 107 As = shift(Ai)
mas01mc@516 108 minHeap.insert(As, score(As))
mas01mc@516 109 Ae = expand(Ai)
mas01mc@516 110 minHeap.insert(Ae, score(Ae))
mas01mc@516 111 until valid(Ai)
mas01mc@516 112 output Ai
mas01mc@516 113 end for
mas01mc@516 114
mas01mc@516 115 */