annotate multiprobe.h @ 518:ca1ee92c359c multiprobeLSH

Completed initial implementation of LSH MultiProbe class. Now needs to be joined to lshlib.cpp retrieve() method to perform multi-probe queries.
author mas01mc
date Mon, 26 Jan 2009 02:50:44 +0000
parents 807c8be7dd45
children fdcd436d7cbd
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@518 4 * Given a vector of LSH boundary distances for a query,
mas01mc@518 5 * perform lookup by probing nearby hash-function locations
mas01mc@516 6 *
mas01mc@516 7 * Implementation using C++ STL
mas01mc@516 8 *
mas01mc@516 9 * Reference:
mas01mc@516 10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 12 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 13 *
mas01mc@516 14 *
mas01mc@516 15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 16 * License: GNU Public License 2.0
mas01mc@516 17 *
mas01mc@516 18 */
mas01mc@516 19
mas01mc@518 20 #ifndef __LSH_MULTIPROBE_
mas01mc@518 21 #define __LSH_MULTIPROBE_
mas01mc@518 22
mas01mc@516 23 #include <functional>
mas01mc@516 24 #include <queue>
mas01mc@516 25 #include <vector>
mas01mc@516 26 #include <set>
mas01mc@517 27 #include <algorithm>
mas01mc@517 28 #include <iostream>
mas01mc@516 29
mas01mc@518 30 using namespace std;
mas01mc@516 31
mas01mc@516 32 typedef set<int > perturbation_set ;
mas01mc@518 33
mas01mc@518 34 typedef class MinHeapElement{
mas01mc@518 35 public:
mas01mc@518 36 perturbation_set perturbs;
mas01mc@518 37 double score;
mas01mc@518 38 MinHeapElement(perturbation_set a, double s);
mas01mc@518 39 virtual ~MinHeapElement();
mas01mc@518 40 } min_heap_element;
mas01mc@518 41
mas01mc@518 42 typedef priority_queue<min_heap_element,
mas01mc@518 43 vector<min_heap_element>,
mas01mc@518 44 greater<min_heap_element>
mas01mc@518 45 > min_heap_of_perturbation_set ;
mas01mc@518 46
mas01mc@516 47 typedef pair<double, pair<int, int> > sorted_distance_functions ;
mas01mc@516 48
mas01mc@518 49 bool operator> (const min_heap_element& a, const min_heap_element& b){
mas01mc@518 50 return a.score > b.score;
mas01mc@516 51 }
mas01mc@516 52
mas01mc@518 53 bool operator< (const min_heap_element& a, const min_heap_element& b){
mas01mc@518 54 return a.score < b.score;
mas01mc@516 55 }
mas01mc@516 56
mas01mc@516 57 bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@517 58 return a.first > b.first;
mas01mc@516 59 }
mas01mc@516 60
mas01mc@516 61 bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){
mas01mc@517 62 return a.first < b.first;
mas01mc@516 63 }
mas01mc@516 64
mas01mc@516 65 class MultiProbe{
mas01mc@516 66 protected:
mas01mc@516 67 min_heap_of_perturbation_set* minHeap;
mas01mc@518 68 min_heap_of_perturbation_set* outSets;
mas01mc@518 69 vector<sorted_distance_functions>* distFuns;
mas01mc@517 70 unsigned numHashBoundaries;
mas01mc@516 71
mas01mc@518 72 // data structure initialization and cleanup
mas01mc@518 73 void initialize();
mas01mc@518 74 void cleanup();
mas01mc@518 75
mas01mc@516 76 // perturbation set operations
mas01mc@517 77 perturbation_set& shift(perturbation_set&);
mas01mc@517 78 perturbation_set& expand(perturbation_set&);
mas01mc@516 79 double score(perturbation_set&);
mas01mc@516 80 bool valid(perturbation_set&);
mas01mc@518 81 void makeSortedDistFuns(vector<double> &);
mas01mc@518 82
mas01mc@518 83 public:
mas01mc@518 84 MultiProbe();
mas01mc@518 85 ~MultiProbe();
mas01mc@516 86
mas01mc@516 87 // generation of perturbation sets
mas01mc@518 88 void generatePerturbationSets(vector<double>& vectorOfBounaryDistances, int numSetsToGenerate);
mas01mc@518 89 perturbation_set getNextPerturbationSet();
mas01mc@518 90 void dump(perturbation_set);
mas01mc@518 91 size_t size(); // Number of perturbation sets are in the output queue
mas01mc@518 92 bool empty(); // predicate for empty MultiProbe set
mas01mc@516 93 };
mas01mc@516 94
mas01mc@518 95
mas01mc@516 96 /* NOTES:
mas01mc@516 97
mas01mc@516 98 Reference:
mas01mc@516 99 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 100 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 101 Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 102
mas01mc@516 103 i = 1..M (number of hash functions used)
mas01mc@516 104 f_i(q) = a_i * q + b_i // projection to real line
mas01mc@516 105 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
mas01mc@516 106 delta \in {-1, +1}
mas01mc@516 107 x_i( delta ) = distance of query to left or right boundary
mas01mc@516 108 Delta = [delta_1 delta_2 ... delta_M]
mas01mc@516 109 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
mas01mc@516 110
mas01mc@516 111 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
mas01mc@516 112 p_j = (i, delta) if z_j == x_i(delta)
mas01mc@516 113
mas01mc@516 114 A_k is index into p_j
mas01mc@516 115
mas01mc@516 116 Multi-probe algorithm (after Lv et al. 2007)
mas01mc@516 117 ---------------------------------------------
mas01mc@516 118
mas01mc@516 119 A0 = {1}
mas01mc@516 120 minHeap.insert(A0, score(A0))
mas01mc@516 121
mas01mc@516 122 for i = 1 to T do // T = number of perturbation sets
mas01mc@516 123 repeat
mas01mc@516 124 Ai = minHeap.extractMin()
mas01mc@516 125 As = shift(Ai)
mas01mc@516 126 minHeap.insert(As, score(As))
mas01mc@516 127 Ae = expand(Ai)
mas01mc@516 128 minHeap.insert(Ae, score(Ae))
mas01mc@516 129 until valid(Ai)
mas01mc@516 130 output Ai
mas01mc@516 131 end for
mas01mc@516 132
mas01mc@516 133 */
mas01mc@518 134
mas01mc@518 135 #endif // __LSH_MULTIPROBE_