annotate multiprobe.h @ 755:37c2b9cce23a multiprobeLSH

Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author mas01mc
date Thu, 25 Nov 2010 13:42:40 +0000
parents fdcd436d7cbd
children
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@518 4 * Given a vector of LSH boundary distances for a query,
mas01mc@518 5 * perform lookup by probing nearby hash-function locations
mas01mc@516 6 *
mas01mc@516 7 * Implementation using C++ STL
mas01mc@516 8 *
mas01mc@516 9 * Reference:
mas01mc@516 10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 12 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 13 *
mas01mc@516 14 *
mas01mc@516 15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 16 * License: GNU Public License 2.0
mas01mc@516 17 *
mas01mc@516 18 */
mas01mc@516 19
mas01mc@518 20 #ifndef __LSH_MULTIPROBE_
mas01mc@518 21 #define __LSH_MULTIPROBE_
mas01mc@518 22
mas01mc@516 23 #include <functional>
mas01mc@516 24 #include <queue>
mas01mc@516 25 #include <vector>
mas01mc@516 26 #include <set>
mas01mc@517 27 #include <algorithm>
mas01mc@517 28 #include <iostream>
mas01mc@516 29
mas01mc@518 30 using namespace std;
mas01mc@516 31
mas01mc@516 32 typedef set<int > perturbation_set ;
mas01mc@518 33
mas01mc@518 34 typedef class MinHeapElement{
mas01mc@518 35 public:
mas01mc@518 36 perturbation_set perturbs;
mas01mc@519 37 float score;
mas01mc@519 38 MinHeapElement(perturbation_set a, float s);
mas01mc@518 39 virtual ~MinHeapElement();
mas01mc@518 40 } min_heap_element;
mas01mc@518 41
mas01mc@518 42 typedef priority_queue<min_heap_element,
mas01mc@518 43 vector<min_heap_element>,
mas01mc@518 44 greater<min_heap_element>
mas01mc@518 45 > min_heap_of_perturbation_set ;
mas01mc@518 46
mas01mc@519 47 typedef pair<float, pair<int, int> > sorted_distance_functions ;
mas01mc@516 48
mas01mc@516 49 class MultiProbe{
mas01mc@516 50 protected:
mas01mc@516 51 min_heap_of_perturbation_set* minHeap;
mas01mc@518 52 min_heap_of_perturbation_set* outSets;
mas01mc@518 53 vector<sorted_distance_functions>* distFuns;
mas01mc@517 54 unsigned numHashBoundaries;
mas01mc@516 55
mas01mc@518 56 // data structure initialization and cleanup
mas01mc@518 57 void initialize();
mas01mc@518 58 void cleanup();
mas01mc@518 59
mas01mc@516 60 // perturbation set operations
mas01mc@517 61 perturbation_set& shift(perturbation_set&);
mas01mc@517 62 perturbation_set& expand(perturbation_set&);
mas01mc@519 63 float score(perturbation_set&);
mas01mc@516 64 bool valid(perturbation_set&);
mas01mc@519 65 void makeSortedDistFuns(vector<float> &);
mas01mc@519 66 void makeSortedDistFuns(float* x, unsigned N);
mas01mc@519 67
mas01mc@519 68 // perturbation set generation algorithm
mas01mc@519 69 void algorithm1(unsigned T);
mas01mc@518 70
mas01mc@518 71 public:
mas01mc@518 72 MultiProbe();
mas01mc@518 73 ~MultiProbe();
mas01mc@516 74
mas01mc@516 75 // generation of perturbation sets
mas01mc@519 76 void generatePerturbationSets(vector<float>& vectorOfBounaryDistances, unsigned numSetsToGenerate);
mas01mc@519 77 void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate);
mas01mc@518 78 perturbation_set getNextPerturbationSet();
mas01mc@518 79 void dump(perturbation_set);
mas01mc@518 80 size_t size(); // Number of perturbation sets are in the output queue
mas01mc@518 81 bool empty(); // predicate for empty MultiProbe set
mas01mc@519 82 int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry
mas01mc@519 83 int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry
mas01mc@516 84 };
mas01mc@516 85
mas01mc@518 86
mas01mc@516 87 /* NOTES:
mas01mc@516 88
mas01mc@516 89 Reference:
mas01mc@516 90 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 91 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 92 Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 93
mas01mc@516 94 i = 1..M (number of hash functions used)
mas01mc@516 95 f_i(q) = a_i * q + b_i // projection to real line
mas01mc@516 96 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
mas01mc@516 97 delta \in {-1, +1}
mas01mc@516 98 x_i( delta ) = distance of query to left or right boundary
mas01mc@516 99 Delta = [delta_1 delta_2 ... delta_M]
mas01mc@516 100 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
mas01mc@516 101
mas01mc@516 102 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
mas01mc@516 103 p_j = (i, delta) if z_j == x_i(delta)
mas01mc@516 104
mas01mc@516 105 A_k is index into p_j
mas01mc@516 106
mas01mc@516 107 Multi-probe algorithm (after Lv et al. 2007)
mas01mc@516 108 ---------------------------------------------
mas01mc@516 109
mas01mc@516 110 A0 = {1}
mas01mc@516 111 minHeap.insert(A0, score(A0))
mas01mc@516 112
mas01mc@516 113 for i = 1 to T do // T = number of perturbation sets
mas01mc@516 114 repeat
mas01mc@516 115 Ai = minHeap.extractMin()
mas01mc@516 116 As = shift(Ai)
mas01mc@516 117 minHeap.insert(As, score(As))
mas01mc@516 118 Ae = expand(Ai)
mas01mc@516 119 minHeap.insert(Ae, score(Ae))
mas01mc@516 120 until valid(Ai)
mas01mc@516 121 output Ai
mas01mc@516 122 end for
mas01mc@516 123
mas01mc@516 124 */
mas01mc@518 125
mas01mc@518 126 #endif // __LSH_MULTIPROBE_