annotate multiprobe.h @ 524:469b50a3dd84 multiprobeLSH

Fixed a bug in LSH hashtable writing to disk that doesn't always sort the t2 entries into strict weak ordering. Now it does. Lots of debugging informational code inserted.
author mas01mc
date Wed, 28 Jan 2009 16:02:17 +0000
parents fdcd436d7cbd
children
rev   line source
mas01mc@516 1 /*
mas01mc@516 2 * MultiProbe C++ class
mas01mc@516 3 *
mas01mc@518 4 * Given a vector of LSH boundary distances for a query,
mas01mc@518 5 * perform lookup by probing nearby hash-function locations
mas01mc@516 6 *
mas01mc@516 7 * Implementation using C++ STL
mas01mc@516 8 *
mas01mc@516 9 * Reference:
mas01mc@516 10 * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 11 * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 12 * Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 13 *
mas01mc@516 14 *
mas01mc@516 15 * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved
mas01mc@516 16 * License: GNU Public License 2.0
mas01mc@516 17 *
mas01mc@516 18 */
mas01mc@516 19
mas01mc@518 20 #ifndef __LSH_MULTIPROBE_
mas01mc@518 21 #define __LSH_MULTIPROBE_
mas01mc@518 22
mas01mc@516 23 #include <functional>
mas01mc@516 24 #include <queue>
mas01mc@516 25 #include <vector>
mas01mc@516 26 #include <set>
mas01mc@517 27 #include <algorithm>
mas01mc@517 28 #include <iostream>
mas01mc@516 29
mas01mc@518 30 using namespace std;
mas01mc@516 31
mas01mc@516 32 typedef set<int > perturbation_set ;
mas01mc@518 33
mas01mc@518 34 typedef class MinHeapElement{
mas01mc@518 35 public:
mas01mc@518 36 perturbation_set perturbs;
mas01mc@519 37 float score;
mas01mc@519 38 MinHeapElement(perturbation_set a, float s);
mas01mc@518 39 virtual ~MinHeapElement();
mas01mc@518 40 } min_heap_element;
mas01mc@518 41
mas01mc@518 42 typedef priority_queue<min_heap_element,
mas01mc@518 43 vector<min_heap_element>,
mas01mc@518 44 greater<min_heap_element>
mas01mc@518 45 > min_heap_of_perturbation_set ;
mas01mc@518 46
mas01mc@519 47 typedef pair<float, pair<int, int> > sorted_distance_functions ;
mas01mc@516 48
mas01mc@516 49 class MultiProbe{
mas01mc@516 50 protected:
mas01mc@516 51 min_heap_of_perturbation_set* minHeap;
mas01mc@518 52 min_heap_of_perturbation_set* outSets;
mas01mc@518 53 vector<sorted_distance_functions>* distFuns;
mas01mc@517 54 unsigned numHashBoundaries;
mas01mc@516 55
mas01mc@518 56 // data structure initialization and cleanup
mas01mc@518 57 void initialize();
mas01mc@518 58 void cleanup();
mas01mc@518 59
mas01mc@516 60 // perturbation set operations
mas01mc@517 61 perturbation_set& shift(perturbation_set&);
mas01mc@517 62 perturbation_set& expand(perturbation_set&);
mas01mc@519 63 float score(perturbation_set&);
mas01mc@516 64 bool valid(perturbation_set&);
mas01mc@519 65 void makeSortedDistFuns(vector<float> &);
mas01mc@519 66 void makeSortedDistFuns(float* x, unsigned N);
mas01mc@519 67
mas01mc@519 68 // perturbation set generation algorithm
mas01mc@519 69 void algorithm1(unsigned T);
mas01mc@518 70
mas01mc@518 71 public:
mas01mc@518 72 MultiProbe();
mas01mc@518 73 ~MultiProbe();
mas01mc@516 74
mas01mc@516 75 // generation of perturbation sets
mas01mc@519 76 void generatePerturbationSets(vector<float>& vectorOfBounaryDistances, unsigned numSetsToGenerate);
mas01mc@519 77 void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate);
mas01mc@518 78 perturbation_set getNextPerturbationSet();
mas01mc@518 79 void dump(perturbation_set);
mas01mc@518 80 size_t size(); // Number of perturbation sets are in the output queue
mas01mc@518 81 bool empty(); // predicate for empty MultiProbe set
mas01mc@519 82 int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry
mas01mc@519 83 int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry
mas01mc@516 84 };
mas01mc@516 85
mas01mc@518 86
mas01mc@516 87 /* NOTES:
mas01mc@516 88
mas01mc@516 89 Reference:
mas01mc@516 90 Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li,
mas01mc@516 91 "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity
mas01mc@516 92 Search", Proc. Intl. Conf. VLDB, 2007
mas01mc@516 93
mas01mc@516 94 i = 1..M (number of hash functions used)
mas01mc@516 95 f_i(q) = a_i * q + b_i // projection to real line
mas01mc@516 96 h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot
mas01mc@516 97 delta \in {-1, +1}
mas01mc@516 98 x_i( delta ) = distance of query to left or right boundary
mas01mc@516 99 Delta = [delta_1 delta_2 ... delta_M]
mas01mc@516 100 score(Delta) = sum_{i=1}_M x_i(delta_i).^2
mas01mc@516 101
mas01mc@516 102 z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1}
mas01mc@516 103 p_j = (i, delta) if z_j == x_i(delta)
mas01mc@516 104
mas01mc@516 105 A_k is index into p_j
mas01mc@516 106
mas01mc@516 107 Multi-probe algorithm (after Lv et al. 2007)
mas01mc@516 108 ---------------------------------------------
mas01mc@516 109
mas01mc@516 110 A0 = {1}
mas01mc@516 111 minHeap.insert(A0, score(A0))
mas01mc@516 112
mas01mc@516 113 for i = 1 to T do // T = number of perturbation sets
mas01mc@516 114 repeat
mas01mc@516 115 Ai = minHeap.extractMin()
mas01mc@516 116 As = shift(Ai)
mas01mc@516 117 minHeap.insert(As, score(As))
mas01mc@516 118 Ae = expand(Ai)
mas01mc@516 119 minHeap.insert(Ae, score(Ae))
mas01mc@516 120 until valid(Ai)
mas01mc@516 121 output Ai
mas01mc@516 122 end for
mas01mc@516 123
mas01mc@516 124 */
mas01mc@518 125
mas01mc@518 126 #endif // __LSH_MULTIPROBE_