mas01mc@764: /* mas01mc@764: * MultiProbe C++ class mas01mc@764: * mas01mc@764: * Given a vector of LSH boundary distances for a query, mas01mc@764: * perform lookup by probing nearby hash-function locations mas01mc@764: * mas01mc@764: * Implementation using C++ STL mas01mc@764: * mas01mc@764: * Reference: mas01mc@764: * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, mas01mc@764: * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity mas01mc@764: * Search", Proc. Intl. Conf. VLDB, 2007 mas01mc@764: * mas01mc@764: * mas01mc@764: * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved mas01mc@764: * License: GNU Public License 2.0 mas01mc@764: * mas01mc@764: */ mas01mc@764: mas01mc@764: #ifndef __LSH_MULTIPROBE_ mas01mc@764: #define __LSH_MULTIPROBE_ mas01mc@764: mas01mc@764: #include mas01mc@764: #include mas01mc@764: #include mas01mc@764: #include mas01mc@764: #include mas01mc@764: #include mas01mc@764: mas01mc@764: using namespace std; mas01mc@764: mas01mc@764: typedef set perturbation_set ; mas01mc@764: mas01mc@764: typedef class MinHeapElement{ mas01mc@764: public: mas01mc@764: perturbation_set perturbs; mas01mc@764: float score; mas01mc@764: MinHeapElement(perturbation_set a, float s); mas01mc@764: virtual ~MinHeapElement(); mas01mc@764: } min_heap_element; mas01mc@764: mas01mc@764: typedef priority_queue, mas01mc@764: greater mas01mc@764: > min_heap_of_perturbation_set ; mas01mc@764: mas01mc@764: typedef pair > sorted_distance_functions ; mas01mc@764: mas01mc@764: class MultiProbe{ mas01mc@764: protected: mas01mc@764: min_heap_of_perturbation_set* minHeap; mas01mc@764: min_heap_of_perturbation_set* outSets; mas01mc@764: vector* distFuns; mas01mc@764: unsigned numHashBoundaries; mas01mc@764: mas01mc@764: // data structure initialization and cleanup mas01mc@764: void initialize(); mas01mc@764: void cleanup(); mas01mc@764: mas01mc@764: // perturbation set operations mas01mc@764: perturbation_set& shift(perturbation_set&); mas01mc@764: perturbation_set& expand(perturbation_set&); mas01mc@764: float score(perturbation_set&); mas01mc@764: bool valid(perturbation_set&); mas01mc@764: void makeSortedDistFuns(vector &); mas01mc@764: void makeSortedDistFuns(float* x, unsigned N); mas01mc@764: mas01mc@764: // perturbation set generation algorithm mas01mc@764: void algorithm1(unsigned T); mas01mc@764: mas01mc@764: public: mas01mc@764: MultiProbe(); mas01mc@764: ~MultiProbe(); mas01mc@764: mas01mc@764: // generation of perturbation sets mas01mc@764: void generatePerturbationSets(vector& vectorOfBounaryDistances, unsigned numSetsToGenerate); mas01mc@764: void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate); mas01mc@764: perturbation_set getNextPerturbationSet(); mas01mc@764: void dump(perturbation_set); mas01mc@764: size_t size(); // Number of perturbation sets are in the output queue mas01mc@764: bool empty(); // predicate for empty MultiProbe set mas01mc@764: int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry mas01mc@764: int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry mas01mc@764: }; mas01mc@764: mas01mc@764: mas01mc@764: /* NOTES: mas01mc@764: mas01mc@764: Reference: mas01mc@764: Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, mas01mc@764: "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity mas01mc@764: Search", Proc. Intl. Conf. VLDB, 2007 mas01mc@764: mas01mc@764: i = 1..M (number of hash functions used) mas01mc@764: f_i(q) = a_i * q + b_i // projection to real line mas01mc@764: h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot mas01mc@764: delta \in {-1, +1} mas01mc@764: x_i( delta ) = distance of query to left or right boundary mas01mc@764: Delta = [delta_1 delta_2 ... delta_M] mas01mc@764: score(Delta) = sum_{i=1}_M x_i(delta_i).^2 mas01mc@764: mas01mc@764: z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1} mas01mc@764: p_j = (i, delta) if z_j == x_i(delta) mas01mc@764: mas01mc@764: A_k is index into p_j mas01mc@764: mas01mc@764: Multi-probe algorithm (after Lv et al. 2007) mas01mc@764: --------------------------------------------- mas01mc@764: mas01mc@764: A0 = {1} mas01mc@764: minHeap.insert(A0, score(A0)) mas01mc@764: mas01mc@764: for i = 1 to T do // T = number of perturbation sets mas01mc@764: repeat mas01mc@764: Ai = minHeap.extractMin() mas01mc@764: As = shift(Ai) mas01mc@764: minHeap.insert(As, score(As)) mas01mc@764: Ae = expand(Ai) mas01mc@764: minHeap.insert(Ae, score(Ae)) mas01mc@764: until valid(Ai) mas01mc@764: output Ai mas01mc@764: end for mas01mc@764: mas01mc@764: */ mas01mc@764: mas01mc@764: #endif // __LSH_MULTIPROBE_