Mercurial > hg > audiodb
diff multiprobe.h @ 754:9bd13c7819ae mkc_lsh_update
Adding mkc_lsh_update branch, trunk candidate with improved LSH: merged trunk 1095 and branch multiprobe_lsh
author | mas01mc |
---|---|
date | Thu, 25 Nov 2010 13:42:40 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiprobe.h Thu Nov 25 13:42:40 2010 +0000 @@ -0,0 +1,126 @@ +/* + * MultiProbe C++ class + * + * Given a vector of LSH boundary distances for a query, + * perform lookup by probing nearby hash-function locations + * + * Implementation using C++ STL + * + * Reference: + * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, + * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity + * Search", Proc. Intl. Conf. VLDB, 2007 + * + * + * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved + * License: GNU Public License 2.0 + * + */ + +#ifndef __LSH_MULTIPROBE_ +#define __LSH_MULTIPROBE_ + +#include <functional> +#include <queue> +#include <vector> +#include <set> +#include <algorithm> +#include <iostream> + +using namespace std; + +typedef set<int > perturbation_set ; + +typedef class MinHeapElement{ +public: + perturbation_set perturbs; + float score; + MinHeapElement(perturbation_set a, float s); + virtual ~MinHeapElement(); +} min_heap_element; + +typedef priority_queue<min_heap_element, + vector<min_heap_element>, + greater<min_heap_element> + > min_heap_of_perturbation_set ; + +typedef pair<float, pair<int, int> > sorted_distance_functions ; + +class MultiProbe{ +protected: + min_heap_of_perturbation_set* minHeap; + min_heap_of_perturbation_set* outSets; + vector<sorted_distance_functions>* distFuns; + unsigned numHashBoundaries; + + // data structure initialization and cleanup + void initialize(); + void cleanup(); + + // perturbation set operations + perturbation_set& shift(perturbation_set&); + perturbation_set& expand(perturbation_set&); + float score(perturbation_set&); + bool valid(perturbation_set&); + void makeSortedDistFuns(vector<float> &); + void makeSortedDistFuns(float* x, unsigned N); + + // perturbation set generation algorithm + void algorithm1(unsigned T); + +public: + MultiProbe(); + ~MultiProbe(); + + // generation of perturbation sets + void generatePerturbationSets(vector<float>& vectorOfBounaryDistances, unsigned numSetsToGenerate); + void generatePerturbationSets(float* arrayOfBoundaryDistances, unsigned numDistances, unsigned numSetsToGenerate); + perturbation_set getNextPerturbationSet(); + void dump(perturbation_set); + size_t size(); // Number of perturbation sets are in the output queue + bool empty(); // predicate for empty MultiProbe set + int getIndex(perturbation_set::iterator it); // return index of hash function for given set entry + int getBoundary(perturbation_set::iterator it); // return boundary {-1,+1} for given set entry +}; + + +/* NOTES: + +Reference: +Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, +"Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity +Search", Proc. Intl. Conf. VLDB, 2007 + + i = 1..M (number of hash functions used) + f_i(q) = a_i * q + b_i // projection to real line + h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot + delta \in {-1, +1} + x_i( delta ) = distance of query to left or right boundary + Delta = [delta_1 delta_2 ... delta_M] + score(Delta) = sum_{i=1}_M x_i(delta_i).^2 + + z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1} + p_j = (i, delta) if z_j == x_i(delta) + + A_k is index into p_j + + Multi-probe algorithm (after Lv et al. 2007) + --------------------------------------------- + + A0 = {1} + minHeap.insert(A0, score(A0)) + + for i = 1 to T do // T = number of perturbation sets + repeat + Ai = minHeap.extractMin() + As = shift(Ai) + minHeap.insert(As, score(As)) + Ae = expand(Ai) + minHeap.insert(Ae, score(Ae)) + until valid(Ai) + output Ai + end for + + */ + +#endif // __LSH_MULTIPROBE_