Mercurial > hg > audiodb
changeset 516:2a7bad47a4a7 multiprobeLSH
New branch to implement multiprobe LSH. Copy of trunk:802. Added multiprobe.{cpp,h} source files.
author | mas01mc |
---|---|
date | Sat, 24 Jan 2009 19:51:46 +0000 |
parents | c7bdb7913762 |
children | 807c8be7dd45 |
files | lshlib.cpp multiprobe.cpp multiprobe.h |
diffstat | 3 files changed, 227 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/lshlib.cpp Sat Jan 24 09:39:39 2009 +0000 +++ b/lshlib.cpp Sat Jan 24 19:51:46 2009 +0000 @@ -138,7 +138,7 @@ } } - // Storage for whole or partial function evaluation depdenting on USE_U_FUNCTIONS + // Storage for whole or partial function evaluation depending on USE_U_FUNCTIONS H::initialize_partial_functions(); }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiprobe.cpp Sat Jan 24 19:51:46 2009 +0000 @@ -0,0 +1,116 @@ +/* + * MultiProbe C++ class + * + * Given a LSH structure, perform lookup by probing nearby hash-function locations + * + * Implementation using C++ STL + * + * Reference: + * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, + * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity + * Search", Proc. Intl. Conf. VLDB, 2007 + * + * + * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved + * License: GNU Public License 2.0 + * + */ + +#include "multiprobe.h" + + +#define _TEST_MP_LSH + +MultiProbe::MultiProbe(){ + minHeap = new min_heap_of_perturbation_set(); + outSets = new vector_of_perturbation_set(); +} + +MultiProbe::~MultiProbe(){ + // FIXME: Are these arrays ? + delete minHeap; + delete outSets; +} + +perturbation_set& MultiProbe::shift(perturbation_set a){ + // a[a.length]++ +} + +perturbation_set& MultiProbe::expand(perturbation_set a){ + // a[a.length+1]=a[a.length]+1; +} + +// Generate the optimal T perturbation sets for current query +// pre-conditions: +// an LSH structure was initialized and passed to constructor +// a query vector was passed to lsh->compute_hash_functions() +// the query-to-boundary distances are stored in x[hashFunIndex] +// +// post-conditions: +// +// generates an ordered list of perturbation sets (stored as an array of sets) +// these are indexes into pi_j=(i,delta) pairs representing x_i(delta) in sort order z_j +// data structures are cleared and reset to zeros thereby making them re-entrant +// +void MultiProbe::generatePerturbationSets(int T, vector<int>& x){ + perturbation_set ai,as,ae; + + distFuns = makeSortedDistFuns(x); // NEED TO IMPLEMENT, distances from query to boundaries + + ai.insert(1); + minHeap->push(make_pair(perturbation_set(ai), score(ai))); // unique instance stored in mhe + + for(int i = 0 ; i < T ; i++ ){ + do{ + min_heap_element mhe = minHeap->top(); + minHeap->pop(); + ai = mhe.first; + as = shift(perturbation_set(ai)); + minHeap->push(make_pair(perturbation_set(as), score(as))); + ae = expand(perturbation_set(ai)); + minHeap->push(make_pair(perturbation_set(ae), score(ae))); + }while(!valid(ai)); + outSets->push_back(ai); // Ordered list of perturbation sets + } +} + + +// Take the list of distances (x) assuming list len is 2M and +// delta = (-1)^i, i = { 0 .. 2M-1 } +sorted_distance_functions& MultiProbe::makeSortedDistFuns(vector<int>& x){ + // Change these to vector<pair<double, pair<int,int> > > + sorted_distance_functions* distFuns = new sorted_distance_functions[x.size()]; + for(int i = 0; i < x.size() ; i++){ + distFuns[i] = make_pair(x[i], make_pair(i, i%2?-1:+1)); + } +} + +double MultiProbe::score(perturbation_set& a){ + // Needs implementing +} + +bool MultiProbe::valid(perturbation_set& a){ + + + // A valid set must have at most one + // of the two elements {j, 2M + 1 - j} for every j + // + // A perturbation set containing an element > 2M is invalid + + + +} + +#ifdef _TEST_MP_LSH +int main(const int argc, const char* argv[]){ + + MultiProbe mp(); + +} +#endif + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multiprobe.h Sat Jan 24 19:51:46 2009 +0000 @@ -0,0 +1,110 @@ +/* + * MultiProbe C++ class + * + * Given a LSH structure, perform lookup by probing nearby hash-function locations + * + * Implementation using C++ STL + * + * Reference: + * Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, + * "Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity + * Search", Proc. Intl. Conf. VLDB, 2007 + * + * + * Copyright (C) 2009 Michael Casey, Dartmouth College, All Rights Reserved + * License: GNU Public License 2.0 + * + */ + +#include <functional> +#include <queue> +#include <vector> +#include <set> + +using namespace std ; + +typedef set<int > perturbation_set ; +typedef vector<perturbation_set > vector_of_perturbation_set ; +typedef pair<perturbation_set, float > min_heap_element ; +typedef vector<min_heap_element > vector_of_min_heap_element ; +typedef priority_queue< + min_heap_element, + vector_of_min_heap_element, + greater<min_heap_element> + > min_heap_of_perturbation_set ; +typedef pair<double, pair<int, int> > sorted_distance_functions ; + +bool operator>(const min_heap_element& a, const min_heap_element& b){ + return a.second > b.second; +} + +bool operator<(const min_heap_element& a, const min_heap_element& b){ + return a.second < b.second; +} + +bool operator>(const sorted_distance_functions& a, const sorted_distance_functions& b){ + return a.second > b.second; +} + +bool operator<(const sorted_distance_functions& a, const sorted_distance_functions& b){ + return a.second < b.second; +} + +class MultiProbe{ +protected: + min_heap_of_perturbation_set* minHeap; + vector_of_perturbation_set* outSets; + sorted_distance_functions distFuns; +public: + MultiProbe(); + ~MultiProbe(); + + // perturbation set operations + perturbation_set& shift(perturbation_set); + perturbation_set& expand(perturbation_set); + double score(perturbation_set&); + bool valid(perturbation_set&); + + // generation of perturbation sets + void generatePerturbationSets(int, vector<int>&); + sorted_distance_functions& makeSortedDistFuns(vector<int> &); +}; + +/* NOTES: + +Reference: +Qin Lv, William Josephson, Zhe Wang, Moses Charikar and Kai Li, +"Multi-Probe LSH: Efficient Indexing for High-Dimensional Similarity +Search", Proc. Intl. Conf. VLDB, 2007 + + i = 1..M (number of hash functions used) + f_i(q) = a_i * q + b_i // projection to real line + h_i(q) = floor( ( a_i * q + b_i ) / w ) // hash slot + delta \in {-1, +1} + x_i( delta ) = distance of query to left or right boundary + Delta = [delta_1 delta_2 ... delta_M] + score(Delta) = sum_{i=1}_M x_i(delta_i).^2 + + z = sort(x(delta_i), increasing) // i = 1..2M delta={+1,-1} + p_j = (i, delta) if z_j == x_i(delta) + + A_k is index into p_j + + Multi-probe algorithm (after Lv et al. 2007) + --------------------------------------------- + + A0 = {1} + minHeap.insert(A0, score(A0)) + + for i = 1 to T do // T = number of perturbation sets + repeat + Ai = minHeap.extractMin() + As = shift(Ai) + minHeap.insert(As, score(As)) + Ae = expand(Ai) + minHeap.insert(Ae, score(Ae)) + until valid(Ai) + output Ai + end for + + */