changeset 768:b9dbe4611dde

Adding Kullback-Leibler divergence as alternate distance function
author mas01mc
date Sat, 15 Oct 2011 17:28:07 +0000
parents 6d0d41604aba
children 512797443824
files Makefile audioDB-internals.h audioDB.cpp audioDB.h audioDB_API.h bindings/python/pyadb.py bindings/python/pyadbmodule.c gengetopt.in query.cpp
diffstat 9 files changed, 86 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Thu Jun 02 16:31:44 2011 +0000
+++ b/Makefile	Sat Oct 15 17:28:07 2011 +0000
@@ -1,7 +1,7 @@
 HELP2MAN=help2man
 GENGETOPT=gengetopt
 SOAPCPP2=soapcpp2
-GSOAP_INCLUDE=$(shell pkg-config --cflags gsoap++)
+GSOAP_INCLUDE=-I$(HOME)/src/gsoap-2.8/gsoap -L$(HOME)/src/gsoap-2.8/gsoap
 GSOAP_CPP=$(shell pkg-config --libs gsoap++)
 GSL_INCLUDE=$(shell pkg-config --cflags gsl)
 LIBGSL=$(shell pkg-config --libs gsl)
@@ -26,8 +26,12 @@
 LIBRARY=lib$(EXECUTABLE).so.$(SOVERSION).$(MINORVERSION)
 SHARED_LIB_FLAGS=-shared -Wl,-soname,lib$(EXECUTABLE).so.$(SOVERSION)
 
+# set for production operation
 override CFLAGS+=-g -O3 -fPIC 
 
+# set for debug operation
+#override CFLAGS+=-ggdb -DDEBUG -fPIC
+
 # set to generate profile (gprof) and coverage (gcov) info
 #override CFLAGS+=-fprofile-arcs -ftest-coverage -pg
 
--- a/audioDB-internals.h	Thu Jun 02 16:31:44 2011 +0000
+++ b/audioDB-internals.h	Sat Oct 15 17:28:07 2011 +0000
@@ -272,6 +272,23 @@
   return result;
 }
 
+static inline double audiodb_kullback_leibler(double *p, double *q, size_t count) {
+  double a,b, tmp1, tmp2, result = 0;
+  while(count--){
+    a = *p++;
+    b = *q++;    
+    tmp1 = a * log( a / b );
+    if(isnan(tmp1))
+      tmp1=0.0;
+    tmp2 = b * log( b / a );
+    if(isnan(tmp2))
+      tmp2=0.0;
+    result += ( tmp1 + tmp2 ) / 2.0;
+  }
+  return result;
+}
+
+
 static inline void audiodb_l2norm_buffer(double *d, size_t dim, size_t nvectors, double *l) {
   while(nvectors--) {
     double *d1 = d;
@@ -386,9 +403,9 @@
 #define ADB_TRACKTABLE_ENTRY_SIZE (sizeof(uint32_t))
 #define ADB_DISTANCE_TOLERANCE (1e-6)
 
-#define ADB_DEFAULT_DATASIZE (1355U) /* in MB */
-#define ADB_DEFAULT_NTRACKS (20000U)
-#define ADB_DEFAULT_DATADIM (9U)
+#define ADB_DEFAULT_DATASIZE (2000U) /* in MB */
+#define ADB_DEFAULT_NTRACKS (200000U)
+#define ADB_DEFAULT_DATADIM (12U)
 
 #define ADB_FIXME_LARGE_ADB_SIZE (ADB_DEFAULT_DATASIZE+1)
 #define ADB_FIXME_LARGE_ADB_NTRACKS (ADB_DEFAULT_NTRACKS+1)
--- a/audioDB.cpp	Thu Jun 02 16:31:44 2011 +0000
+++ b/audioDB.cpp	Sat Oct 15 17:28:07 2011 +0000
@@ -447,6 +447,7 @@
   }
 
   // Set no_unit_norm flag  
+  distance_kullback = args_info.distance_kullback_flag;
   no_unit_norming = args_info.no_unit_norming_flag;
   lsh_use_u_functions = args_info.lsh_use_u_functions_flag;
 
@@ -919,7 +920,10 @@
   case O2_SEQUENCE_QUERY:
   case O2_N_SEQUENCE_QUERY:
     qspec.params.accumulation = ADB_ACCUMULATION_PER_TRACK;
-    qspec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
+    if (distance_kullback)
+      qspec.params.distance = ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE;
+    else
+      qspec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
     qspec.params.npoints = pointNN;
     qspec.params.ntracks = trackNN;
     switch(queryType) {
@@ -941,7 +945,10 @@
     break;
   case O2_ONE_TO_ONE_N_SEQUENCE_QUERY:
     qspec.params.accumulation = ADB_ACCUMULATION_ONE_TO_ONE;
-    qspec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
+    if (distance_kullback)
+      qspec.params.distance = ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE;
+    else
+      qspec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
     qspec.params.npoints = 0;
     qspec.params.ntracks = 0;
     if(!(qspec.refine.flags & ADB_REFINE_RADIUS)) {
@@ -1096,8 +1103,10 @@
   spec.qid.sequence_length = sequenceLength;
   spec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE;
   spec.qid.sequence_start = queryPoint;
-
-  spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
+  if (distance_kullback)
+    spec.params.distance = ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE;
+  else
+    spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
   spec.params.accumulation = ADB_ACCUMULATION_DB;
   spec.params.npoints = nsamples;
 
--- a/audioDB.h	Thu Jun 02 16:31:44 2011 +0000
+++ b/audioDB.h	Sat Oct 15 17:28:07 2011 +0000
@@ -71,6 +71,7 @@
 #define COM_EXHAUSTIVE "--exhaustive"
 #define COM_LSH_EXACT "--lsh_exact"
 #define COM_NO_UNIT_NORMING "--no_unit_norming"
+#define COM_DISTANCE_KULLBACK "--distance_kullback"
 
 #define O2_DEFAULT_POINTNN (10U)
 #define O2_DEFAULT_TRACKNN  (10U)
@@ -236,6 +237,7 @@
   unsigned sequenceHop;
   bool normalizedDistance;
   bool no_unit_norming;
+  bool distance_kullback;
   unsigned queryPoint;
   unsigned usingQueryPoint;
   unsigned usingTimes;
@@ -369,6 +371,7 @@
     sequenceHop(1),				\
     normalizedDistance(true),			\
     no_unit_norming(false),                     \
+    distance_kullback(false),                   \
     queryPoint(0),				\
     usingQueryPoint(0),				\
     usingTimes(0),				\
--- a/audioDB_API.h	Thu Jun 02 16:31:44 2011 +0000
+++ b/audioDB_API.h	Sat Oct 15 17:28:07 2011 +0000
@@ -88,6 +88,7 @@
 #define ADB_DISTANCE_DOT_PRODUCT 1
 #define ADB_DISTANCE_EUCLIDEAN_NORMED 2
 #define ADB_DISTANCE_EUCLIDEAN 3
+#define ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE 4
 
 typedef struct adb_query_parameters {
   uint32_t accumulation;
--- a/bindings/python/pyadb.py	Thu Jun 02 16:31:44 2011 +0000
+++ b/bindings/python/pyadb.py	Sat Oct 15 17:28:07 2011 +0000
@@ -116,7 +116,7 @@
 		exhaustive    : boolean - True for exhaustive (false by default),\n\
 		falsePositives: boolean - True to keep fps (false by defaults),\n\
 		accumulation  : [\"db\"|\"track\"|\"one2one\"] (\"db\" by default),\n\
-		distance      : [\"dot\"|\"eucNorm\"|\"euclidean\"] (\"dot\" by default),\n\
+		distance      : [\"dot\"|\"eucNorm\"|\"euclidean\"|\"kullback\"] (\"dot\" by default),\n\
 		npoints       : int number of points per track,\n\
 		ntracks       : max number of results returned in db accu mode,\n\
 		includeKeys   : list of strings to include (use all by default),\n\
--- a/bindings/python/pyadbmodule.c	Thu Jun 02 16:31:44 2011 +0000
+++ b/bindings/python/pyadbmodule.c	Sat Oct 15 17:28:07 2011 +0000
@@ -419,15 +419,17 @@
 		return NULL;
 	}
 	if (strcmp(distMode, "dot")==0){
-		spec->params.distance = ADB_DISTANCE_DOT_PRODUCT;
+	        spec->params.distance = ADB_DISTANCE_DOT_PRODUCT;
 	}else if (strcmp(distMode, "eucNorm")==0){
-		spec->params.distance = ADB_DISTANCE_EUCLIDEAN_NORMED;
+	        spec->params.distance = ADB_DISTANCE_EUCLIDEAN_NORMED;
 	}else if (strcmp(distMode, "euclidean")==0){
-		spec->params.distance = ADB_DISTANCE_EUCLIDEAN;
+	        spec->params.distance = ADB_DISTANCE_EUCLIDEAN;
+	}else if (strcmp(distMode, "kullback")==0){
+	        spec->params.distance = ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE;
 	}else{
-		PyErr_SetString(PyExc_ValueError, 
-			"Poorly specified distance mode. distance must either be \'dot\', \'eucNorm\' or  \'euclidean\'.\n");
-		return NULL;
+	        PyErr_SetString(PyExc_ValueError, 
+			  "Poorly specified distance mode. distance must either be \'dot\', \'eucNorm\' ,\'euclidean\' or \'kullback\'.\n");
+	        return NULL;
 	}
 	
 	//set up spec->refine
@@ -746,9 +748,11 @@
 		spec->params.distance = ADB_DISTANCE_EUCLIDEAN_NORMED;
 	}else if (strcmp(distMode, "euclidean")==0){
 		spec->params.distance = ADB_DISTANCE_EUCLIDEAN;
+	}else if (strcmp(distMode, "kullback")==0){
+		spec->params.distance = ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE;
 	}else{
 		PyErr_SetString(PyExc_ValueError, 
-			"Poorly specified distance mode. distance must either be \'dot\', \'eucNorm\' or  \'euclidean\'.\n");
+			  "Poorly specified distance mode. distance must either be \'dot\', \'eucNorm\' ,\'euclidean\' or \'kullback\'.\n");
 		return NULL;
 	}
 	
--- a/gengetopt.in	Thu Jun 02 16:31:44 2011 +0000
+++ b/gengetopt.in	Sat Oct 15 17:28:07 2011 +0000
@@ -80,6 +80,7 @@
 section "Normalization control parameters" sectiondesc="These parameters control the normalization of feaures at query time\n"
 
 option "no_unit_norming" - "do not unit norm features when querying an L2Norm databases." flag off
+option "distance_kullback" - "use symmetric kullback divergence for distance function" flag off
 
 section "Web Services" sectiondesc="These commands enable the database process to establish a connection via the internet and operate as separate client and server processes.\n"
 
--- a/query.cpp	Thu Jun 02 16:31:44 2011 +0000
+++ b/query.cpp	Sat Oct 15 17:28:07 2011 +0000
@@ -55,6 +55,7 @@
     break;
   case ADB_DISTANCE_EUCLIDEAN_NORMED:
   case ADB_DISTANCE_EUCLIDEAN:
+  case ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE:
     switch(qspec->params.accumulation) {
     case ADB_ACCUMULATION_DB:
       qstate.accumulator = new DBAccumulator<adb_result_dist_lt>(qspec->params.npoints);
@@ -108,6 +109,10 @@
 static void audiodb_initialize_arrays(adb_t *adb, const adb_query_spec_t *spec, int track, unsigned int numVectors, double *query, double *data_buffer, double **D, double **DD) {
   unsigned int j, k, l, w;
   double *dp, *qp, *sp;
+  double a,b, tmp1;
+#ifdef SYMMETRIC_KL
+  double tmp2;
+#endif
 
   const unsigned wL = spec->qid.sequence_length;
 
@@ -127,8 +132,27 @@
       dp = &D[j][k];  // point to correlation cell j,k
       *dp = 0.0;      // initialize correlation cell
       l = adb->header->dim;         // size of vectors
-      while(l--)
-        *dp += *qp++ * *sp++;
+      if (spec->params.distance!=ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE){
+	while(l--)
+	  *dp += *qp++ * *sp++;
+      }
+      else{ // KL
+	while(l--){
+	  a = *qp++;
+	  b = *sp++;    
+	  tmp1 = a * log( a / b );
+	  if(isnan(tmp1))
+	    tmp1=0.0;
+#ifdef SYMMETRIC_KL
+	  tmp2 = b * log( b / a );
+	  if(isnan(tmp2))
+	    tmp2=0.0;
+	  *dp += ( tmp1 + tmp2 ) / 2.0;
+#else
+	  *dp += tmp1;
+#endif
+	}
+      }
     }
   
   double* spd;
@@ -461,7 +485,11 @@
     if( ( (!power_refine) || audiodb_powers_acceptable(&spec->refine, qpointers->power[qPos], dbpointers.power[sPos])) &&
 	( qPos<qpointers->nvectors-sequence_length+1 && sPos<(*adb->track_lengths)[pp.trackID]-sequence_length+1 ) ){
       // Compute distance
-      dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
+      dist = 1.0e9;
+      if (spec->params.distance==ADB_DISTANCE_EUCLIDEAN_NORMED || spec->params.distance==ADB_DISTANCE_EUCLIDEAN)
+	dist = audiodb_dot_product(query + qPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
+      else if(spec->params.distance==ADB_DISTANCE_KULLBACK_LEIBLER_DIVERGENCE)
+	dist = audiodb_kullback_leibler(query + qPos*adb->header->dim, dbdata + sPos*adb->header->dim, adb->header->dim*sequence_length);
       double qn = qpointers->l2norm[qPos];
       double sn = dbpointers.l2norm[sPos];
       switch(spec->params.distance) {