comparison audioDB.cpp @ 693:b1723ae7675e

begin work on sampling API This is motivated by the need to be able to sample with arbitrary feature data (e.g. from a feature file) against a database, for the JNMR "collections" paper revisions or possible ISMIR paper revisions. That bit doesn't work yet, but the C-ified version of the current functionality (sample db x db and sample key x db) works to the level of anecdotal tests. The general approach is to mirror the _query_spec() API, where a whole heap of knobs and twiddles are available to the user. Unlike in the _query_spec() API, not quite all of the knobs make sense (and even fewer are actually implemented), but the basic idea is the same. I pity the poor chump who will have to document all this.
author mas01cr
date Thu, 22 Apr 2010 21:03:47 +0000
parents c62041316a44
children 01e25f938b63
comparison
equal deleted inserted replaced
692:02756c5ca15a 693:b1723ae7675e
145 munmap(timesFileNameTable, fileTableLength); 145 munmap(timesFileNameTable, fileTableLength);
146 if(powerFileNameTable) 146 if(powerFileNameTable)
147 munmap(powerFileNameTable, fileTableLength); 147 munmap(powerFileNameTable, fileTableLength);
148 if(reporter) 148 if(reporter)
149 delete reporter; 149 delete reporter;
150 if(rng)
151 gsl_rng_free(rng);
152 if(infid>0) { 150 if(infid>0) {
153 close(infid); 151 close(infid);
154 infid = 0; 152 infid = 0;
155 } 153 }
156 if(powerfd) { 154 if(powerfd) {
844 datum.key = key; 842 datum.key = key;
845 } else { 843 } else {
846 int fd; 844 int fd;
847 struct stat st; 845 struct stat st;
848 846
849 /* FIXME: around here there are all sorts of hideous leaks. */ 847 /* FIXME: around here error conditions will cause all sorts of
848 hideous leaks. */
850 fd = open(inFile, O_RDONLY); 849 fd = open(inFile, O_RDONLY);
851 if(fd < 0) { 850 if(fd < 0) {
852 error("failed to open feature file", inFile); 851 error("failed to open feature file", inFile);
853 } 852 }
854 fstat(fd, &st); 853 fstat(fd, &st);
1008 adbLisztResponse->result.__sizeRlen = k; 1007 adbLisztResponse->result.__sizeRlen = k;
1009 } 1008 }
1010 audiodb_liszt_free_results(adb, results); 1009 audiodb_liszt_free_results(adb, results);
1011 } 1010 }
1012 1011
1012 #include <gsl/gsl_sf.h>
1013
1014 static
1015 double yfun(double d) {
1016 return gsl_sf_log(d) - gsl_sf_psi(d);
1017 }
1018
1019 static
1020 double yinv(double y) {
1021 double a = 1.0e-5;
1022 double b = 1000.0;
1023
1024 double ay = yfun(a);
1025 double by = yfun(b);
1026
1027 double c = 0;
1028 double cy;
1029
1030 /* FIXME: simple binary search; there's probably some clever solver
1031 in gsl somewhere which is less sucky. */
1032 while ((b - a) > 1.0e-5) {
1033 c = (a + b) / 2;
1034 cy = yfun(c);
1035 if (cy > y) {
1036 a = c;
1037 ay = cy;
1038 } else {
1039 b = c;
1040 by = cy;
1041 }
1042 }
1043
1044 return c;
1045 }
1046
1047 void audioDB::sample(const char *dbName) {
1048 if(!adb) {
1049 if(!(adb = audiodb_open(dbName, O_RDONLY))) {
1050 error("failed to open database", dbName);
1051 }
1052 }
1053
1054 adb_status_t status;
1055 if(audiodb_status(adb, &status)) {
1056 error("error getting status");
1057 }
1058
1059 double sumdist = 0;
1060 double sumlogdist = 0;
1061
1062 adb_query_results_t *results;
1063 adb_query_spec_t spec = {{0},{0},{0}};
1064 adb_datum_t datum = {0};
1065
1066 spec.refine.qhopsize = sequenceHop;
1067 spec.refine.ihopsize = sequenceHop;
1068 if(sequenceHop != 1) {
1069 spec.refine.flags |= ADB_REFINE_HOP_SIZE;
1070 }
1071
1072 if(query_from_key) {
1073 datum.key = key;
1074 spec.qid.datum = &datum;
1075 spec.qid.flags |= ADB_QID_FLAG_EXHAUSTIVE;
1076 spec.refine.flags |= ADB_REFINE_EXCLUDE_KEYLIST;
1077 spec.refine.exclude.nkeys = 1;
1078 spec.refine.exclude.keys = &key;
1079 } else if(inFile) {
1080 error("sample from feature file not supported (yet)");
1081 } else {
1082 spec.qid.datum = NULL; /* full db sample */
1083 }
1084 spec.qid.sequence_length = sequenceLength;
1085 spec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE;
1086 spec.qid.sequence_start = queryPoint;
1087
1088 spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED;
1089 spec.params.accumulation = ADB_ACCUMULATION_DB;
1090 spec.params.npoints = nsamples;
1091
1092 if(!(results = audiodb_sample_spec(adb, &spec))) {
1093 error("error in audiodb_sample_spec");
1094 }
1095
1096 if(results->nresults != nsamples) {
1097 error("mismatch in sample count");
1098 }
1099
1100 for(uint32_t i = 0; i < nsamples; i++) {
1101 double d = results->results[i].dist;
1102 sumdist += d;
1103 sumlogdist += log(d);
1104 }
1105
1106 audiodb_query_free_results(adb, &spec, results);
1107
1108 unsigned total = 0;
1109 unsigned count = 0;
1110 adb_liszt_results_t *liszt;
1111 if(!(liszt = audiodb_liszt(adb))) {
1112 error("liszt failed");
1113 }
1114 for(uint32_t i = 0; i < liszt->nresults; i++) {
1115 unsigned int prop = (liszt->entries[i].nvectors - sequenceLength)/sequenceHop + 1;
1116 prop = prop > 0 ? prop : 0;
1117 if (prop > 0) {
1118 count++;
1119 }
1120 total += prop;
1121 }
1122
1123 /* FIXME: the mean isn't really what we should be using here; it's
1124 more a question of "how many independent sequences of length
1125 sequenceLength are there in the database? */
1126 unsigned meanN = total / count;
1127
1128 double sigma2 = sumdist / (sequenceLength * status.dim * nsamples);
1129 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
1130
1131 std::cout << "Summary statistics" << std::endl;
1132 std::cout << "number of samples: " << nsamples << std::endl;
1133 std::cout << "sum of distances (S): " << sumdist << std::endl;
1134 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
1135
1136 /* FIXME: we'll also want some more summary statistics based on
1137 propTable, for the minimum-of-X estimate */
1138 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
1139 std::cout << std::endl;
1140 std::cout << "Estimated parameters" << std::endl;
1141 std::cout << "sigma^2: " << sigma2 << "; ";
1142 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl;
1143 std::cout << "d: " << d << std::endl;
1144
1145 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
1146 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
1147 - (2 / d) * gsl_sf_log(meanN)
1148 - gsl_sf_log(d/2)
1149 - (2 / d) * gsl_sf_log(2 / d)
1150 + (2 / d) * gsl_sf_lngamma(d / 2);
1151
1152 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
1153 }
1154
1155
1013 // This entry point is visited once per instance 1156 // This entry point is visited once per instance
1014 // so it is a good place to set any global state variables 1157 // so it is a good place to set any global state variables
1015 int main(const int argc, const char* argv[]){ 1158 int main(const int argc, const char* argv[]){
1016 SERVER_ADB_ROOT = 0; // Server-side database root prefix 1159 SERVER_ADB_ROOT = 0; // Server-side database root prefix
1017 SERVER_ADB_FEATURE_ROOT = 0; // Server-side features root prefix 1160 SERVER_ADB_FEATURE_ROOT = 0; // Server-side features root prefix