Mercurial > hg > audiodb
comparison audioDB.cpp @ 693:b1723ae7675e
begin work on sampling API
This is motivated by the need to be able to sample with arbitrary feature data
(e.g. from a feature file) against a database, for the JNMR "collections" paper
revisions or possible ISMIR paper revisions. That bit doesn't work yet, but
the C-ified version of the current functionality (sample db x db and
sample key x db) works to the level of anecdotal tests.
The general approach is to mirror the _query_spec() API, where a whole heap
of knobs and twiddles are available to the user. Unlike in the _query_spec()
API, not quite all of the knobs make sense (and even fewer are actually
implemented), but the basic idea is the same.
I pity the poor chump who will have to document all this.
author | mas01cr |
---|---|
date | Thu, 22 Apr 2010 21:03:47 +0000 |
parents | c62041316a44 |
children | 01e25f938b63 |
comparison
equal
deleted
inserted
replaced
692:02756c5ca15a | 693:b1723ae7675e |
---|---|
145 munmap(timesFileNameTable, fileTableLength); | 145 munmap(timesFileNameTable, fileTableLength); |
146 if(powerFileNameTable) | 146 if(powerFileNameTable) |
147 munmap(powerFileNameTable, fileTableLength); | 147 munmap(powerFileNameTable, fileTableLength); |
148 if(reporter) | 148 if(reporter) |
149 delete reporter; | 149 delete reporter; |
150 if(rng) | |
151 gsl_rng_free(rng); | |
152 if(infid>0) { | 150 if(infid>0) { |
153 close(infid); | 151 close(infid); |
154 infid = 0; | 152 infid = 0; |
155 } | 153 } |
156 if(powerfd) { | 154 if(powerfd) { |
844 datum.key = key; | 842 datum.key = key; |
845 } else { | 843 } else { |
846 int fd; | 844 int fd; |
847 struct stat st; | 845 struct stat st; |
848 | 846 |
849 /* FIXME: around here there are all sorts of hideous leaks. */ | 847 /* FIXME: around here error conditions will cause all sorts of |
848 hideous leaks. */ | |
850 fd = open(inFile, O_RDONLY); | 849 fd = open(inFile, O_RDONLY); |
851 if(fd < 0) { | 850 if(fd < 0) { |
852 error("failed to open feature file", inFile); | 851 error("failed to open feature file", inFile); |
853 } | 852 } |
854 fstat(fd, &st); | 853 fstat(fd, &st); |
1008 adbLisztResponse->result.__sizeRlen = k; | 1007 adbLisztResponse->result.__sizeRlen = k; |
1009 } | 1008 } |
1010 audiodb_liszt_free_results(adb, results); | 1009 audiodb_liszt_free_results(adb, results); |
1011 } | 1010 } |
1012 | 1011 |
1012 #include <gsl/gsl_sf.h> | |
1013 | |
1014 static | |
1015 double yfun(double d) { | |
1016 return gsl_sf_log(d) - gsl_sf_psi(d); | |
1017 } | |
1018 | |
1019 static | |
1020 double yinv(double y) { | |
1021 double a = 1.0e-5; | |
1022 double b = 1000.0; | |
1023 | |
1024 double ay = yfun(a); | |
1025 double by = yfun(b); | |
1026 | |
1027 double c = 0; | |
1028 double cy; | |
1029 | |
1030 /* FIXME: simple binary search; there's probably some clever solver | |
1031 in gsl somewhere which is less sucky. */ | |
1032 while ((b - a) > 1.0e-5) { | |
1033 c = (a + b) / 2; | |
1034 cy = yfun(c); | |
1035 if (cy > y) { | |
1036 a = c; | |
1037 ay = cy; | |
1038 } else { | |
1039 b = c; | |
1040 by = cy; | |
1041 } | |
1042 } | |
1043 | |
1044 return c; | |
1045 } | |
1046 | |
1047 void audioDB::sample(const char *dbName) { | |
1048 if(!adb) { | |
1049 if(!(adb = audiodb_open(dbName, O_RDONLY))) { | |
1050 error("failed to open database", dbName); | |
1051 } | |
1052 } | |
1053 | |
1054 adb_status_t status; | |
1055 if(audiodb_status(adb, &status)) { | |
1056 error("error getting status"); | |
1057 } | |
1058 | |
1059 double sumdist = 0; | |
1060 double sumlogdist = 0; | |
1061 | |
1062 adb_query_results_t *results; | |
1063 adb_query_spec_t spec = {{0},{0},{0}}; | |
1064 adb_datum_t datum = {0}; | |
1065 | |
1066 spec.refine.qhopsize = sequenceHop; | |
1067 spec.refine.ihopsize = sequenceHop; | |
1068 if(sequenceHop != 1) { | |
1069 spec.refine.flags |= ADB_REFINE_HOP_SIZE; | |
1070 } | |
1071 | |
1072 if(query_from_key) { | |
1073 datum.key = key; | |
1074 spec.qid.datum = &datum; | |
1075 spec.qid.flags |= ADB_QID_FLAG_EXHAUSTIVE; | |
1076 spec.refine.flags |= ADB_REFINE_EXCLUDE_KEYLIST; | |
1077 spec.refine.exclude.nkeys = 1; | |
1078 spec.refine.exclude.keys = &key; | |
1079 } else if(inFile) { | |
1080 error("sample from feature file not supported (yet)"); | |
1081 } else { | |
1082 spec.qid.datum = NULL; /* full db sample */ | |
1083 } | |
1084 spec.qid.sequence_length = sequenceLength; | |
1085 spec.qid.flags |= usingQueryPoint ? 0 : ADB_QID_FLAG_EXHAUSTIVE; | |
1086 spec.qid.sequence_start = queryPoint; | |
1087 | |
1088 spec.params.distance = no_unit_norming ? ADB_DISTANCE_EUCLIDEAN : ADB_DISTANCE_EUCLIDEAN_NORMED; | |
1089 spec.params.accumulation = ADB_ACCUMULATION_DB; | |
1090 spec.params.npoints = nsamples; | |
1091 | |
1092 if(!(results = audiodb_sample_spec(adb, &spec))) { | |
1093 error("error in audiodb_sample_spec"); | |
1094 } | |
1095 | |
1096 if(results->nresults != nsamples) { | |
1097 error("mismatch in sample count"); | |
1098 } | |
1099 | |
1100 for(uint32_t i = 0; i < nsamples; i++) { | |
1101 double d = results->results[i].dist; | |
1102 sumdist += d; | |
1103 sumlogdist += log(d); | |
1104 } | |
1105 | |
1106 audiodb_query_free_results(adb, &spec, results); | |
1107 | |
1108 unsigned total = 0; | |
1109 unsigned count = 0; | |
1110 adb_liszt_results_t *liszt; | |
1111 if(!(liszt = audiodb_liszt(adb))) { | |
1112 error("liszt failed"); | |
1113 } | |
1114 for(uint32_t i = 0; i < liszt->nresults; i++) { | |
1115 unsigned int prop = (liszt->entries[i].nvectors - sequenceLength)/sequenceHop + 1; | |
1116 prop = prop > 0 ? prop : 0; | |
1117 if (prop > 0) { | |
1118 count++; | |
1119 } | |
1120 total += prop; | |
1121 } | |
1122 | |
1123 /* FIXME: the mean isn't really what we should be using here; it's | |
1124 more a question of "how many independent sequences of length | |
1125 sequenceLength are there in the database? */ | |
1126 unsigned meanN = total / count; | |
1127 | |
1128 double sigma2 = sumdist / (sequenceLength * status.dim * nsamples); | |
1129 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples); | |
1130 | |
1131 std::cout << "Summary statistics" << std::endl; | |
1132 std::cout << "number of samples: " << nsamples << std::endl; | |
1133 std::cout << "sum of distances (S): " << sumdist << std::endl; | |
1134 std::cout << "sum of log distances (L): " << sumlogdist << std::endl; | |
1135 | |
1136 /* FIXME: we'll also want some more summary statistics based on | |
1137 propTable, for the minimum-of-X estimate */ | |
1138 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl; | |
1139 std::cout << std::endl; | |
1140 std::cout << "Estimated parameters" << std::endl; | |
1141 std::cout << "sigma^2: " << sigma2 << "; "; | |
1142 std::cout << "Msigma^2: " << sumdist / nsamples << std::endl; | |
1143 std::cout << "d: " << d << std::endl; | |
1144 | |
1145 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99)); | |
1146 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw | |
1147 - (2 / d) * gsl_sf_log(meanN) | |
1148 - gsl_sf_log(d/2) | |
1149 - (2 / d) * gsl_sf_log(2 / d) | |
1150 + (2 / d) * gsl_sf_lngamma(d / 2); | |
1151 | |
1152 std::cout << "track xthresh: " << exp(logxthresh) << std::endl; | |
1153 } | |
1154 | |
1155 | |
1013 // This entry point is visited once per instance | 1156 // This entry point is visited once per instance |
1014 // so it is a good place to set any global state variables | 1157 // so it is a good place to set any global state variables |
1015 int main(const int argc, const char* argv[]){ | 1158 int main(const int argc, const char* argv[]){ |
1016 SERVER_ADB_ROOT = 0; // Server-side database root prefix | 1159 SERVER_ADB_ROOT = 0; // Server-side database root prefix |
1017 SERVER_ADB_FEATURE_ROOT = 0; // Server-side features root prefix | 1160 SERVER_ADB_FEATURE_ROOT = 0; // Server-side features root prefix |