comparison sample.cpp @ 270:9636040ff503 sampling

Parametrize nsamples (though not on the command-line yet) Compute the mean number of possible sequences in tracks, and display that. Compute the track xthreshold as in TASLP paper. (Not quite getting the same answers, though).
author mas01cr
date Mon, 16 Jun 2008 11:59:43 +0000
parents dcbb30790b30
children 1f2c7d5e581c
comparison
equal deleted inserted replaced
269:dcbb30790b30 270:9636040ff503
61 cumTrack += trackTable[k] * dbH->dim; 61 cumTrack += trackTable[k] * dbH->dim;
62 } 62 }
63 63
64 unsigned *propTable = new unsigned[dbH->numFiles]; 64 unsigned *propTable = new unsigned[dbH->numFiles];
65 unsigned total = 0; 65 unsigned total = 0;
66 unsigned count = 0;
66 67
67 for (unsigned int i = 0; i < dbH->numFiles; i++) { 68 for (unsigned int i = 0; i < dbH->numFiles; i++) {
68 /* what kind of a stupid language doesn't have binary max(), let 69 /* what kind of a stupid language doesn't have binary max(), let
69 alone nary? */ 70 alone nary? */
70 unsigned int prop = trackTable[i] - sequenceLength + 1; 71 unsigned int prop = trackTable[i] - sequenceLength + 1;
71 prop = prop > 0 ? prop : 0; 72 prop = prop > 0 ? prop : 0;
73 if (prop > 0)
74 count++;
72 propTable[i] = prop; 75 propTable[i] = prop;
73 total += prop; 76 total += prop;
74 } 77 }
75 78
76 if (total == 0) { 79 if (total == 0) {
83 double v1norm, v2norm, v1v2; 86 double v1norm, v2norm, v1v2;
84 87
85 double sumdist = 0; 88 double sumdist = 0;
86 double sumlogdist = 0; 89 double sumlogdist = 0;
87 90
88 /* 1037 samples for now */ 91 unsigned int nsamples = 2049;
89 for (unsigned int i = 0; i < 1037;) { 92
93 for (unsigned int i = 0; i < nsamples;) {
90 /* FIXME: in Real Life we'll want to initialize the RNG using 94 /* FIXME: in Real Life we'll want to initialize the RNG using
91 /dev/random or the current time or something. */ 95 /dev/random or the current time or something. */
92 unsigned track1 = random_track(propTable, total); 96 unsigned track1 = random_track(propTable, total);
93 unsigned track2 = random_track(propTable, total); 97 unsigned track2 = random_track(propTable, total);
94 98
135 } else { 139 } else {
136 VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2); 140 VERB_LOG(1, "infinity found: %f %f %f\n", v1norm, v2norm, v1v2);
137 } 141 }
138 } 142 }
139 143
140 double sigma2 = (sumdist / (sequenceLength * dbH->dim * 1037)); 144 /* FIXME: the mean isn't really what we should be reporting here */
141 double d = 2 * yinv(log(sumdist/1037) - sumlogdist/1037); 145 unsigned meanN = total / count;
146
147 double sigma2 = sumdist / (sequenceLength * dbH->dim * nsamples);
148 double d = 2 * yinv(log(sumdist/nsamples) - sumlogdist/nsamples);
142 149
143 std::cout << "Summary statistics" << std::endl; 150 std::cout << "Summary statistics" << std::endl;
144 std::cout << "number of samples: " << 1037 << std::endl; 151 std::cout << "number of samples: " << nsamples << std::endl;
145 std::cout << "sum of distances (S): " << sumdist << std::endl; 152 std::cout << "sum of distances (S): " << sumdist << std::endl;
146 std::cout << "sum of log distances (L): " << sumlogdist << std::endl; 153 std::cout << "sum of log distances (L): " << sumlogdist << std::endl;
154 std::cout << "mean number of applicable sequences (N): " << meanN << std::endl;
147 std::cout << std::endl; 155 std::cout << std::endl;
148 std::cout << "Estimated parameters" << std::endl; 156 std::cout << "Estimated parameters" << std::endl;
149 std::cout << "sigma^2: " << sigma2 << std::endl; 157 std::cout << "sigma^2: " << sigma2 << std::endl;
150 std::cout << "d: " << d << std::endl; 158 std::cout << "d: " << d << std::endl;
151 std::cout << "check: " << yfun(d/2) << std::endl; 159
160 double logw = (2 / d) * gsl_sf_log(-gsl_sf_log(0.99));
161 double logxthresh = gsl_sf_log(sumdist / nsamples) + logw
162 - (2 / d) * gsl_sf_log(meanN)
163 - gsl_sf_log(d/2)
164 - (2 / d) * gsl_sf_log(2 / d)
165 + (2 / d) * gsl_sf_lngamma(d / 2);
166
167 std::cout << "track xthresh: " << exp(logxthresh) << std::endl;
152 168
153 /* FIXME: we'll also want some summary statistics based on 169 /* FIXME: we'll also want some summary statistics based on
154 propTable, for the minimum-of-X estimate */ 170 propTable, for the minimum-of-X estimate */
155 171
156 delete[] propTable; 172 delete[] propTable;