annotate src/fftw-3.3.5/libbench2/verify-rdft2.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI compatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21
Chris@42 22 #include "verify.h"
Chris@42 23
Chris@42 24 /* copy real A into real B, using output stride of A and input stride of B */
Chris@42 25 typedef struct {
Chris@42 26 dotens2_closure k;
Chris@42 27 R *ra;
Chris@42 28 R *rb;
Chris@42 29 } cpyr_closure;
Chris@42 30
Chris@42 31 static void cpyr0(dotens2_closure *k_,
Chris@42 32 int indxa, int ondxa, int indxb, int ondxb)
Chris@42 33 {
Chris@42 34 cpyr_closure *k = (cpyr_closure *)k_;
Chris@42 35 k->rb[indxb] = k->ra[ondxa];
Chris@42 36 UNUSED(indxa); UNUSED(ondxb);
Chris@42 37 }
Chris@42 38
Chris@42 39 static void cpyr(R *ra, const bench_tensor *sza,
Chris@42 40 R *rb, const bench_tensor *szb)
Chris@42 41 {
Chris@42 42 cpyr_closure k;
Chris@42 43 k.k.apply = cpyr0;
Chris@42 44 k.ra = ra; k.rb = rb;
Chris@42 45 bench_dotens2(sza, szb, &k.k);
Chris@42 46 }
Chris@42 47
Chris@42 48 /* copy unpacked halfcomplex A[n] into packed-complex B[n], using output stride
Chris@42 49 of A and input stride of B. Only copies non-redundant half; other
Chris@42 50 half must be copied via mkhermitian. */
Chris@42 51 typedef struct {
Chris@42 52 dotens2_closure k;
Chris@42 53 int n;
Chris@42 54 int as;
Chris@42 55 int scalea;
Chris@42 56 R *ra, *ia;
Chris@42 57 R *rb, *ib;
Chris@42 58 } cpyhc2_closure;
Chris@42 59
Chris@42 60 static void cpyhc20(dotens2_closure *k_,
Chris@42 61 int indxa, int ondxa, int indxb, int ondxb)
Chris@42 62 {
Chris@42 63 cpyhc2_closure *k = (cpyhc2_closure *)k_;
Chris@42 64 int i, n = k->n;
Chris@42 65 int scalea = k->scalea;
Chris@42 66 int as = k->as * scalea;
Chris@42 67 R *ra = k->ra + ondxa * scalea, *ia = k->ia + ondxa * scalea;
Chris@42 68 R *rb = k->rb + indxb, *ib = k->ib + indxb;
Chris@42 69 UNUSED(indxa); UNUSED(ondxb);
Chris@42 70
Chris@42 71 for (i = 0; i < n/2 + 1; ++i) {
Chris@42 72 rb[2*i] = ra[as*i];
Chris@42 73 ib[2*i] = ia[as*i];
Chris@42 74 }
Chris@42 75 }
Chris@42 76
Chris@42 77 static void cpyhc2(R *ra, R *ia,
Chris@42 78 const bench_tensor *sza, const bench_tensor *vecsza,
Chris@42 79 int scalea,
Chris@42 80 R *rb, R *ib, const bench_tensor *szb)
Chris@42 81 {
Chris@42 82 cpyhc2_closure k;
Chris@42 83 BENCH_ASSERT(sza->rnk <= 1);
Chris@42 84 k.k.apply = cpyhc20;
Chris@42 85 k.n = tensor_sz(sza);
Chris@42 86 k.scalea = scalea;
Chris@42 87 if (!BENCH_FINITE_RNK(sza->rnk) || sza->rnk == 0)
Chris@42 88 k.as = 0;
Chris@42 89 else
Chris@42 90 k.as = sza->dims[0].os;
Chris@42 91 k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
Chris@42 92 bench_dotens2(vecsza, szb, &k.k);
Chris@42 93 }
Chris@42 94
Chris@42 95 /* icpyhc2 is the inverse of cpyhc2 */
Chris@42 96
Chris@42 97 static void icpyhc20(dotens2_closure *k_,
Chris@42 98 int indxa, int ondxa, int indxb, int ondxb)
Chris@42 99 {
Chris@42 100 cpyhc2_closure *k = (cpyhc2_closure *)k_;
Chris@42 101 int i, n = k->n;
Chris@42 102 int scalea = k->scalea;
Chris@42 103 int as = k->as * scalea;
Chris@42 104 R *ra = k->ra + indxa * scalea, *ia = k->ia + indxa * scalea;
Chris@42 105 R *rb = k->rb + ondxb, *ib = k->ib + ondxb;
Chris@42 106 UNUSED(ondxa); UNUSED(indxb);
Chris@42 107
Chris@42 108 for (i = 0; i < n/2 + 1; ++i) {
Chris@42 109 ra[as*i] = rb[2*i];
Chris@42 110 ia[as*i] = ib[2*i];
Chris@42 111 }
Chris@42 112 }
Chris@42 113
Chris@42 114 static void icpyhc2(R *ra, R *ia,
Chris@42 115 const bench_tensor *sza, const bench_tensor *vecsza,
Chris@42 116 int scalea,
Chris@42 117 R *rb, R *ib, const bench_tensor *szb)
Chris@42 118 {
Chris@42 119 cpyhc2_closure k;
Chris@42 120 BENCH_ASSERT(sza->rnk <= 1);
Chris@42 121 k.k.apply = icpyhc20;
Chris@42 122 k.n = tensor_sz(sza);
Chris@42 123 k.scalea = scalea;
Chris@42 124 if (!BENCH_FINITE_RNK(sza->rnk) || sza->rnk == 0)
Chris@42 125 k.as = 0;
Chris@42 126 else
Chris@42 127 k.as = sza->dims[0].is;
Chris@42 128 k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
Chris@42 129 bench_dotens2(vecsza, szb, &k.k);
Chris@42 130 }
Chris@42 131
Chris@42 132 typedef struct {
Chris@42 133 dofft_closure k;
Chris@42 134 bench_problem *p;
Chris@42 135 } dofft_rdft2_closure;
Chris@42 136
Chris@42 137 static void rdft2_apply(dofft_closure *k_,
Chris@42 138 bench_complex *in, bench_complex *out)
Chris@42 139 {
Chris@42 140 dofft_rdft2_closure *k = (dofft_rdft2_closure *)k_;
Chris@42 141 bench_problem *p = k->p;
Chris@42 142 bench_tensor *totalsz, *pckdsz, *totalsz_swap, *pckdsz_swap;
Chris@42 143 bench_tensor *probsz2, *totalsz2, *pckdsz2;
Chris@42 144 bench_tensor *probsz2_swap, *totalsz2_swap, *pckdsz2_swap;
Chris@42 145 bench_real *ri, *ii, *ro, *io;
Chris@42 146 int n2, totalscale;
Chris@42 147
Chris@42 148 totalsz = tensor_append(p->vecsz, p->sz);
Chris@42 149 pckdsz = verify_pack(totalsz, 2);
Chris@42 150 n2 = tensor_sz(totalsz);
Chris@42 151 if (BENCH_FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0)
Chris@42 152 n2 = (n2 / p->sz->dims[p->sz->rnk - 1].n) *
Chris@42 153 (p->sz->dims[p->sz->rnk - 1].n / 2 + 1);
Chris@42 154 ri = (bench_real *) p->in;
Chris@42 155 ro = (bench_real *) p->out;
Chris@42 156
Chris@42 157 if (BENCH_FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0 && n2 > 0) {
Chris@42 158 probsz2 = tensor_copy_sub(p->sz, p->sz->rnk - 1, 1);
Chris@42 159 totalsz2 = tensor_copy_sub(totalsz, 0, totalsz->rnk - 1);
Chris@42 160 pckdsz2 = tensor_copy_sub(pckdsz, 0, pckdsz->rnk - 1);
Chris@42 161 }
Chris@42 162 else {
Chris@42 163 probsz2 = mktensor(0);
Chris@42 164 totalsz2 = tensor_copy(totalsz);
Chris@42 165 pckdsz2 = tensor_copy(pckdsz);
Chris@42 166 }
Chris@42 167
Chris@42 168 totalsz_swap = tensor_copy_swapio(totalsz);
Chris@42 169 pckdsz_swap = tensor_copy_swapio(pckdsz);
Chris@42 170 totalsz2_swap = tensor_copy_swapio(totalsz2);
Chris@42 171 pckdsz2_swap = tensor_copy_swapio(pckdsz2);
Chris@42 172 probsz2_swap = tensor_copy_swapio(probsz2);
Chris@42 173
Chris@42 174 /* confusion: the stride is the distance between complex elements
Chris@42 175 when using interleaved format, but it is the distance between
Chris@42 176 real elements when using split format */
Chris@42 177 if (p->split) {
Chris@42 178 ii = p->ini ? (bench_real *) p->ini : ri + n2;
Chris@42 179 io = p->outi ? (bench_real *) p->outi : ro + n2;
Chris@42 180 totalscale = 1;
Chris@42 181 } else {
Chris@42 182 ii = p->ini ? (bench_real *) p->ini : ri + 1;
Chris@42 183 io = p->outi ? (bench_real *) p->outi : ro + 1;
Chris@42 184 totalscale = 2;
Chris@42 185 }
Chris@42 186
Chris@42 187 if (p->sign < 0) { /* R2HC */
Chris@42 188 int N, vN, i;
Chris@42 189 cpyr(&c_re(in[0]), pckdsz, ri, totalsz);
Chris@42 190 after_problem_rcopy_from(p, ri);
Chris@42 191 doit(1, p);
Chris@42 192 after_problem_hccopy_to(p, ro, io);
Chris@42 193 if (k->k.recopy_input)
Chris@42 194 cpyr(ri, totalsz_swap, &c_re(in[0]), pckdsz_swap);
Chris@42 195 cpyhc2(ro, io, probsz2, totalsz2, totalscale,
Chris@42 196 &c_re(out[0]), &c_im(out[0]), pckdsz2);
Chris@42 197 N = tensor_sz(p->sz);
Chris@42 198 vN = tensor_sz(p->vecsz);
Chris@42 199 for (i = 0; i < vN; ++i)
Chris@42 200 mkhermitian(out + i*N, p->sz->rnk, p->sz->dims, 1);
Chris@42 201 }
Chris@42 202 else { /* HC2R */
Chris@42 203 icpyhc2(ri, ii, probsz2, totalsz2, totalscale,
Chris@42 204 &c_re(in[0]), &c_im(in[0]), pckdsz2);
Chris@42 205 after_problem_hccopy_from(p, ri, ii);
Chris@42 206 doit(1, p);
Chris@42 207 after_problem_rcopy_to(p, ro);
Chris@42 208 if (k->k.recopy_input)
Chris@42 209 cpyhc2(ri, ii, probsz2_swap, totalsz2_swap, totalscale,
Chris@42 210 &c_re(in[0]), &c_im(in[0]), pckdsz2_swap);
Chris@42 211 mkreal(out, tensor_sz(pckdsz));
Chris@42 212 cpyr(ro, totalsz, &c_re(out[0]), pckdsz);
Chris@42 213 }
Chris@42 214
Chris@42 215 tensor_destroy(totalsz);
Chris@42 216 tensor_destroy(pckdsz);
Chris@42 217 tensor_destroy(totalsz_swap);
Chris@42 218 tensor_destroy(pckdsz_swap);
Chris@42 219 tensor_destroy(probsz2);
Chris@42 220 tensor_destroy(totalsz2);
Chris@42 221 tensor_destroy(pckdsz2);
Chris@42 222 tensor_destroy(probsz2_swap);
Chris@42 223 tensor_destroy(totalsz2_swap);
Chris@42 224 tensor_destroy(pckdsz2_swap);
Chris@42 225 }
Chris@42 226
Chris@42 227 void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e)
Chris@42 228 {
Chris@42 229 C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
Chris@42 230 int n, vecn, N;
Chris@42 231 dofft_rdft2_closure k;
Chris@42 232
Chris@42 233 BENCH_ASSERT(p->kind == PROBLEM_REAL);
Chris@42 234
Chris@42 235 if (!BENCH_FINITE_RNK(p->sz->rnk) || !BENCH_FINITE_RNK(p->vecsz->rnk))
Chris@42 236 return; /* give up */
Chris@42 237
Chris@42 238 k.k.apply = rdft2_apply;
Chris@42 239 k.k.recopy_input = 0;
Chris@42 240 k.p = p;
Chris@42 241
Chris@42 242 if (rounds == 0)
Chris@42 243 rounds = 20; /* default value */
Chris@42 244
Chris@42 245 n = tensor_sz(p->sz);
Chris@42 246 vecn = tensor_sz(p->vecsz);
Chris@42 247 N = n * vecn;
Chris@42 248
Chris@42 249 inA = (C *) bench_malloc(N * sizeof(C));
Chris@42 250 inB = (C *) bench_malloc(N * sizeof(C));
Chris@42 251 inC = (C *) bench_malloc(N * sizeof(C));
Chris@42 252 outA = (C *) bench_malloc(N * sizeof(C));
Chris@42 253 outB = (C *) bench_malloc(N * sizeof(C));
Chris@42 254 outC = (C *) bench_malloc(N * sizeof(C));
Chris@42 255 tmp = (C *) bench_malloc(N * sizeof(C));
Chris@42 256
Chris@42 257 e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC,
Chris@42 258 tmp, rounds, tol);
Chris@42 259 e->l = linear(&k.k, 1, N, inA, inB, inC, outA, outB, outC,
Chris@42 260 tmp, rounds, tol);
Chris@42 261
Chris@42 262 e->s = 0.0;
Chris@42 263 if (p->sign < 0)
Chris@42 264 e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
Chris@42 265 inA, inB, outA, outB,
Chris@42 266 tmp, rounds, tol, TIME_SHIFT));
Chris@42 267 else
Chris@42 268 e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
Chris@42 269 inA, inB, outA, outB,
Chris@42 270 tmp, rounds, tol, FREQ_SHIFT));
Chris@42 271
Chris@42 272 if (!p->in_place && !p->destroy_input)
Chris@42 273 preserves_input(&k.k, p->sign < 0 ? mkreal : mkhermitian1,
Chris@42 274 N, inA, inB, outB, rounds);
Chris@42 275
Chris@42 276 bench_free(tmp);
Chris@42 277 bench_free(outC);
Chris@42 278 bench_free(outB);
Chris@42 279 bench_free(outA);
Chris@42 280 bench_free(inC);
Chris@42 281 bench_free(inB);
Chris@42 282 bench_free(inA);
Chris@42 283 }
Chris@42 284
Chris@42 285 void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds,
Chris@42 286 double t[6])
Chris@42 287 {
Chris@42 288 dofft_rdft2_closure k;
Chris@42 289 int n;
Chris@42 290 C *a, *b;
Chris@42 291
Chris@42 292 BENCH_ASSERT(p->kind == PROBLEM_REAL);
Chris@42 293 BENCH_ASSERT(p->sz->rnk == 1);
Chris@42 294 BENCH_ASSERT(p->vecsz->rnk == 0);
Chris@42 295
Chris@42 296 k.k.apply = rdft2_apply;
Chris@42 297 k.k.recopy_input = 0;
Chris@42 298 k.p = p;
Chris@42 299 n = tensor_sz(p->sz);
Chris@42 300
Chris@42 301 a = (C *) bench_malloc(n * sizeof(C));
Chris@42 302 b = (C *) bench_malloc(n * sizeof(C));
Chris@42 303 accuracy_test(&k.k, p->sign < 0 ? mkreal : mkhermitian1, p->sign,
Chris@42 304 n, a, b, rounds, impulse_rounds, t);
Chris@42 305 bench_free(b);
Chris@42 306 bench_free(a);
Chris@42 307 }