Chris@42
|
1 /*
|
Chris@42
|
2 * Copyright (c) 2003, 2007-14 Matteo Frigo
|
Chris@42
|
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
Chris@42
|
4 *
|
Chris@42
|
5 * This program is free software; you can redistribute it and/or modify
|
Chris@42
|
6 * it under the terms of the GNU General Public License as published by
|
Chris@42
|
7 * the Free Software Foundation; either version 2 of the License, or
|
Chris@42
|
8 * (at your option) any later version.
|
Chris@42
|
9 *
|
Chris@42
|
10 * This program is distributed in the hope that it will be useful,
|
Chris@42
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Chris@42
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Chris@42
|
13 * GNU General Public License for more details.
|
Chris@42
|
14 *
|
Chris@42
|
15 * You should have received a copy of the GNU General Public License
|
Chris@42
|
16 * along with this program; if not, write to the Free Software
|
Chris@42
|
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Chris@42
|
18 *
|
Chris@42
|
19 */
|
Chris@42
|
20
|
Chris@42
|
21
|
Chris@42
|
22 #include "verify.h"
|
Chris@42
|
23
|
Chris@42
|
24 /* copy real A into real B, using output stride of A and input stride of B */
|
Chris@42
|
25 typedef struct {
|
Chris@42
|
26 dotens2_closure k;
|
Chris@42
|
27 R *ra;
|
Chris@42
|
28 R *rb;
|
Chris@42
|
29 } cpyr_closure;
|
Chris@42
|
30
|
Chris@42
|
31 static void cpyr0(dotens2_closure *k_,
|
Chris@42
|
32 int indxa, int ondxa, int indxb, int ondxb)
|
Chris@42
|
33 {
|
Chris@42
|
34 cpyr_closure *k = (cpyr_closure *)k_;
|
Chris@42
|
35 k->rb[indxb] = k->ra[ondxa];
|
Chris@42
|
36 UNUSED(indxa); UNUSED(ondxb);
|
Chris@42
|
37 }
|
Chris@42
|
38
|
Chris@42
|
39 static void cpyr(R *ra, const bench_tensor *sza,
|
Chris@42
|
40 R *rb, const bench_tensor *szb)
|
Chris@42
|
41 {
|
Chris@42
|
42 cpyr_closure k;
|
Chris@42
|
43 k.k.apply = cpyr0;
|
Chris@42
|
44 k.ra = ra; k.rb = rb;
|
Chris@42
|
45 bench_dotens2(sza, szb, &k.k);
|
Chris@42
|
46 }
|
Chris@42
|
47
|
Chris@42
|
48 /* copy unpacked halfcomplex A[n] into packed-complex B[n], using output stride
|
Chris@42
|
49 of A and input stride of B. Only copies non-redundant half; other
|
Chris@42
|
50 half must be copied via mkhermitian. */
|
Chris@42
|
51 typedef struct {
|
Chris@42
|
52 dotens2_closure k;
|
Chris@42
|
53 int n;
|
Chris@42
|
54 int as;
|
Chris@42
|
55 int scalea;
|
Chris@42
|
56 R *ra, *ia;
|
Chris@42
|
57 R *rb, *ib;
|
Chris@42
|
58 } cpyhc2_closure;
|
Chris@42
|
59
|
Chris@42
|
60 static void cpyhc20(dotens2_closure *k_,
|
Chris@42
|
61 int indxa, int ondxa, int indxb, int ondxb)
|
Chris@42
|
62 {
|
Chris@42
|
63 cpyhc2_closure *k = (cpyhc2_closure *)k_;
|
Chris@42
|
64 int i, n = k->n;
|
Chris@42
|
65 int scalea = k->scalea;
|
Chris@42
|
66 int as = k->as * scalea;
|
Chris@42
|
67 R *ra = k->ra + ondxa * scalea, *ia = k->ia + ondxa * scalea;
|
Chris@42
|
68 R *rb = k->rb + indxb, *ib = k->ib + indxb;
|
Chris@42
|
69 UNUSED(indxa); UNUSED(ondxb);
|
Chris@42
|
70
|
Chris@42
|
71 for (i = 0; i < n/2 + 1; ++i) {
|
Chris@42
|
72 rb[2*i] = ra[as*i];
|
Chris@42
|
73 ib[2*i] = ia[as*i];
|
Chris@42
|
74 }
|
Chris@42
|
75 }
|
Chris@42
|
76
|
Chris@42
|
77 static void cpyhc2(R *ra, R *ia,
|
Chris@42
|
78 const bench_tensor *sza, const bench_tensor *vecsza,
|
Chris@42
|
79 int scalea,
|
Chris@42
|
80 R *rb, R *ib, const bench_tensor *szb)
|
Chris@42
|
81 {
|
Chris@42
|
82 cpyhc2_closure k;
|
Chris@42
|
83 BENCH_ASSERT(sza->rnk <= 1);
|
Chris@42
|
84 k.k.apply = cpyhc20;
|
Chris@42
|
85 k.n = tensor_sz(sza);
|
Chris@42
|
86 k.scalea = scalea;
|
Chris@42
|
87 if (!BENCH_FINITE_RNK(sza->rnk) || sza->rnk == 0)
|
Chris@42
|
88 k.as = 0;
|
Chris@42
|
89 else
|
Chris@42
|
90 k.as = sza->dims[0].os;
|
Chris@42
|
91 k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
|
Chris@42
|
92 bench_dotens2(vecsza, szb, &k.k);
|
Chris@42
|
93 }
|
Chris@42
|
94
|
Chris@42
|
95 /* icpyhc2 is the inverse of cpyhc2 */
|
Chris@42
|
96
|
Chris@42
|
97 static void icpyhc20(dotens2_closure *k_,
|
Chris@42
|
98 int indxa, int ondxa, int indxb, int ondxb)
|
Chris@42
|
99 {
|
Chris@42
|
100 cpyhc2_closure *k = (cpyhc2_closure *)k_;
|
Chris@42
|
101 int i, n = k->n;
|
Chris@42
|
102 int scalea = k->scalea;
|
Chris@42
|
103 int as = k->as * scalea;
|
Chris@42
|
104 R *ra = k->ra + indxa * scalea, *ia = k->ia + indxa * scalea;
|
Chris@42
|
105 R *rb = k->rb + ondxb, *ib = k->ib + ondxb;
|
Chris@42
|
106 UNUSED(ondxa); UNUSED(indxb);
|
Chris@42
|
107
|
Chris@42
|
108 for (i = 0; i < n/2 + 1; ++i) {
|
Chris@42
|
109 ra[as*i] = rb[2*i];
|
Chris@42
|
110 ia[as*i] = ib[2*i];
|
Chris@42
|
111 }
|
Chris@42
|
112 }
|
Chris@42
|
113
|
Chris@42
|
114 static void icpyhc2(R *ra, R *ia,
|
Chris@42
|
115 const bench_tensor *sza, const bench_tensor *vecsza,
|
Chris@42
|
116 int scalea,
|
Chris@42
|
117 R *rb, R *ib, const bench_tensor *szb)
|
Chris@42
|
118 {
|
Chris@42
|
119 cpyhc2_closure k;
|
Chris@42
|
120 BENCH_ASSERT(sza->rnk <= 1);
|
Chris@42
|
121 k.k.apply = icpyhc20;
|
Chris@42
|
122 k.n = tensor_sz(sza);
|
Chris@42
|
123 k.scalea = scalea;
|
Chris@42
|
124 if (!BENCH_FINITE_RNK(sza->rnk) || sza->rnk == 0)
|
Chris@42
|
125 k.as = 0;
|
Chris@42
|
126 else
|
Chris@42
|
127 k.as = sza->dims[0].is;
|
Chris@42
|
128 k.ra = ra; k.ia = ia; k.rb = rb; k.ib = ib;
|
Chris@42
|
129 bench_dotens2(vecsza, szb, &k.k);
|
Chris@42
|
130 }
|
Chris@42
|
131
|
Chris@42
|
132 typedef struct {
|
Chris@42
|
133 dofft_closure k;
|
Chris@42
|
134 bench_problem *p;
|
Chris@42
|
135 } dofft_rdft2_closure;
|
Chris@42
|
136
|
Chris@42
|
137 static void rdft2_apply(dofft_closure *k_,
|
Chris@42
|
138 bench_complex *in, bench_complex *out)
|
Chris@42
|
139 {
|
Chris@42
|
140 dofft_rdft2_closure *k = (dofft_rdft2_closure *)k_;
|
Chris@42
|
141 bench_problem *p = k->p;
|
Chris@42
|
142 bench_tensor *totalsz, *pckdsz, *totalsz_swap, *pckdsz_swap;
|
Chris@42
|
143 bench_tensor *probsz2, *totalsz2, *pckdsz2;
|
Chris@42
|
144 bench_tensor *probsz2_swap, *totalsz2_swap, *pckdsz2_swap;
|
Chris@42
|
145 bench_real *ri, *ii, *ro, *io;
|
Chris@42
|
146 int n2, totalscale;
|
Chris@42
|
147
|
Chris@42
|
148 totalsz = tensor_append(p->vecsz, p->sz);
|
Chris@42
|
149 pckdsz = verify_pack(totalsz, 2);
|
Chris@42
|
150 n2 = tensor_sz(totalsz);
|
Chris@42
|
151 if (BENCH_FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0)
|
Chris@42
|
152 n2 = (n2 / p->sz->dims[p->sz->rnk - 1].n) *
|
Chris@42
|
153 (p->sz->dims[p->sz->rnk - 1].n / 2 + 1);
|
Chris@42
|
154 ri = (bench_real *) p->in;
|
Chris@42
|
155 ro = (bench_real *) p->out;
|
Chris@42
|
156
|
Chris@42
|
157 if (BENCH_FINITE_RNK(p->sz->rnk) && p->sz->rnk > 0 && n2 > 0) {
|
Chris@42
|
158 probsz2 = tensor_copy_sub(p->sz, p->sz->rnk - 1, 1);
|
Chris@42
|
159 totalsz2 = tensor_copy_sub(totalsz, 0, totalsz->rnk - 1);
|
Chris@42
|
160 pckdsz2 = tensor_copy_sub(pckdsz, 0, pckdsz->rnk - 1);
|
Chris@42
|
161 }
|
Chris@42
|
162 else {
|
Chris@42
|
163 probsz2 = mktensor(0);
|
Chris@42
|
164 totalsz2 = tensor_copy(totalsz);
|
Chris@42
|
165 pckdsz2 = tensor_copy(pckdsz);
|
Chris@42
|
166 }
|
Chris@42
|
167
|
Chris@42
|
168 totalsz_swap = tensor_copy_swapio(totalsz);
|
Chris@42
|
169 pckdsz_swap = tensor_copy_swapio(pckdsz);
|
Chris@42
|
170 totalsz2_swap = tensor_copy_swapio(totalsz2);
|
Chris@42
|
171 pckdsz2_swap = tensor_copy_swapio(pckdsz2);
|
Chris@42
|
172 probsz2_swap = tensor_copy_swapio(probsz2);
|
Chris@42
|
173
|
Chris@42
|
174 /* confusion: the stride is the distance between complex elements
|
Chris@42
|
175 when using interleaved format, but it is the distance between
|
Chris@42
|
176 real elements when using split format */
|
Chris@42
|
177 if (p->split) {
|
Chris@42
|
178 ii = p->ini ? (bench_real *) p->ini : ri + n2;
|
Chris@42
|
179 io = p->outi ? (bench_real *) p->outi : ro + n2;
|
Chris@42
|
180 totalscale = 1;
|
Chris@42
|
181 } else {
|
Chris@42
|
182 ii = p->ini ? (bench_real *) p->ini : ri + 1;
|
Chris@42
|
183 io = p->outi ? (bench_real *) p->outi : ro + 1;
|
Chris@42
|
184 totalscale = 2;
|
Chris@42
|
185 }
|
Chris@42
|
186
|
Chris@42
|
187 if (p->sign < 0) { /* R2HC */
|
Chris@42
|
188 int N, vN, i;
|
Chris@42
|
189 cpyr(&c_re(in[0]), pckdsz, ri, totalsz);
|
Chris@42
|
190 after_problem_rcopy_from(p, ri);
|
Chris@42
|
191 doit(1, p);
|
Chris@42
|
192 after_problem_hccopy_to(p, ro, io);
|
Chris@42
|
193 if (k->k.recopy_input)
|
Chris@42
|
194 cpyr(ri, totalsz_swap, &c_re(in[0]), pckdsz_swap);
|
Chris@42
|
195 cpyhc2(ro, io, probsz2, totalsz2, totalscale,
|
Chris@42
|
196 &c_re(out[0]), &c_im(out[0]), pckdsz2);
|
Chris@42
|
197 N = tensor_sz(p->sz);
|
Chris@42
|
198 vN = tensor_sz(p->vecsz);
|
Chris@42
|
199 for (i = 0; i < vN; ++i)
|
Chris@42
|
200 mkhermitian(out + i*N, p->sz->rnk, p->sz->dims, 1);
|
Chris@42
|
201 }
|
Chris@42
|
202 else { /* HC2R */
|
Chris@42
|
203 icpyhc2(ri, ii, probsz2, totalsz2, totalscale,
|
Chris@42
|
204 &c_re(in[0]), &c_im(in[0]), pckdsz2);
|
Chris@42
|
205 after_problem_hccopy_from(p, ri, ii);
|
Chris@42
|
206 doit(1, p);
|
Chris@42
|
207 after_problem_rcopy_to(p, ro);
|
Chris@42
|
208 if (k->k.recopy_input)
|
Chris@42
|
209 cpyhc2(ri, ii, probsz2_swap, totalsz2_swap, totalscale,
|
Chris@42
|
210 &c_re(in[0]), &c_im(in[0]), pckdsz2_swap);
|
Chris@42
|
211 mkreal(out, tensor_sz(pckdsz));
|
Chris@42
|
212 cpyr(ro, totalsz, &c_re(out[0]), pckdsz);
|
Chris@42
|
213 }
|
Chris@42
|
214
|
Chris@42
|
215 tensor_destroy(totalsz);
|
Chris@42
|
216 tensor_destroy(pckdsz);
|
Chris@42
|
217 tensor_destroy(totalsz_swap);
|
Chris@42
|
218 tensor_destroy(pckdsz_swap);
|
Chris@42
|
219 tensor_destroy(probsz2);
|
Chris@42
|
220 tensor_destroy(totalsz2);
|
Chris@42
|
221 tensor_destroy(pckdsz2);
|
Chris@42
|
222 tensor_destroy(probsz2_swap);
|
Chris@42
|
223 tensor_destroy(totalsz2_swap);
|
Chris@42
|
224 tensor_destroy(pckdsz2_swap);
|
Chris@42
|
225 }
|
Chris@42
|
226
|
Chris@42
|
227 void verify_rdft2(bench_problem *p, int rounds, double tol, errors *e)
|
Chris@42
|
228 {
|
Chris@42
|
229 C *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
|
Chris@42
|
230 int n, vecn, N;
|
Chris@42
|
231 dofft_rdft2_closure k;
|
Chris@42
|
232
|
Chris@42
|
233 BENCH_ASSERT(p->kind == PROBLEM_REAL);
|
Chris@42
|
234
|
Chris@42
|
235 if (!BENCH_FINITE_RNK(p->sz->rnk) || !BENCH_FINITE_RNK(p->vecsz->rnk))
|
Chris@42
|
236 return; /* give up */
|
Chris@42
|
237
|
Chris@42
|
238 k.k.apply = rdft2_apply;
|
Chris@42
|
239 k.k.recopy_input = 0;
|
Chris@42
|
240 k.p = p;
|
Chris@42
|
241
|
Chris@42
|
242 if (rounds == 0)
|
Chris@42
|
243 rounds = 20; /* default value */
|
Chris@42
|
244
|
Chris@42
|
245 n = tensor_sz(p->sz);
|
Chris@42
|
246 vecn = tensor_sz(p->vecsz);
|
Chris@42
|
247 N = n * vecn;
|
Chris@42
|
248
|
Chris@42
|
249 inA = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
250 inB = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
251 inC = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
252 outA = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
253 outB = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
254 outC = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
255 tmp = (C *) bench_malloc(N * sizeof(C));
|
Chris@42
|
256
|
Chris@42
|
257 e->i = impulse(&k.k, n, vecn, inA, inB, inC, outA, outB, outC,
|
Chris@42
|
258 tmp, rounds, tol);
|
Chris@42
|
259 e->l = linear(&k.k, 1, N, inA, inB, inC, outA, outB, outC,
|
Chris@42
|
260 tmp, rounds, tol);
|
Chris@42
|
261
|
Chris@42
|
262 e->s = 0.0;
|
Chris@42
|
263 if (p->sign < 0)
|
Chris@42
|
264 e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
|
Chris@42
|
265 inA, inB, outA, outB,
|
Chris@42
|
266 tmp, rounds, tol, TIME_SHIFT));
|
Chris@42
|
267 else
|
Chris@42
|
268 e->s = dmax(e->s, tf_shift(&k.k, 1, p->sz, n, vecn, p->sign,
|
Chris@42
|
269 inA, inB, outA, outB,
|
Chris@42
|
270 tmp, rounds, tol, FREQ_SHIFT));
|
Chris@42
|
271
|
Chris@42
|
272 if (!p->in_place && !p->destroy_input)
|
Chris@42
|
273 preserves_input(&k.k, p->sign < 0 ? mkreal : mkhermitian1,
|
Chris@42
|
274 N, inA, inB, outB, rounds);
|
Chris@42
|
275
|
Chris@42
|
276 bench_free(tmp);
|
Chris@42
|
277 bench_free(outC);
|
Chris@42
|
278 bench_free(outB);
|
Chris@42
|
279 bench_free(outA);
|
Chris@42
|
280 bench_free(inC);
|
Chris@42
|
281 bench_free(inB);
|
Chris@42
|
282 bench_free(inA);
|
Chris@42
|
283 }
|
Chris@42
|
284
|
Chris@42
|
285 void accuracy_rdft2(bench_problem *p, int rounds, int impulse_rounds,
|
Chris@42
|
286 double t[6])
|
Chris@42
|
287 {
|
Chris@42
|
288 dofft_rdft2_closure k;
|
Chris@42
|
289 int n;
|
Chris@42
|
290 C *a, *b;
|
Chris@42
|
291
|
Chris@42
|
292 BENCH_ASSERT(p->kind == PROBLEM_REAL);
|
Chris@42
|
293 BENCH_ASSERT(p->sz->rnk == 1);
|
Chris@42
|
294 BENCH_ASSERT(p->vecsz->rnk == 0);
|
Chris@42
|
295
|
Chris@42
|
296 k.k.apply = rdft2_apply;
|
Chris@42
|
297 k.k.recopy_input = 0;
|
Chris@42
|
298 k.p = p;
|
Chris@42
|
299 n = tensor_sz(p->sz);
|
Chris@42
|
300
|
Chris@42
|
301 a = (C *) bench_malloc(n * sizeof(C));
|
Chris@42
|
302 b = (C *) bench_malloc(n * sizeof(C));
|
Chris@42
|
303 accuracy_test(&k.k, p->sign < 0 ? mkreal : mkhermitian1, p->sign,
|
Chris@42
|
304 n, a, b, rounds, impulse_rounds, t);
|
Chris@42
|
305 bench_free(b);
|
Chris@42
|
306 bench_free(a);
|
Chris@42
|
307 }
|