Mercurial > hg > sv-dependency-builds
comparison src/fftw-3.3.5/rdft/rank0.c @ 127:7867fa7e1b6b
Current fftw source
author | Chris Cannam <cannam@all-day-breakfast.com> |
---|---|
date | Tue, 18 Oct 2016 13:40:26 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
126:4a7071416412 | 127:7867fa7e1b6b |
---|---|
1 /* | |
2 * Copyright (c) 2003, 2007-14 Matteo Frigo | |
3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
18 * | |
19 */ | |
20 | |
21 | |
22 /* plans for rank-0 RDFTs (copy operations) */ | |
23 | |
24 #include "rdft.h" | |
25 | |
26 #ifdef HAVE_STRING_H | |
27 #include <string.h> /* for memcpy() */ | |
28 #endif | |
29 | |
30 #define MAXRNK 32 /* FIXME: should malloc() */ | |
31 | |
32 typedef struct { | |
33 plan_rdft super; | |
34 INT vl; | |
35 int rnk; | |
36 iodim d[MAXRNK]; | |
37 const char *nam; | |
38 } P; | |
39 | |
40 typedef struct { | |
41 solver super; | |
42 rdftapply apply; | |
43 int (*applicable)(const P *pln, const problem_rdft *p); | |
44 const char *nam; | |
45 } S; | |
46 | |
47 /* copy up to MAXRNK dimensions from problem into plan. If a | |
48 contiguous dimension exists, save its length in pln->vl */ | |
49 static int fill_iodim(P *pln, const problem_rdft *p) | |
50 { | |
51 int i; | |
52 const tensor *vecsz = p->vecsz; | |
53 | |
54 pln->vl = 1; | |
55 pln->rnk = 0; | |
56 for (i = 0; i < vecsz->rnk; ++i) { | |
57 /* extract contiguous dimensions */ | |
58 if (pln->vl == 1 && | |
59 vecsz->dims[i].is == 1 && vecsz->dims[i].os == 1) | |
60 pln->vl = vecsz->dims[i].n; | |
61 else if (pln->rnk == MAXRNK) | |
62 return 0; | |
63 else | |
64 pln->d[pln->rnk++] = vecsz->dims[i]; | |
65 } | |
66 | |
67 return 1; | |
68 } | |
69 | |
70 /* generic higher-rank copy routine, calls cpy2d() to do the real work */ | |
71 static void copy(const iodim *d, int rnk, INT vl, | |
72 R *I, R *O, | |
73 cpy2d_func cpy2d) | |
74 { | |
75 A(rnk >= 2); | |
76 if (rnk == 2) | |
77 cpy2d(I, O, d[0].n, d[0].is, d[0].os, d[1].n, d[1].is, d[1].os, vl); | |
78 else { | |
79 INT i; | |
80 for (i = 0; i < d[0].n; ++i, I += d[0].is, O += d[0].os) | |
81 copy(d + 1, rnk - 1, vl, I, O, cpy2d); | |
82 } | |
83 } | |
84 | |
85 /* FIXME: should be more general */ | |
86 static int transposep(const P *pln) | |
87 { | |
88 int i; | |
89 | |
90 for (i = 0; i < pln->rnk - 2; ++i) | |
91 if (pln->d[i].is != pln->d[i].os) | |
92 return 0; | |
93 | |
94 return (pln->d[i].n == pln->d[i+1].n && | |
95 pln->d[i].is == pln->d[i+1].os && | |
96 pln->d[i].os == pln->d[i+1].is); | |
97 } | |
98 | |
99 /* generic higher-rank transpose routine, calls transpose2d() to do | |
100 * the real work */ | |
101 static void transpose(const iodim *d, int rnk, INT vl, | |
102 R *I, | |
103 transpose_func transpose2d) | |
104 { | |
105 A(rnk >= 2); | |
106 if (rnk == 2) | |
107 transpose2d(I, d[0].n, d[0].is, d[0].os, vl); | |
108 else { | |
109 INT i; | |
110 for (i = 0; i < d[0].n; ++i, I += d[0].is) | |
111 transpose(d + 1, rnk - 1, vl, I, transpose2d); | |
112 } | |
113 } | |
114 | |
115 /**************************************************************/ | |
116 /* rank 0,1,2, out of place, iterative */ | |
117 static void apply_iter(const plan *ego_, R *I, R *O) | |
118 { | |
119 const P *ego = (const P *) ego_; | |
120 | |
121 switch (ego->rnk) { | |
122 case 0: | |
123 X(cpy1d)(I, O, ego->vl, 1, 1, 1); | |
124 break; | |
125 case 1: | |
126 X(cpy1d)(I, O, | |
127 ego->d[0].n, ego->d[0].is, ego->d[0].os, | |
128 ego->vl); | |
129 break; | |
130 default: | |
131 copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_ci)); | |
132 break; | |
133 } | |
134 } | |
135 | |
136 static int applicable_iter(const P *pln, const problem_rdft *p) | |
137 { | |
138 UNUSED(pln); | |
139 return (p->I != p->O); | |
140 } | |
141 | |
142 /**************************************************************/ | |
143 /* out of place, write contiguous output */ | |
144 static void apply_cpy2dco(const plan *ego_, R *I, R *O) | |
145 { | |
146 const P *ego = (const P *) ego_; | |
147 copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_co)); | |
148 } | |
149 | |
150 static int applicable_cpy2dco(const P *pln, const problem_rdft *p) | |
151 { | |
152 int rnk = pln->rnk; | |
153 return (1 | |
154 && p->I != p->O | |
155 && rnk >= 2 | |
156 | |
157 /* must not duplicate apply_iter */ | |
158 && (X(iabs)(pln->d[rnk - 2].is) <= X(iabs)(pln->d[rnk - 1].is) | |
159 || | |
160 X(iabs)(pln->d[rnk - 2].os) <= X(iabs)(pln->d[rnk - 1].os)) | |
161 ); | |
162 } | |
163 | |
164 /**************************************************************/ | |
165 /* out of place, tiled, no buffering */ | |
166 static void apply_tiled(const plan *ego_, R *I, R *O) | |
167 { | |
168 const P *ego = (const P *) ego_; | |
169 copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiled)); | |
170 } | |
171 | |
172 static int applicable_tiled(const P *pln, const problem_rdft *p) | |
173 { | |
174 return (1 | |
175 && p->I != p->O | |
176 && pln->rnk >= 2 | |
177 | |
178 /* somewhat arbitrary */ | |
179 && X(compute_tilesz)(pln->vl, 1) > 4 | |
180 ); | |
181 } | |
182 | |
183 /**************************************************************/ | |
184 /* out of place, tiled, with buffer */ | |
185 static void apply_tiledbuf(const plan *ego_, R *I, R *O) | |
186 { | |
187 const P *ego = (const P *) ego_; | |
188 copy(ego->d, ego->rnk, ego->vl, I, O, X(cpy2d_tiledbuf)); | |
189 } | |
190 | |
191 #define applicable_tiledbuf applicable_tiled | |
192 | |
193 /**************************************************************/ | |
194 /* rank 0, out of place, using memcpy */ | |
195 static void apply_memcpy(const plan *ego_, R *I, R *O) | |
196 { | |
197 const P *ego = (const P *) ego_; | |
198 | |
199 A(ego->rnk == 0); | |
200 memcpy(O, I, ego->vl * sizeof(R)); | |
201 } | |
202 | |
203 static int applicable_memcpy(const P *pln, const problem_rdft *p) | |
204 { | |
205 return (1 | |
206 && p->I != p->O | |
207 && pln->rnk == 0 | |
208 && pln->vl > 2 /* do not bother memcpy-ing complex numbers */ | |
209 ); | |
210 } | |
211 | |
212 /**************************************************************/ | |
213 /* rank > 0 vecloop, out of place, using memcpy (e.g. out-of-place | |
214 transposes of vl-tuples ... for large vl it should be more | |
215 efficient to use memcpy than the tiled stuff). */ | |
216 | |
217 static void memcpy_loop(size_t cpysz, int rnk, const iodim *d, R *I, R *O) | |
218 { | |
219 INT i, n = d->n, is = d->is, os = d->os; | |
220 if (rnk == 1) | |
221 for (i = 0; i < n; ++i, I += is, O += os) | |
222 memcpy(O, I, cpysz); | |
223 else { | |
224 --rnk; ++d; | |
225 for (i = 0; i < n; ++i, I += is, O += os) | |
226 memcpy_loop(cpysz, rnk, d, I, O); | |
227 } | |
228 } | |
229 | |
230 static void apply_memcpy_loop(const plan *ego_, R *I, R *O) | |
231 { | |
232 const P *ego = (const P *) ego_; | |
233 memcpy_loop(ego->vl * sizeof(R), ego->rnk, ego->d, I, O); | |
234 } | |
235 | |
236 static int applicable_memcpy_loop(const P *pln, const problem_rdft *p) | |
237 { | |
238 return (p->I != p->O | |
239 && pln->rnk > 0 | |
240 && pln->vl > 2 /* do not bother memcpy-ing complex numbers */); | |
241 } | |
242 | |
243 /**************************************************************/ | |
244 /* rank 2, in place, square transpose, iterative */ | |
245 static void apply_ip_sq(const plan *ego_, R *I, R *O) | |
246 { | |
247 const P *ego = (const P *) ego_; | |
248 UNUSED(O); | |
249 transpose(ego->d, ego->rnk, ego->vl, I, X(transpose)); | |
250 } | |
251 | |
252 | |
253 static int applicable_ip_sq(const P *pln, const problem_rdft *p) | |
254 { | |
255 return (1 | |
256 && p->I == p->O | |
257 && pln->rnk >= 2 | |
258 && transposep(pln)); | |
259 } | |
260 | |
261 /**************************************************************/ | |
262 /* rank 2, in place, square transpose, tiled */ | |
263 static void apply_ip_sq_tiled(const plan *ego_, R *I, R *O) | |
264 { | |
265 const P *ego = (const P *) ego_; | |
266 UNUSED(O); | |
267 transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiled)); | |
268 } | |
269 | |
270 static int applicable_ip_sq_tiled(const P *pln, const problem_rdft *p) | |
271 { | |
272 return (1 | |
273 && applicable_ip_sq(pln, p) | |
274 | |
275 /* somewhat arbitrary */ | |
276 && X(compute_tilesz)(pln->vl, 2) > 4 | |
277 ); | |
278 } | |
279 | |
280 /**************************************************************/ | |
281 /* rank 2, in place, square transpose, tiled, buffered */ | |
282 static void apply_ip_sq_tiledbuf(const plan *ego_, R *I, R *O) | |
283 { | |
284 const P *ego = (const P *) ego_; | |
285 UNUSED(O); | |
286 transpose(ego->d, ego->rnk, ego->vl, I, X(transpose_tiledbuf)); | |
287 } | |
288 | |
289 #define applicable_ip_sq_tiledbuf applicable_ip_sq_tiled | |
290 | |
291 /**************************************************************/ | |
292 static int applicable(const S *ego, const problem *p_) | |
293 { | |
294 const problem_rdft *p = (const problem_rdft *) p_; | |
295 P pln; | |
296 return (1 | |
297 && p->sz->rnk == 0 | |
298 && FINITE_RNK(p->vecsz->rnk) | |
299 && fill_iodim(&pln, p) | |
300 && ego->applicable(&pln, p) | |
301 ); | |
302 } | |
303 | |
304 static void print(const plan *ego_, printer *p) | |
305 { | |
306 const P *ego = (const P *) ego_; | |
307 int i; | |
308 p->print(p, "(%s/%D", ego->nam, ego->vl); | |
309 for (i = 0; i < ego->rnk; ++i) | |
310 p->print(p, "%v", ego->d[i].n); | |
311 p->print(p, ")"); | |
312 } | |
313 | |
314 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) | |
315 { | |
316 const problem_rdft *p; | |
317 const S *ego = (const S *) ego_; | |
318 P *pln; | |
319 int retval; | |
320 | |
321 static const plan_adt padt = { | |
322 X(rdft_solve), X(null_awake), print, X(plan_null_destroy) | |
323 }; | |
324 | |
325 UNUSED(plnr); | |
326 | |
327 if (!applicable(ego, p_)) | |
328 return (plan *) 0; | |
329 | |
330 p = (const problem_rdft *) p_; | |
331 pln = MKPLAN_RDFT(P, &padt, ego->apply); | |
332 | |
333 retval = fill_iodim(pln, p); | |
334 (void)retval; /* UNUSED unless DEBUG */ | |
335 A(retval); | |
336 A(pln->vl > 0); /* because FINITE_RNK(p->vecsz->rnk) holds */ | |
337 pln->nam = ego->nam; | |
338 | |
339 /* X(tensor_sz)(p->vecsz) loads, X(tensor_sz)(p->vecsz) stores */ | |
340 X(ops_other)(2 * X(tensor_sz)(p->vecsz), &pln->super.super.ops); | |
341 return &(pln->super.super); | |
342 } | |
343 | |
344 | |
345 void X(rdft_rank0_register)(planner *p) | |
346 { | |
347 unsigned i; | |
348 static struct { | |
349 rdftapply apply; | |
350 int (*applicable)(const P *, const problem_rdft *); | |
351 const char *nam; | |
352 } tab[] = { | |
353 { apply_memcpy, applicable_memcpy, "rdft-rank0-memcpy" }, | |
354 { apply_memcpy_loop, applicable_memcpy_loop, | |
355 "rdft-rank0-memcpy-loop" }, | |
356 { apply_iter, applicable_iter, "rdft-rank0-iter-ci" }, | |
357 { apply_cpy2dco, applicable_cpy2dco, "rdft-rank0-iter-co" }, | |
358 { apply_tiled, applicable_tiled, "rdft-rank0-tiled" }, | |
359 { apply_tiledbuf, applicable_tiledbuf, "rdft-rank0-tiledbuf" }, | |
360 { apply_ip_sq, applicable_ip_sq, "rdft-rank0-ip-sq" }, | |
361 { | |
362 apply_ip_sq_tiled, | |
363 applicable_ip_sq_tiled, | |
364 "rdft-rank0-ip-sq-tiled" | |
365 }, | |
366 { | |
367 apply_ip_sq_tiledbuf, | |
368 applicable_ip_sq_tiledbuf, | |
369 "rdft-rank0-ip-sq-tiledbuf" | |
370 }, | |
371 }; | |
372 | |
373 for (i = 0; i < sizeof(tab) / sizeof(tab[0]); ++i) { | |
374 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 }; | |
375 S *slv = MKSOLVER(S, &sadt); | |
376 slv->apply = tab[i].apply; | |
377 slv->applicable = tab[i].applicable; | |
378 slv->nam = tab[i].nam; | |
379 REGISTER_SOLVER(p, &(slv->super)); | |
380 } | |
381 } |