comparison src/fftw-3.3.3/rdft/vrank3-transpose.c @ 10:37bf6b4a2645

Add FFTW3
author Chris Cannam
date Wed, 20 Mar 2013 15:35:50 +0000
parents
children
comparison
equal deleted inserted replaced
9:c0fb53affa76 10:37bf6b4a2645
1 /*
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 */
20
21
22 /* rank-0, vector-rank-3, non-square in-place transposition
23 (see rank0.c for square transposition) */
24
25 #include "rdft.h"
26
27 #ifdef HAVE_STRING_H
28 #include <string.h> /* for memcpy() */
29 #endif
30
31 struct P_s;
32
33 typedef struct {
34 rdftapply apply;
35 int (*applicable)(const problem_rdft *p, planner *plnr,
36 int dim0, int dim1, int dim2, INT *nbuf);
37 int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
38 const char *nam;
39 } transpose_adt;
40
41 typedef struct {
42 solver super;
43 const transpose_adt *adt;
44 } S;
45
46 typedef struct P_s {
47 plan_rdft super;
48 INT n, m, vl; /* transpose n x m matrix of vl-tuples */
49 INT nbuf; /* buffer size */
50 INT nd, md, d; /* transpose-gcd params */
51 INT nc, mc; /* transpose-cut params */
52 plan *cld1, *cld2, *cld3; /* children, null if unused */
53 const S *slv;
54 } P;
55
56
57 /*************************************************************************/
58 /* some utilities for the solvers */
59
60 static INT gcd(INT a, INT b)
61 {
62 INT r;
63 do {
64 r = a % b;
65 a = b;
66 b = r;
67 } while (r != 0);
68
69 return a;
70 }
71
72 /* whether we can transpose with one of our routines expecting
73 contiguous Ntuples */
74 static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
75 {
76 return (vs == 1 && b->is == vl && a->os == vl &&
77 ((a->n == b->n && a->is == b->os
78 && a->is >= b->n && a->is % vl == 0)
79 || (a->is == b->n * vl && b->os == a->n * vl)));
80 }
81
82 /* check whether a and b correspond to the first and second dimensions
83 of a transpose of tuples with vector length = vl, stride = vs. */
84 static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
85 {
86 return ((a->n == b->n && a->os == b->is && a->is == b->os)
87 || Ntuple_transposable(a, b, vl, vs));
88 }
89
90 static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
91 {
92 int dim0, dim1;
93
94 for (dim0 = 0; dim0 < s->rnk; ++dim0)
95 for (dim1 = 0; dim1 < s->rnk; ++dim1) {
96 int dim2 = 3 - dim0 - dim1;
97 if (dim0 == dim1) continue;
98 if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
99 && transposable(s->dims + dim0, s->dims + dim1,
100 s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
101 s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
102 *pdim0 = dim0;
103 *pdim1 = dim1;
104 *pdim2 = dim2;
105 return 1;
106 }
107 }
108 return 0;
109 }
110
111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
112 #define MAXBUF 65536 /* maximum non-ugly buffer */
113
114 /* generic applicability function */
115 static int applicable(const solver *ego_, const problem *p_, planner *plnr,
116 int *dim0, int *dim1, int *dim2, INT *nbuf)
117 {
118 const S *ego = (const S *) ego_;
119 const problem_rdft *p = (const problem_rdft *) p_;
120
121 return (1
122 && p->I == p->O
123 && p->sz->rnk == 0
124 && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
125
126 && pickdim(p->vecsz, dim0, dim1, dim2)
127
128 /* UGLY if vecloop in wrong order for locality */
129 && (!NO_UGLYP(plnr) ||
130 p->vecsz->rnk == 2 ||
131 X(iabs)(p->vecsz->dims[*dim2].is)
132 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
133 X(iabs)(p->vecsz->dims[*dim0].os)))
134
135 /* SLOW if non-square */
136 && (!NO_SLOWP(plnr)
137 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
138
139 && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
140
141 /* buffers too big are UGLY */
142 && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
143 || *nbuf <= MAXBUF
144 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
145 );
146 }
147
148 static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
149 {
150 if (p->vecsz->rnk == 2) {
151 *vl = 1; *vs = 1;
152 }
153 else {
154 *vl = p->vecsz->dims[dim2].n;
155 *vs = p->vecsz->dims[dim2].is; /* == os */
156 }
157 }
158
159 /*************************************************************************/
160 /* Cache-oblivious in-place transpose of non-square matrices, based
161 on transposes of blocks given by the gcd of the dimensions.
162
163 This algorithm is related to algorithm V5 from Murray Dow,
164 "Transposing a matrix on a vector computer," Parallel Computing 21
165 (12), 1997-2005 (1995), with the modification that we use
166 cache-oblivious recursive transpose subroutines (and we derived
167 it independently).
168
169 For a p x q matrix, this requires scratch space equal to the size
170 of the matrix divided by gcd(p,q). Alternatively, see also the
171 "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
172
173 static void apply_gcd(const plan *ego_, R *I, R *O)
174 {
175 const P *ego = (const P *) ego_;
176 INT n = ego->nd, m = ego->md, d = ego->d;
177 INT vl = ego->vl;
178 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
179 INT i, num_el = n*m*d*vl;
180
181 A(ego->n == n * d && ego->m == m * d);
182 UNUSED(O);
183
184 /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
185 of vl-tuples and buf contains n*m*d*vl elements.
186
187 In general, to transpose a p x q matrix, you should call this
188 routine with d = gcd(p, q), n = p/d, and m = q/d. */
189
190 A(n > 0 && m > 0 && vl > 0);
191 A(d > 1);
192
193 /* treat as (d x n) x (d' x m) matrix. (d' = d) */
194
195 /* First, transpose d x (n x d') x m to d x (d' x n) x m,
196 using the buf matrix. This consists of d transposes
197 of contiguous n x d' matrices of m-tuples. */
198 if (n > 1) {
199 rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
200 for (i = 0; i < d; ++i) {
201 cldapply(ego->cld1, I + i*num_el, buf);
202 memcpy(I + i*num_el, buf, num_el*sizeof(R));
203 }
204 }
205
206 /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
207 is a square in-place transpose of n*m-tuples: */
208 {
209 rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
210 cldapply(ego->cld2, I, I);
211 }
212
213 /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
214 using the buf matrix. This consists of d' transposes
215 of contiguous d*n x m matrices. */
216 if (m > 1) {
217 rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
218 for (i = 0; i < d; ++i) {
219 cldapply(ego->cld3, I + i*num_el, buf);
220 memcpy(I + i*num_el, buf, num_el*sizeof(R));
221 }
222 }
223
224 X(ifree)(buf);
225 }
226
227 static int applicable_gcd(const problem_rdft *p, planner *plnr,
228 int dim0, int dim1, int dim2, INT *nbuf)
229 {
230 INT n = p->vecsz->dims[dim0].n;
231 INT m = p->vecsz->dims[dim1].n;
232 INT d, vl, vs;
233 get_transpose_vec(p, dim2, &vl, &vs);
234 d = gcd(n, m);
235 *nbuf = n * (m / d) * vl;
236 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
237 && n != m
238 && d > 1
239 && Ntuple_transposable(p->vecsz->dims + dim0,
240 p->vecsz->dims + dim1,
241 vl, vs));
242 }
243
244 static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
245 {
246 INT n = ego->nd, m = ego->md, d = ego->d;
247 INT vl = ego->vl;
248 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
249 INT num_el = n*m*d*vl;
250
251 if (n > 1) {
252 ego->cld1 = X(mkplan_d)(plnr,
253 X(mkproblem_rdft_0_d)(
254 X(mktensor_3d)(n, d*m*vl, m*vl,
255 d, m*vl, n*m*vl,
256 m*vl, 1, 1),
257 TAINT(p->I, num_el), buf));
258 if (!ego->cld1)
259 goto nada;
260 X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
261 &ego->super.super.ops);
262 ego->super.super.ops.other += num_el * d * 2;
263 }
264
265 ego->cld2 = X(mkplan_d)(plnr,
266 X(mkproblem_rdft_0_d)(
267 X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
268 d, n*m*vl, d*n*m*vl,
269 n*m*vl, 1, 1),
270 p->I, p->I));
271 if (!ego->cld2)
272 goto nada;
273 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
274
275 if (m > 1) {
276 ego->cld3 = X(mkplan_d)(plnr,
277 X(mkproblem_rdft_0_d)(
278 X(mktensor_3d)(d*n, m*vl, vl,
279 m, vl, d*n*vl,
280 vl, 1, 1),
281 TAINT(p->I, num_el), buf));
282 if (!ego->cld3)
283 goto nada;
284 X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
285 ego->super.super.ops.other += num_el * d * 2;
286 }
287
288 X(ifree)(buf);
289 return 1;
290
291 nada:
292 X(ifree)(buf);
293 return 0;
294 }
295
296 static const transpose_adt adt_gcd =
297 {
298 apply_gcd, applicable_gcd, mkcldrn_gcd,
299 "rdft-transpose-gcd"
300 };
301
302 /*************************************************************************/
303 /* Cache-oblivious in-place transpose of non-square n x m matrices,
304 based on transposing a sub-matrix first and then transposing the
305 remainder(s) with the help of a buffer. See also transpose-gcd,
306 above, if gcd(n,m) is large.
307
308 This algorithm is related to algorithm V3 from Murray Dow,
309 "Transposing a matrix on a vector computer," Parallel Computing 21
310 (12), 1997-2005 (1995), with the modifications that we use
311 cache-oblivious recursive transpose subroutines and we have the
312 generalization for large |n-m| below.
313
314 The best case, and the one described by Dow, is for |n-m| small, in
315 which case we transpose a square sub-matrix of size min(n,m),
316 handling the remainder via a buffer. This requires scratch space
317 equal to the size of the matrix times |n-m| / max(n,m).
318
319 As a generalization when |n-m| is not small, we also support cutting
320 *both* dimensions to an nc x mc matrix which is *not* necessarily
321 square, but has a large gcd (and can therefore use transpose-gcd).
322 */
323
324 static void apply_cut(const plan *ego_, R *I, R *O)
325 {
326 const P *ego = (const P *) ego_;
327 INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
328 INT i;
329 R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
330 UNUSED(O);
331
332 if (m > mc) {
333 ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
334 for (i = 0; i < nc; ++i)
335 memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
336 }
337
338 ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
339
340 if (n > nc) {
341 R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
342 memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
343 for (i = mc-1; i >= 0; --i)
344 memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
345 ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
346 }
347
348 if (m > mc) {
349 if (n > nc)
350 for (i = mc; i < m; ++i)
351 memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
352 (nc*vl)*sizeof(R));
353 else
354 memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
355 }
356
357 X(ifree)(buf1);
358 }
359
360 /* only cut one dimension if the resulting buffer is small enough */
361 static int cut1(INT n, INT m, INT vl)
362 {
363 return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
364 || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
365 }
366
367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
368
369 static int applicable_cut(const problem_rdft *p, planner *plnr,
370 int dim0, int dim1, int dim2, INT *nbuf)
371 {
372 INT n = p->vecsz->dims[dim0].n;
373 INT m = p->vecsz->dims[dim1].n;
374 INT vl, vs;
375 get_transpose_vec(p, dim2, &vl, &vs);
376 *nbuf = 0; /* always small enough to be non-UGLY (?) */
377 A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
378 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
379 && n != m
380
381 /* Don't call transpose-cut recursively (avoid inf. loops):
382 the non-square sub-transpose produced when !cut1
383 should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
384 for which transpose-gcd is applicable */
385 && (cut1(n, m, vl)
386 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
387
388 && Ntuple_transposable(p->vecsz->dims + dim0,
389 p->vecsz->dims + dim1,
390 vl, vs));
391 }
392
393 static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
394 {
395 INT n = ego->n, m = ego->m, nc, mc;
396 INT vl = ego->vl;
397 R *buf;
398
399 /* pick the "best" cut */
400 if (cut1(n, m, vl)) {
401 nc = mc = X(imin)(n,m);
402 }
403 else {
404 INT dc, ns, ms;
405 dc = gcd(m, n); nc = n; mc = m;
406 /* search for cut with largest gcd
407 (TODO: different optimality criteria? different search range?) */
408 for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
409 for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
410 INT ds = gcd(ms, ns);
411 if (ds > dc) {
412 dc = ds; nc = ns; mc = ms;
413 if (dc == X(imin)(ns, ms))
414 break; /* cannot get larger than this */
415 }
416 }
417 if (dc == X(imin)(n, ms))
418 break; /* cannot get larger than this */
419 }
420 A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
421 }
422 ego->nc = nc;
423 ego->mc = mc;
424 ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
425
426 buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
427
428 if (m > mc) {
429 ego->cld1 = X(mkplan_d)(plnr,
430 X(mkproblem_rdft_0_d)(
431 X(mktensor_3d)(nc, m*vl, vl,
432 m-mc, vl, nc*vl,
433 vl, 1, 1),
434 p->I + mc*vl, buf));
435 if (!ego->cld1)
436 goto nada;
437 X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
438 }
439
440 ego->cld2 = X(mkplan_d)(plnr,
441 X(mkproblem_rdft_0_d)(
442 X(mktensor_3d)(nc, mc*vl, vl,
443 mc, vl, nc*vl,
444 vl, 1, 1),
445 p->I, p->I));
446 if (!ego->cld2)
447 goto nada;
448 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
449
450 if (n > nc) {
451 ego->cld3 = X(mkplan_d)(plnr,
452 X(mkproblem_rdft_0_d)(
453 X(mktensor_3d)(n-nc, m*vl, vl,
454 m, vl, n*vl,
455 vl, 1, 1),
456 buf + (m-mc)*(nc*vl), p->I + nc*vl));
457 if (!ego->cld3)
458 goto nada;
459 X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
460 }
461
462 /* memcpy/memmove operations */
463 ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
464 + (n-nc)*m + (m-mc)*nc);
465
466 X(ifree)(buf);
467 return 1;
468
469 nada:
470 X(ifree)(buf);
471 return 0;
472 }
473
474 static const transpose_adt adt_cut =
475 {
476 apply_cut, applicable_cut, mkcldrn_cut,
477 "rdft-transpose-cut"
478 };
479
480 /*************************************************************************/
481 /* In-place transpose routine from TOMS, which follows the cycles of
482 the permutation so that it writes to each location only once.
483 Because of cache-line and other issues, however, this routine is
484 typically much slower than transpose-gcd or transpose-cut, even
485 though the latter do some extra writes. On the other hand, if the
486 vector length is large then the TOMS routine is best.
487
488 The TOMS routine also has the advantage of requiring less buffer
489 space for the case of gcd(nx,ny) small. However, in this case it
490 has been superseded by the combination of the generalized
491 transpose-cut method with the transpose-gcd method, which can
492 always transpose with buffers a small fraction of the array size
493 regardless of gcd(nx,ny). */
494
495 /*
496 * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380).
497 *
498 * These routines do in-place transposes of arrays.
499 *
500 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software,
501 * vol. 3, no. 1, 104-110 (1977) ]
502 *
503 * C version by Steven G. Johnson (February 1997).
504 */
505
506 /*
507 * "a" is a 1D array of length ny*nx*N which constains the nx x ny
508 * matrix of N-tuples to be transposed. "a" is stored in row-major
509 * order (last index varies fastest). move is a 1D array of length
510 * move_size used to store information to speed up the process. The
511 * value move_size=(ny+nx)/2 is recommended. buf should be an array
512 * of length 2*N.
513 *
514 */
515
516 static void transpose_toms513(R *a, INT nx, INT ny, INT N,
517 char *move, INT move_size, R *buf)
518 {
519 INT i, im, mn;
520 R *b, *c, *d;
521 INT ncount;
522 INT k;
523
524 /* check arguments and initialize: */
525 A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
526
527 b = buf;
528
529 /* Cate & Twigg have a special case for nx == ny, but we don't
530 bother, since we already have special code for this case elsewhere. */
531
532 c = buf + N;
533 ncount = 2; /* always at least 2 fixed points */
534 k = (mn = ny * nx) - 1;
535
536 for (i = 0; i < move_size; ++i)
537 move[i] = 0;
538
539 if (ny >= 3 && nx >= 3)
540 ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */
541
542 i = 1;
543 im = ny;
544
545 while (1) {
546 INT i1, i2, i1c, i2c;
547 INT kmi;
548
549 /** Rearrange the elements of a loop
550 and its companion loop: **/
551
552 i1 = i;
553 kmi = k - i;
554 i1c = kmi;
555 switch (N) {
556 case 1:
557 b[0] = a[i1];
558 c[0] = a[i1c];
559 break;
560 case 2:
561 b[0] = a[2*i1];
562 b[1] = a[2*i1+1];
563 c[0] = a[2*i1c];
564 c[1] = a[2*i1c+1];
565 break;
566 default:
567 memcpy(b, &a[N * i1], N * sizeof(R));
568 memcpy(c, &a[N * i1c], N * sizeof(R));
569 }
570 while (1) {
571 i2 = ny * i1 - k * (i1 / nx);
572 i2c = k - i2;
573 if (i1 < move_size)
574 move[i1] = 1;
575 if (i1c < move_size)
576 move[i1c] = 1;
577 ncount += 2;
578 if (i2 == i)
579 break;
580 if (i2 == kmi) {
581 d = b;
582 b = c;
583 c = d;
584 break;
585 }
586 switch (N) {
587 case 1:
588 a[i1] = a[i2];
589 a[i1c] = a[i2c];
590 break;
591 case 2:
592 a[2*i1] = a[2*i2];
593 a[2*i1+1] = a[2*i2+1];
594 a[2*i1c] = a[2*i2c];
595 a[2*i1c+1] = a[2*i2c+1];
596 break;
597 default:
598 memcpy(&a[N * i1], &a[N * i2],
599 N * sizeof(R));
600 memcpy(&a[N * i1c], &a[N * i2c],
601 N * sizeof(R));
602 }
603 i1 = i2;
604 i1c = i2c;
605 }
606 switch (N) {
607 case 1:
608 a[i1] = b[0];
609 a[i1c] = c[0];
610 break;
611 case 2:
612 a[2*i1] = b[0];
613 a[2*i1+1] = b[1];
614 a[2*i1c] = c[0];
615 a[2*i1c+1] = c[1];
616 break;
617 default:
618 memcpy(&a[N * i1], b, N * sizeof(R));
619 memcpy(&a[N * i1c], c, N * sizeof(R));
620 }
621 if (ncount >= mn)
622 break; /* we've moved all elements */
623
624 /** Search for loops to rearrange: **/
625
626 while (1) {
627 INT max = k - i;
628 ++i;
629 A(i <= max);
630 im += ny;
631 if (im > k)
632 im -= k;
633 i2 = im;
634 if (i == i2)
635 continue;
636 if (i >= move_size) {
637 while (i2 > i && i2 < max) {
638 i1 = i2;
639 i2 = ny * i1 - k * (i1 / nx);
640 }
641 if (i2 == i)
642 break;
643 } else if (!move[i])
644 break;
645 }
646 }
647 }
648
649 static void apply_toms513(const plan *ego_, R *I, R *O)
650 {
651 const P *ego = (const P *) ego_;
652 INT n = ego->n, m = ego->m;
653 INT vl = ego->vl;
654 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
655 UNUSED(O);
656 transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
657 X(ifree)(buf);
658 }
659
660 static int applicable_toms513(const problem_rdft *p, planner *plnr,
661 int dim0, int dim1, int dim2, INT *nbuf)
662 {
663 INT n = p->vecsz->dims[dim0].n;
664 INT m = p->vecsz->dims[dim1].n;
665 INT vl, vs;
666 get_transpose_vec(p, dim2, &vl, &vs);
667 *nbuf = 2*vl
668 + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
669 return (!NO_SLOWP(plnr)
670 && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
671 && n != m
672 && Ntuple_transposable(p->vecsz->dims + dim0,
673 p->vecsz->dims + dim1,
674 vl, vs));
675 }
676
677 static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
678 {
679 UNUSED(p); UNUSED(plnr);
680 /* heuristic so that TOMS algorithm is last resort for small vl */
681 ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
682 return 1;
683 }
684
685 static const transpose_adt adt_toms513 =
686 {
687 apply_toms513, applicable_toms513, mkcldrn_toms513,
688 "rdft-transpose-toms513"
689 };
690
691 /*-----------------------------------------------------------------------*/
692 /*-----------------------------------------------------------------------*/
693 /* generic stuff: */
694
695 static void awake(plan *ego_, enum wakefulness wakefulness)
696 {
697 P *ego = (P *) ego_;
698 X(plan_awake)(ego->cld1, wakefulness);
699 X(plan_awake)(ego->cld2, wakefulness);
700 X(plan_awake)(ego->cld3, wakefulness);
701 }
702
703 static void print(const plan *ego_, printer *p)
704 {
705 const P *ego = (const P *) ego_;
706 p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
707 ego->n, ego->m, ego->vl);
708 if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
709 if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
710 if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
711 p->print(p, ")");
712 }
713
714 static void destroy(plan *ego_)
715 {
716 P *ego = (P *) ego_;
717 X(plan_destroy_internal)(ego->cld3);
718 X(plan_destroy_internal)(ego->cld2);
719 X(plan_destroy_internal)(ego->cld1);
720 }
721
722 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
723 {
724 const S *ego = (const S *) ego_;
725 const problem_rdft *p;
726 int dim0, dim1, dim2;
727 INT nbuf, vs;
728 P *pln;
729
730 static const plan_adt padt = {
731 X(rdft_solve), awake, print, destroy
732 };
733
734 if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
735 return (plan *) 0;
736
737 p = (const problem_rdft *) p_;
738 pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
739
740 pln->n = p->vecsz->dims[dim0].n;
741 pln->m = p->vecsz->dims[dim1].n;
742 get_transpose_vec(p, dim2, &pln->vl, &vs);
743 pln->nbuf = nbuf;
744 pln->d = gcd(pln->n, pln->m);
745 pln->nd = pln->n / pln->d;
746 pln->md = pln->m / pln->d;
747 pln->slv = ego;
748
749 X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
750
751 pln->cld1 = pln->cld2 = pln->cld3 = 0;
752 if (!ego->adt->mkcldrn(p, plnr, pln)) {
753 X(plan_destroy_internal)(&(pln->super.super));
754 return 0;
755 }
756
757 return &(pln->super.super);
758 }
759
760 static solver *mksolver(const transpose_adt *adt)
761 {
762 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
763 S *slv = MKSOLVER(S, &sadt);
764 slv->adt = adt;
765 return &(slv->super);
766 }
767
768 void X(rdft_vrank3_transpose_register)(planner *p)
769 {
770 unsigned i;
771 static const transpose_adt *const adts[] = {
772 &adt_gcd, &adt_cut,
773 &adt_toms513
774 };
775 for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
776 REGISTER_SOLVER(p, mksolver(adts[i]));
777 }