Chris@10
|
1 /*
|
Chris@10
|
2 * Copyright (c) 2003, 2007-11 Matteo Frigo
|
Chris@10
|
3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
|
Chris@10
|
4 *
|
Chris@10
|
5 * This program is free software; you can redistribute it and/or modify
|
Chris@10
|
6 * it under the terms of the GNU General Public License as published by
|
Chris@10
|
7 * the Free Software Foundation; either version 2 of the License, or
|
Chris@10
|
8 * (at your option) any later version.
|
Chris@10
|
9 *
|
Chris@10
|
10 * This program is distributed in the hope that it will be useful,
|
Chris@10
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Chris@10
|
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
Chris@10
|
13 * GNU General Public License for more details.
|
Chris@10
|
14 *
|
Chris@10
|
15 * You should have received a copy of the GNU General Public License
|
Chris@10
|
16 * along with this program; if not, write to the Free Software
|
Chris@10
|
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Chris@10
|
18 *
|
Chris@10
|
19 */
|
Chris@10
|
20
|
Chris@10
|
21
|
Chris@10
|
22 /* rank-0, vector-rank-3, non-square in-place transposition
|
Chris@10
|
23 (see rank0.c for square transposition) */
|
Chris@10
|
24
|
Chris@10
|
25 #include "rdft.h"
|
Chris@10
|
26
|
Chris@10
|
27 #ifdef HAVE_STRING_H
|
Chris@10
|
28 #include <string.h> /* for memcpy() */
|
Chris@10
|
29 #endif
|
Chris@10
|
30
|
Chris@10
|
31 struct P_s;
|
Chris@10
|
32
|
Chris@10
|
33 typedef struct {
|
Chris@10
|
34 rdftapply apply;
|
Chris@10
|
35 int (*applicable)(const problem_rdft *p, planner *plnr,
|
Chris@10
|
36 int dim0, int dim1, int dim2, INT *nbuf);
|
Chris@10
|
37 int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
|
Chris@10
|
38 const char *nam;
|
Chris@10
|
39 } transpose_adt;
|
Chris@10
|
40
|
Chris@10
|
41 typedef struct {
|
Chris@10
|
42 solver super;
|
Chris@10
|
43 const transpose_adt *adt;
|
Chris@10
|
44 } S;
|
Chris@10
|
45
|
Chris@10
|
46 typedef struct P_s {
|
Chris@10
|
47 plan_rdft super;
|
Chris@10
|
48 INT n, m, vl; /* transpose n x m matrix of vl-tuples */
|
Chris@10
|
49 INT nbuf; /* buffer size */
|
Chris@10
|
50 INT nd, md, d; /* transpose-gcd params */
|
Chris@10
|
51 INT nc, mc; /* transpose-cut params */
|
Chris@10
|
52 plan *cld1, *cld2, *cld3; /* children, null if unused */
|
Chris@10
|
53 const S *slv;
|
Chris@10
|
54 } P;
|
Chris@10
|
55
|
Chris@10
|
56
|
Chris@10
|
57 /*************************************************************************/
|
Chris@10
|
58 /* some utilities for the solvers */
|
Chris@10
|
59
|
Chris@10
|
60 static INT gcd(INT a, INT b)
|
Chris@10
|
61 {
|
Chris@10
|
62 INT r;
|
Chris@10
|
63 do {
|
Chris@10
|
64 r = a % b;
|
Chris@10
|
65 a = b;
|
Chris@10
|
66 b = r;
|
Chris@10
|
67 } while (r != 0);
|
Chris@10
|
68
|
Chris@10
|
69 return a;
|
Chris@10
|
70 }
|
Chris@10
|
71
|
Chris@10
|
72 /* whether we can transpose with one of our routines expecting
|
Chris@10
|
73 contiguous Ntuples */
|
Chris@10
|
74 static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
|
Chris@10
|
75 {
|
Chris@10
|
76 return (vs == 1 && b->is == vl && a->os == vl &&
|
Chris@10
|
77 ((a->n == b->n && a->is == b->os
|
Chris@10
|
78 && a->is >= b->n && a->is % vl == 0)
|
Chris@10
|
79 || (a->is == b->n * vl && b->os == a->n * vl)));
|
Chris@10
|
80 }
|
Chris@10
|
81
|
Chris@10
|
82 /* check whether a and b correspond to the first and second dimensions
|
Chris@10
|
83 of a transpose of tuples with vector length = vl, stride = vs. */
|
Chris@10
|
84 static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
|
Chris@10
|
85 {
|
Chris@10
|
86 return ((a->n == b->n && a->os == b->is && a->is == b->os)
|
Chris@10
|
87 || Ntuple_transposable(a, b, vl, vs));
|
Chris@10
|
88 }
|
Chris@10
|
89
|
Chris@10
|
90 static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
|
Chris@10
|
91 {
|
Chris@10
|
92 int dim0, dim1;
|
Chris@10
|
93
|
Chris@10
|
94 for (dim0 = 0; dim0 < s->rnk; ++dim0)
|
Chris@10
|
95 for (dim1 = 0; dim1 < s->rnk; ++dim1) {
|
Chris@10
|
96 int dim2 = 3 - dim0 - dim1;
|
Chris@10
|
97 if (dim0 == dim1) continue;
|
Chris@10
|
98 if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
|
Chris@10
|
99 && transposable(s->dims + dim0, s->dims + dim1,
|
Chris@10
|
100 s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
|
Chris@10
|
101 s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
|
Chris@10
|
102 *pdim0 = dim0;
|
Chris@10
|
103 *pdim1 = dim1;
|
Chris@10
|
104 *pdim2 = dim2;
|
Chris@10
|
105 return 1;
|
Chris@10
|
106 }
|
Chris@10
|
107 }
|
Chris@10
|
108 return 0;
|
Chris@10
|
109 }
|
Chris@10
|
110
|
Chris@10
|
111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
|
Chris@10
|
112 #define MAXBUF 65536 /* maximum non-ugly buffer */
|
Chris@10
|
113
|
Chris@10
|
114 /* generic applicability function */
|
Chris@10
|
115 static int applicable(const solver *ego_, const problem *p_, planner *plnr,
|
Chris@10
|
116 int *dim0, int *dim1, int *dim2, INT *nbuf)
|
Chris@10
|
117 {
|
Chris@10
|
118 const S *ego = (const S *) ego_;
|
Chris@10
|
119 const problem_rdft *p = (const problem_rdft *) p_;
|
Chris@10
|
120
|
Chris@10
|
121 return (1
|
Chris@10
|
122 && p->I == p->O
|
Chris@10
|
123 && p->sz->rnk == 0
|
Chris@10
|
124 && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
|
Chris@10
|
125
|
Chris@10
|
126 && pickdim(p->vecsz, dim0, dim1, dim2)
|
Chris@10
|
127
|
Chris@10
|
128 /* UGLY if vecloop in wrong order for locality */
|
Chris@10
|
129 && (!NO_UGLYP(plnr) ||
|
Chris@10
|
130 p->vecsz->rnk == 2 ||
|
Chris@10
|
131 X(iabs)(p->vecsz->dims[*dim2].is)
|
Chris@10
|
132 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
|
Chris@10
|
133 X(iabs)(p->vecsz->dims[*dim0].os)))
|
Chris@10
|
134
|
Chris@10
|
135 /* SLOW if non-square */
|
Chris@10
|
136 && (!NO_SLOWP(plnr)
|
Chris@10
|
137 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
|
Chris@10
|
138
|
Chris@10
|
139 && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
|
Chris@10
|
140
|
Chris@10
|
141 /* buffers too big are UGLY */
|
Chris@10
|
142 && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
|
Chris@10
|
143 || *nbuf <= MAXBUF
|
Chris@10
|
144 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
|
Chris@10
|
145 );
|
Chris@10
|
146 }
|
Chris@10
|
147
|
Chris@10
|
148 static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
|
Chris@10
|
149 {
|
Chris@10
|
150 if (p->vecsz->rnk == 2) {
|
Chris@10
|
151 *vl = 1; *vs = 1;
|
Chris@10
|
152 }
|
Chris@10
|
153 else {
|
Chris@10
|
154 *vl = p->vecsz->dims[dim2].n;
|
Chris@10
|
155 *vs = p->vecsz->dims[dim2].is; /* == os */
|
Chris@10
|
156 }
|
Chris@10
|
157 }
|
Chris@10
|
158
|
Chris@10
|
159 /*************************************************************************/
|
Chris@10
|
160 /* Cache-oblivious in-place transpose of non-square matrices, based
|
Chris@10
|
161 on transposes of blocks given by the gcd of the dimensions.
|
Chris@10
|
162
|
Chris@10
|
163 This algorithm is related to algorithm V5 from Murray Dow,
|
Chris@10
|
164 "Transposing a matrix on a vector computer," Parallel Computing 21
|
Chris@10
|
165 (12), 1997-2005 (1995), with the modification that we use
|
Chris@10
|
166 cache-oblivious recursive transpose subroutines (and we derived
|
Chris@10
|
167 it independently).
|
Chris@10
|
168
|
Chris@10
|
169 For a p x q matrix, this requires scratch space equal to the size
|
Chris@10
|
170 of the matrix divided by gcd(p,q). Alternatively, see also the
|
Chris@10
|
171 "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
|
Chris@10
|
172
|
Chris@10
|
173 static void apply_gcd(const plan *ego_, R *I, R *O)
|
Chris@10
|
174 {
|
Chris@10
|
175 const P *ego = (const P *) ego_;
|
Chris@10
|
176 INT n = ego->nd, m = ego->md, d = ego->d;
|
Chris@10
|
177 INT vl = ego->vl;
|
Chris@10
|
178 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
|
Chris@10
|
179 INT i, num_el = n*m*d*vl;
|
Chris@10
|
180
|
Chris@10
|
181 A(ego->n == n * d && ego->m == m * d);
|
Chris@10
|
182 UNUSED(O);
|
Chris@10
|
183
|
Chris@10
|
184 /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
|
Chris@10
|
185 of vl-tuples and buf contains n*m*d*vl elements.
|
Chris@10
|
186
|
Chris@10
|
187 In general, to transpose a p x q matrix, you should call this
|
Chris@10
|
188 routine with d = gcd(p, q), n = p/d, and m = q/d. */
|
Chris@10
|
189
|
Chris@10
|
190 A(n > 0 && m > 0 && vl > 0);
|
Chris@10
|
191 A(d > 1);
|
Chris@10
|
192
|
Chris@10
|
193 /* treat as (d x n) x (d' x m) matrix. (d' = d) */
|
Chris@10
|
194
|
Chris@10
|
195 /* First, transpose d x (n x d') x m to d x (d' x n) x m,
|
Chris@10
|
196 using the buf matrix. This consists of d transposes
|
Chris@10
|
197 of contiguous n x d' matrices of m-tuples. */
|
Chris@10
|
198 if (n > 1) {
|
Chris@10
|
199 rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
|
Chris@10
|
200 for (i = 0; i < d; ++i) {
|
Chris@10
|
201 cldapply(ego->cld1, I + i*num_el, buf);
|
Chris@10
|
202 memcpy(I + i*num_el, buf, num_el*sizeof(R));
|
Chris@10
|
203 }
|
Chris@10
|
204 }
|
Chris@10
|
205
|
Chris@10
|
206 /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
|
Chris@10
|
207 is a square in-place transpose of n*m-tuples: */
|
Chris@10
|
208 {
|
Chris@10
|
209 rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
|
Chris@10
|
210 cldapply(ego->cld2, I, I);
|
Chris@10
|
211 }
|
Chris@10
|
212
|
Chris@10
|
213 /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
|
Chris@10
|
214 using the buf matrix. This consists of d' transposes
|
Chris@10
|
215 of contiguous d*n x m matrices. */
|
Chris@10
|
216 if (m > 1) {
|
Chris@10
|
217 rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
|
Chris@10
|
218 for (i = 0; i < d; ++i) {
|
Chris@10
|
219 cldapply(ego->cld3, I + i*num_el, buf);
|
Chris@10
|
220 memcpy(I + i*num_el, buf, num_el*sizeof(R));
|
Chris@10
|
221 }
|
Chris@10
|
222 }
|
Chris@10
|
223
|
Chris@10
|
224 X(ifree)(buf);
|
Chris@10
|
225 }
|
Chris@10
|
226
|
Chris@10
|
227 static int applicable_gcd(const problem_rdft *p, planner *plnr,
|
Chris@10
|
228 int dim0, int dim1, int dim2, INT *nbuf)
|
Chris@10
|
229 {
|
Chris@10
|
230 INT n = p->vecsz->dims[dim0].n;
|
Chris@10
|
231 INT m = p->vecsz->dims[dim1].n;
|
Chris@10
|
232 INT d, vl, vs;
|
Chris@10
|
233 get_transpose_vec(p, dim2, &vl, &vs);
|
Chris@10
|
234 d = gcd(n, m);
|
Chris@10
|
235 *nbuf = n * (m / d) * vl;
|
Chris@10
|
236 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
|
Chris@10
|
237 && n != m
|
Chris@10
|
238 && d > 1
|
Chris@10
|
239 && Ntuple_transposable(p->vecsz->dims + dim0,
|
Chris@10
|
240 p->vecsz->dims + dim1,
|
Chris@10
|
241 vl, vs));
|
Chris@10
|
242 }
|
Chris@10
|
243
|
Chris@10
|
244 static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
|
Chris@10
|
245 {
|
Chris@10
|
246 INT n = ego->nd, m = ego->md, d = ego->d;
|
Chris@10
|
247 INT vl = ego->vl;
|
Chris@10
|
248 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
|
Chris@10
|
249 INT num_el = n*m*d*vl;
|
Chris@10
|
250
|
Chris@10
|
251 if (n > 1) {
|
Chris@10
|
252 ego->cld1 = X(mkplan_d)(plnr,
|
Chris@10
|
253 X(mkproblem_rdft_0_d)(
|
Chris@10
|
254 X(mktensor_3d)(n, d*m*vl, m*vl,
|
Chris@10
|
255 d, m*vl, n*m*vl,
|
Chris@10
|
256 m*vl, 1, 1),
|
Chris@10
|
257 TAINT(p->I, num_el), buf));
|
Chris@10
|
258 if (!ego->cld1)
|
Chris@10
|
259 goto nada;
|
Chris@10
|
260 X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
|
Chris@10
|
261 &ego->super.super.ops);
|
Chris@10
|
262 ego->super.super.ops.other += num_el * d * 2;
|
Chris@10
|
263 }
|
Chris@10
|
264
|
Chris@10
|
265 ego->cld2 = X(mkplan_d)(plnr,
|
Chris@10
|
266 X(mkproblem_rdft_0_d)(
|
Chris@10
|
267 X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
|
Chris@10
|
268 d, n*m*vl, d*n*m*vl,
|
Chris@10
|
269 n*m*vl, 1, 1),
|
Chris@10
|
270 p->I, p->I));
|
Chris@10
|
271 if (!ego->cld2)
|
Chris@10
|
272 goto nada;
|
Chris@10
|
273 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
|
Chris@10
|
274
|
Chris@10
|
275 if (m > 1) {
|
Chris@10
|
276 ego->cld3 = X(mkplan_d)(plnr,
|
Chris@10
|
277 X(mkproblem_rdft_0_d)(
|
Chris@10
|
278 X(mktensor_3d)(d*n, m*vl, vl,
|
Chris@10
|
279 m, vl, d*n*vl,
|
Chris@10
|
280 vl, 1, 1),
|
Chris@10
|
281 TAINT(p->I, num_el), buf));
|
Chris@10
|
282 if (!ego->cld3)
|
Chris@10
|
283 goto nada;
|
Chris@10
|
284 X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
|
Chris@10
|
285 ego->super.super.ops.other += num_el * d * 2;
|
Chris@10
|
286 }
|
Chris@10
|
287
|
Chris@10
|
288 X(ifree)(buf);
|
Chris@10
|
289 return 1;
|
Chris@10
|
290
|
Chris@10
|
291 nada:
|
Chris@10
|
292 X(ifree)(buf);
|
Chris@10
|
293 return 0;
|
Chris@10
|
294 }
|
Chris@10
|
295
|
Chris@10
|
296 static const transpose_adt adt_gcd =
|
Chris@10
|
297 {
|
Chris@10
|
298 apply_gcd, applicable_gcd, mkcldrn_gcd,
|
Chris@10
|
299 "rdft-transpose-gcd"
|
Chris@10
|
300 };
|
Chris@10
|
301
|
Chris@10
|
302 /*************************************************************************/
|
Chris@10
|
303 /* Cache-oblivious in-place transpose of non-square n x m matrices,
|
Chris@10
|
304 based on transposing a sub-matrix first and then transposing the
|
Chris@10
|
305 remainder(s) with the help of a buffer. See also transpose-gcd,
|
Chris@10
|
306 above, if gcd(n,m) is large.
|
Chris@10
|
307
|
Chris@10
|
308 This algorithm is related to algorithm V3 from Murray Dow,
|
Chris@10
|
309 "Transposing a matrix on a vector computer," Parallel Computing 21
|
Chris@10
|
310 (12), 1997-2005 (1995), with the modifications that we use
|
Chris@10
|
311 cache-oblivious recursive transpose subroutines and we have the
|
Chris@10
|
312 generalization for large |n-m| below.
|
Chris@10
|
313
|
Chris@10
|
314 The best case, and the one described by Dow, is for |n-m| small, in
|
Chris@10
|
315 which case we transpose a square sub-matrix of size min(n,m),
|
Chris@10
|
316 handling the remainder via a buffer. This requires scratch space
|
Chris@10
|
317 equal to the size of the matrix times |n-m| / max(n,m).
|
Chris@10
|
318
|
Chris@10
|
319 As a generalization when |n-m| is not small, we also support cutting
|
Chris@10
|
320 *both* dimensions to an nc x mc matrix which is *not* necessarily
|
Chris@10
|
321 square, but has a large gcd (and can therefore use transpose-gcd).
|
Chris@10
|
322 */
|
Chris@10
|
323
|
Chris@10
|
324 static void apply_cut(const plan *ego_, R *I, R *O)
|
Chris@10
|
325 {
|
Chris@10
|
326 const P *ego = (const P *) ego_;
|
Chris@10
|
327 INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
|
Chris@10
|
328 INT i;
|
Chris@10
|
329 R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
|
Chris@10
|
330 UNUSED(O);
|
Chris@10
|
331
|
Chris@10
|
332 if (m > mc) {
|
Chris@10
|
333 ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
|
Chris@10
|
334 for (i = 0; i < nc; ++i)
|
Chris@10
|
335 memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
|
Chris@10
|
336 }
|
Chris@10
|
337
|
Chris@10
|
338 ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
|
Chris@10
|
339
|
Chris@10
|
340 if (n > nc) {
|
Chris@10
|
341 R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
|
Chris@10
|
342 memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
|
Chris@10
|
343 for (i = mc-1; i >= 0; --i)
|
Chris@10
|
344 memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
|
Chris@10
|
345 ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
|
Chris@10
|
346 }
|
Chris@10
|
347
|
Chris@10
|
348 if (m > mc) {
|
Chris@10
|
349 if (n > nc)
|
Chris@10
|
350 for (i = mc; i < m; ++i)
|
Chris@10
|
351 memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
|
Chris@10
|
352 (nc*vl)*sizeof(R));
|
Chris@10
|
353 else
|
Chris@10
|
354 memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
|
Chris@10
|
355 }
|
Chris@10
|
356
|
Chris@10
|
357 X(ifree)(buf1);
|
Chris@10
|
358 }
|
Chris@10
|
359
|
Chris@10
|
360 /* only cut one dimension if the resulting buffer is small enough */
|
Chris@10
|
361 static int cut1(INT n, INT m, INT vl)
|
Chris@10
|
362 {
|
Chris@10
|
363 return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
|
Chris@10
|
364 || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
|
Chris@10
|
365 }
|
Chris@10
|
366
|
Chris@10
|
367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
|
Chris@10
|
368
|
Chris@10
|
369 static int applicable_cut(const problem_rdft *p, planner *plnr,
|
Chris@10
|
370 int dim0, int dim1, int dim2, INT *nbuf)
|
Chris@10
|
371 {
|
Chris@10
|
372 INT n = p->vecsz->dims[dim0].n;
|
Chris@10
|
373 INT m = p->vecsz->dims[dim1].n;
|
Chris@10
|
374 INT vl, vs;
|
Chris@10
|
375 get_transpose_vec(p, dim2, &vl, &vs);
|
Chris@10
|
376 *nbuf = 0; /* always small enough to be non-UGLY (?) */
|
Chris@10
|
377 A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
|
Chris@10
|
378 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
|
Chris@10
|
379 && n != m
|
Chris@10
|
380
|
Chris@10
|
381 /* Don't call transpose-cut recursively (avoid inf. loops):
|
Chris@10
|
382 the non-square sub-transpose produced when !cut1
|
Chris@10
|
383 should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
|
Chris@10
|
384 for which transpose-gcd is applicable */
|
Chris@10
|
385 && (cut1(n, m, vl)
|
Chris@10
|
386 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
|
Chris@10
|
387
|
Chris@10
|
388 && Ntuple_transposable(p->vecsz->dims + dim0,
|
Chris@10
|
389 p->vecsz->dims + dim1,
|
Chris@10
|
390 vl, vs));
|
Chris@10
|
391 }
|
Chris@10
|
392
|
Chris@10
|
393 static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
|
Chris@10
|
394 {
|
Chris@10
|
395 INT n = ego->n, m = ego->m, nc, mc;
|
Chris@10
|
396 INT vl = ego->vl;
|
Chris@10
|
397 R *buf;
|
Chris@10
|
398
|
Chris@10
|
399 /* pick the "best" cut */
|
Chris@10
|
400 if (cut1(n, m, vl)) {
|
Chris@10
|
401 nc = mc = X(imin)(n,m);
|
Chris@10
|
402 }
|
Chris@10
|
403 else {
|
Chris@10
|
404 INT dc, ns, ms;
|
Chris@10
|
405 dc = gcd(m, n); nc = n; mc = m;
|
Chris@10
|
406 /* search for cut with largest gcd
|
Chris@10
|
407 (TODO: different optimality criteria? different search range?) */
|
Chris@10
|
408 for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
|
Chris@10
|
409 for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
|
Chris@10
|
410 INT ds = gcd(ms, ns);
|
Chris@10
|
411 if (ds > dc) {
|
Chris@10
|
412 dc = ds; nc = ns; mc = ms;
|
Chris@10
|
413 if (dc == X(imin)(ns, ms))
|
Chris@10
|
414 break; /* cannot get larger than this */
|
Chris@10
|
415 }
|
Chris@10
|
416 }
|
Chris@10
|
417 if (dc == X(imin)(n, ms))
|
Chris@10
|
418 break; /* cannot get larger than this */
|
Chris@10
|
419 }
|
Chris@10
|
420 A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
|
Chris@10
|
421 }
|
Chris@10
|
422 ego->nc = nc;
|
Chris@10
|
423 ego->mc = mc;
|
Chris@10
|
424 ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
|
Chris@10
|
425
|
Chris@10
|
426 buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
|
Chris@10
|
427
|
Chris@10
|
428 if (m > mc) {
|
Chris@10
|
429 ego->cld1 = X(mkplan_d)(plnr,
|
Chris@10
|
430 X(mkproblem_rdft_0_d)(
|
Chris@10
|
431 X(mktensor_3d)(nc, m*vl, vl,
|
Chris@10
|
432 m-mc, vl, nc*vl,
|
Chris@10
|
433 vl, 1, 1),
|
Chris@10
|
434 p->I + mc*vl, buf));
|
Chris@10
|
435 if (!ego->cld1)
|
Chris@10
|
436 goto nada;
|
Chris@10
|
437 X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
|
Chris@10
|
438 }
|
Chris@10
|
439
|
Chris@10
|
440 ego->cld2 = X(mkplan_d)(plnr,
|
Chris@10
|
441 X(mkproblem_rdft_0_d)(
|
Chris@10
|
442 X(mktensor_3d)(nc, mc*vl, vl,
|
Chris@10
|
443 mc, vl, nc*vl,
|
Chris@10
|
444 vl, 1, 1),
|
Chris@10
|
445 p->I, p->I));
|
Chris@10
|
446 if (!ego->cld2)
|
Chris@10
|
447 goto nada;
|
Chris@10
|
448 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
|
Chris@10
|
449
|
Chris@10
|
450 if (n > nc) {
|
Chris@10
|
451 ego->cld3 = X(mkplan_d)(plnr,
|
Chris@10
|
452 X(mkproblem_rdft_0_d)(
|
Chris@10
|
453 X(mktensor_3d)(n-nc, m*vl, vl,
|
Chris@10
|
454 m, vl, n*vl,
|
Chris@10
|
455 vl, 1, 1),
|
Chris@10
|
456 buf + (m-mc)*(nc*vl), p->I + nc*vl));
|
Chris@10
|
457 if (!ego->cld3)
|
Chris@10
|
458 goto nada;
|
Chris@10
|
459 X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
|
Chris@10
|
460 }
|
Chris@10
|
461
|
Chris@10
|
462 /* memcpy/memmove operations */
|
Chris@10
|
463 ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
|
Chris@10
|
464 + (n-nc)*m + (m-mc)*nc);
|
Chris@10
|
465
|
Chris@10
|
466 X(ifree)(buf);
|
Chris@10
|
467 return 1;
|
Chris@10
|
468
|
Chris@10
|
469 nada:
|
Chris@10
|
470 X(ifree)(buf);
|
Chris@10
|
471 return 0;
|
Chris@10
|
472 }
|
Chris@10
|
473
|
Chris@10
|
474 static const transpose_adt adt_cut =
|
Chris@10
|
475 {
|
Chris@10
|
476 apply_cut, applicable_cut, mkcldrn_cut,
|
Chris@10
|
477 "rdft-transpose-cut"
|
Chris@10
|
478 };
|
Chris@10
|
479
|
Chris@10
|
480 /*************************************************************************/
|
Chris@10
|
481 /* In-place transpose routine from TOMS, which follows the cycles of
|
Chris@10
|
482 the permutation so that it writes to each location only once.
|
Chris@10
|
483 Because of cache-line and other issues, however, this routine is
|
Chris@10
|
484 typically much slower than transpose-gcd or transpose-cut, even
|
Chris@10
|
485 though the latter do some extra writes. On the other hand, if the
|
Chris@10
|
486 vector length is large then the TOMS routine is best.
|
Chris@10
|
487
|
Chris@10
|
488 The TOMS routine also has the advantage of requiring less buffer
|
Chris@10
|
489 space for the case of gcd(nx,ny) small. However, in this case it
|
Chris@10
|
490 has been superseded by the combination of the generalized
|
Chris@10
|
491 transpose-cut method with the transpose-gcd method, which can
|
Chris@10
|
492 always transpose with buffers a small fraction of the array size
|
Chris@10
|
493 regardless of gcd(nx,ny). */
|
Chris@10
|
494
|
Chris@10
|
495 /*
|
Chris@10
|
496 * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380).
|
Chris@10
|
497 *
|
Chris@10
|
498 * These routines do in-place transposes of arrays.
|
Chris@10
|
499 *
|
Chris@10
|
500 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software,
|
Chris@10
|
501 * vol. 3, no. 1, 104-110 (1977) ]
|
Chris@10
|
502 *
|
Chris@10
|
503 * C version by Steven G. Johnson (February 1997).
|
Chris@10
|
504 */
|
Chris@10
|
505
|
Chris@10
|
506 /*
|
Chris@10
|
507 * "a" is a 1D array of length ny*nx*N which constains the nx x ny
|
Chris@10
|
508 * matrix of N-tuples to be transposed. "a" is stored in row-major
|
Chris@10
|
509 * order (last index varies fastest). move is a 1D array of length
|
Chris@10
|
510 * move_size used to store information to speed up the process. The
|
Chris@10
|
511 * value move_size=(ny+nx)/2 is recommended. buf should be an array
|
Chris@10
|
512 * of length 2*N.
|
Chris@10
|
513 *
|
Chris@10
|
514 */
|
Chris@10
|
515
|
Chris@10
|
516 static void transpose_toms513(R *a, INT nx, INT ny, INT N,
|
Chris@10
|
517 char *move, INT move_size, R *buf)
|
Chris@10
|
518 {
|
Chris@10
|
519 INT i, im, mn;
|
Chris@10
|
520 R *b, *c, *d;
|
Chris@10
|
521 INT ncount;
|
Chris@10
|
522 INT k;
|
Chris@10
|
523
|
Chris@10
|
524 /* check arguments and initialize: */
|
Chris@10
|
525 A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
|
Chris@10
|
526
|
Chris@10
|
527 b = buf;
|
Chris@10
|
528
|
Chris@10
|
529 /* Cate & Twigg have a special case for nx == ny, but we don't
|
Chris@10
|
530 bother, since we already have special code for this case elsewhere. */
|
Chris@10
|
531
|
Chris@10
|
532 c = buf + N;
|
Chris@10
|
533 ncount = 2; /* always at least 2 fixed points */
|
Chris@10
|
534 k = (mn = ny * nx) - 1;
|
Chris@10
|
535
|
Chris@10
|
536 for (i = 0; i < move_size; ++i)
|
Chris@10
|
537 move[i] = 0;
|
Chris@10
|
538
|
Chris@10
|
539 if (ny >= 3 && nx >= 3)
|
Chris@10
|
540 ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */
|
Chris@10
|
541
|
Chris@10
|
542 i = 1;
|
Chris@10
|
543 im = ny;
|
Chris@10
|
544
|
Chris@10
|
545 while (1) {
|
Chris@10
|
546 INT i1, i2, i1c, i2c;
|
Chris@10
|
547 INT kmi;
|
Chris@10
|
548
|
Chris@10
|
549 /** Rearrange the elements of a loop
|
Chris@10
|
550 and its companion loop: **/
|
Chris@10
|
551
|
Chris@10
|
552 i1 = i;
|
Chris@10
|
553 kmi = k - i;
|
Chris@10
|
554 i1c = kmi;
|
Chris@10
|
555 switch (N) {
|
Chris@10
|
556 case 1:
|
Chris@10
|
557 b[0] = a[i1];
|
Chris@10
|
558 c[0] = a[i1c];
|
Chris@10
|
559 break;
|
Chris@10
|
560 case 2:
|
Chris@10
|
561 b[0] = a[2*i1];
|
Chris@10
|
562 b[1] = a[2*i1+1];
|
Chris@10
|
563 c[0] = a[2*i1c];
|
Chris@10
|
564 c[1] = a[2*i1c+1];
|
Chris@10
|
565 break;
|
Chris@10
|
566 default:
|
Chris@10
|
567 memcpy(b, &a[N * i1], N * sizeof(R));
|
Chris@10
|
568 memcpy(c, &a[N * i1c], N * sizeof(R));
|
Chris@10
|
569 }
|
Chris@10
|
570 while (1) {
|
Chris@10
|
571 i2 = ny * i1 - k * (i1 / nx);
|
Chris@10
|
572 i2c = k - i2;
|
Chris@10
|
573 if (i1 < move_size)
|
Chris@10
|
574 move[i1] = 1;
|
Chris@10
|
575 if (i1c < move_size)
|
Chris@10
|
576 move[i1c] = 1;
|
Chris@10
|
577 ncount += 2;
|
Chris@10
|
578 if (i2 == i)
|
Chris@10
|
579 break;
|
Chris@10
|
580 if (i2 == kmi) {
|
Chris@10
|
581 d = b;
|
Chris@10
|
582 b = c;
|
Chris@10
|
583 c = d;
|
Chris@10
|
584 break;
|
Chris@10
|
585 }
|
Chris@10
|
586 switch (N) {
|
Chris@10
|
587 case 1:
|
Chris@10
|
588 a[i1] = a[i2];
|
Chris@10
|
589 a[i1c] = a[i2c];
|
Chris@10
|
590 break;
|
Chris@10
|
591 case 2:
|
Chris@10
|
592 a[2*i1] = a[2*i2];
|
Chris@10
|
593 a[2*i1+1] = a[2*i2+1];
|
Chris@10
|
594 a[2*i1c] = a[2*i2c];
|
Chris@10
|
595 a[2*i1c+1] = a[2*i2c+1];
|
Chris@10
|
596 break;
|
Chris@10
|
597 default:
|
Chris@10
|
598 memcpy(&a[N * i1], &a[N * i2],
|
Chris@10
|
599 N * sizeof(R));
|
Chris@10
|
600 memcpy(&a[N * i1c], &a[N * i2c],
|
Chris@10
|
601 N * sizeof(R));
|
Chris@10
|
602 }
|
Chris@10
|
603 i1 = i2;
|
Chris@10
|
604 i1c = i2c;
|
Chris@10
|
605 }
|
Chris@10
|
606 switch (N) {
|
Chris@10
|
607 case 1:
|
Chris@10
|
608 a[i1] = b[0];
|
Chris@10
|
609 a[i1c] = c[0];
|
Chris@10
|
610 break;
|
Chris@10
|
611 case 2:
|
Chris@10
|
612 a[2*i1] = b[0];
|
Chris@10
|
613 a[2*i1+1] = b[1];
|
Chris@10
|
614 a[2*i1c] = c[0];
|
Chris@10
|
615 a[2*i1c+1] = c[1];
|
Chris@10
|
616 break;
|
Chris@10
|
617 default:
|
Chris@10
|
618 memcpy(&a[N * i1], b, N * sizeof(R));
|
Chris@10
|
619 memcpy(&a[N * i1c], c, N * sizeof(R));
|
Chris@10
|
620 }
|
Chris@10
|
621 if (ncount >= mn)
|
Chris@10
|
622 break; /* we've moved all elements */
|
Chris@10
|
623
|
Chris@10
|
624 /** Search for loops to rearrange: **/
|
Chris@10
|
625
|
Chris@10
|
626 while (1) {
|
Chris@10
|
627 INT max = k - i;
|
Chris@10
|
628 ++i;
|
Chris@10
|
629 A(i <= max);
|
Chris@10
|
630 im += ny;
|
Chris@10
|
631 if (im > k)
|
Chris@10
|
632 im -= k;
|
Chris@10
|
633 i2 = im;
|
Chris@10
|
634 if (i == i2)
|
Chris@10
|
635 continue;
|
Chris@10
|
636 if (i >= move_size) {
|
Chris@10
|
637 while (i2 > i && i2 < max) {
|
Chris@10
|
638 i1 = i2;
|
Chris@10
|
639 i2 = ny * i1 - k * (i1 / nx);
|
Chris@10
|
640 }
|
Chris@10
|
641 if (i2 == i)
|
Chris@10
|
642 break;
|
Chris@10
|
643 } else if (!move[i])
|
Chris@10
|
644 break;
|
Chris@10
|
645 }
|
Chris@10
|
646 }
|
Chris@10
|
647 }
|
Chris@10
|
648
|
Chris@10
|
649 static void apply_toms513(const plan *ego_, R *I, R *O)
|
Chris@10
|
650 {
|
Chris@10
|
651 const P *ego = (const P *) ego_;
|
Chris@10
|
652 INT n = ego->n, m = ego->m;
|
Chris@10
|
653 INT vl = ego->vl;
|
Chris@10
|
654 R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
|
Chris@10
|
655 UNUSED(O);
|
Chris@10
|
656 transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
|
Chris@10
|
657 X(ifree)(buf);
|
Chris@10
|
658 }
|
Chris@10
|
659
|
Chris@10
|
660 static int applicable_toms513(const problem_rdft *p, planner *plnr,
|
Chris@10
|
661 int dim0, int dim1, int dim2, INT *nbuf)
|
Chris@10
|
662 {
|
Chris@10
|
663 INT n = p->vecsz->dims[dim0].n;
|
Chris@10
|
664 INT m = p->vecsz->dims[dim1].n;
|
Chris@10
|
665 INT vl, vs;
|
Chris@10
|
666 get_transpose_vec(p, dim2, &vl, &vs);
|
Chris@10
|
667 *nbuf = 2*vl
|
Chris@10
|
668 + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
|
Chris@10
|
669 return (!NO_SLOWP(plnr)
|
Chris@10
|
670 && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
|
Chris@10
|
671 && n != m
|
Chris@10
|
672 && Ntuple_transposable(p->vecsz->dims + dim0,
|
Chris@10
|
673 p->vecsz->dims + dim1,
|
Chris@10
|
674 vl, vs));
|
Chris@10
|
675 }
|
Chris@10
|
676
|
Chris@10
|
677 static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
|
Chris@10
|
678 {
|
Chris@10
|
679 UNUSED(p); UNUSED(plnr);
|
Chris@10
|
680 /* heuristic so that TOMS algorithm is last resort for small vl */
|
Chris@10
|
681 ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
|
Chris@10
|
682 return 1;
|
Chris@10
|
683 }
|
Chris@10
|
684
|
Chris@10
|
685 static const transpose_adt adt_toms513 =
|
Chris@10
|
686 {
|
Chris@10
|
687 apply_toms513, applicable_toms513, mkcldrn_toms513,
|
Chris@10
|
688 "rdft-transpose-toms513"
|
Chris@10
|
689 };
|
Chris@10
|
690
|
Chris@10
|
691 /*-----------------------------------------------------------------------*/
|
Chris@10
|
692 /*-----------------------------------------------------------------------*/
|
Chris@10
|
693 /* generic stuff: */
|
Chris@10
|
694
|
Chris@10
|
695 static void awake(plan *ego_, enum wakefulness wakefulness)
|
Chris@10
|
696 {
|
Chris@10
|
697 P *ego = (P *) ego_;
|
Chris@10
|
698 X(plan_awake)(ego->cld1, wakefulness);
|
Chris@10
|
699 X(plan_awake)(ego->cld2, wakefulness);
|
Chris@10
|
700 X(plan_awake)(ego->cld3, wakefulness);
|
Chris@10
|
701 }
|
Chris@10
|
702
|
Chris@10
|
703 static void print(const plan *ego_, printer *p)
|
Chris@10
|
704 {
|
Chris@10
|
705 const P *ego = (const P *) ego_;
|
Chris@10
|
706 p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
|
Chris@10
|
707 ego->n, ego->m, ego->vl);
|
Chris@10
|
708 if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
|
Chris@10
|
709 if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
|
Chris@10
|
710 if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
|
Chris@10
|
711 p->print(p, ")");
|
Chris@10
|
712 }
|
Chris@10
|
713
|
Chris@10
|
714 static void destroy(plan *ego_)
|
Chris@10
|
715 {
|
Chris@10
|
716 P *ego = (P *) ego_;
|
Chris@10
|
717 X(plan_destroy_internal)(ego->cld3);
|
Chris@10
|
718 X(plan_destroy_internal)(ego->cld2);
|
Chris@10
|
719 X(plan_destroy_internal)(ego->cld1);
|
Chris@10
|
720 }
|
Chris@10
|
721
|
Chris@10
|
722 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
Chris@10
|
723 {
|
Chris@10
|
724 const S *ego = (const S *) ego_;
|
Chris@10
|
725 const problem_rdft *p;
|
Chris@10
|
726 int dim0, dim1, dim2;
|
Chris@10
|
727 INT nbuf, vs;
|
Chris@10
|
728 P *pln;
|
Chris@10
|
729
|
Chris@10
|
730 static const plan_adt padt = {
|
Chris@10
|
731 X(rdft_solve), awake, print, destroy
|
Chris@10
|
732 };
|
Chris@10
|
733
|
Chris@10
|
734 if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
|
Chris@10
|
735 return (plan *) 0;
|
Chris@10
|
736
|
Chris@10
|
737 p = (const problem_rdft *) p_;
|
Chris@10
|
738 pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
|
Chris@10
|
739
|
Chris@10
|
740 pln->n = p->vecsz->dims[dim0].n;
|
Chris@10
|
741 pln->m = p->vecsz->dims[dim1].n;
|
Chris@10
|
742 get_transpose_vec(p, dim2, &pln->vl, &vs);
|
Chris@10
|
743 pln->nbuf = nbuf;
|
Chris@10
|
744 pln->d = gcd(pln->n, pln->m);
|
Chris@10
|
745 pln->nd = pln->n / pln->d;
|
Chris@10
|
746 pln->md = pln->m / pln->d;
|
Chris@10
|
747 pln->slv = ego;
|
Chris@10
|
748
|
Chris@10
|
749 X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
|
Chris@10
|
750
|
Chris@10
|
751 pln->cld1 = pln->cld2 = pln->cld3 = 0;
|
Chris@10
|
752 if (!ego->adt->mkcldrn(p, plnr, pln)) {
|
Chris@10
|
753 X(plan_destroy_internal)(&(pln->super.super));
|
Chris@10
|
754 return 0;
|
Chris@10
|
755 }
|
Chris@10
|
756
|
Chris@10
|
757 return &(pln->super.super);
|
Chris@10
|
758 }
|
Chris@10
|
759
|
Chris@10
|
760 static solver *mksolver(const transpose_adt *adt)
|
Chris@10
|
761 {
|
Chris@10
|
762 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
|
Chris@10
|
763 S *slv = MKSOLVER(S, &sadt);
|
Chris@10
|
764 slv->adt = adt;
|
Chris@10
|
765 return &(slv->super);
|
Chris@10
|
766 }
|
Chris@10
|
767
|
Chris@10
|
768 void X(rdft_vrank3_transpose_register)(planner *p)
|
Chris@10
|
769 {
|
Chris@10
|
770 unsigned i;
|
Chris@10
|
771 static const transpose_adt *const adts[] = {
|
Chris@10
|
772 &adt_gcd, &adt_cut,
|
Chris@10
|
773 &adt_toms513
|
Chris@10
|
774 };
|
Chris@10
|
775 for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
|
Chris@10
|
776 REGISTER_SOLVER(p, mksolver(adts[i]));
|
Chris@10
|
777 }
|