sv-dependency-builds: src/fftw-3.3.8/rdft/vrank3-transpose.c annotate

annotate src/fftw-3.3.8/rdft/vrank3-transpose.c @ 169:223a55898ab9 tip default

Add null config files

author	Chris Cannam <cannam@all-day-breakfast.com>
date	Mon, 02 Mar 2020 14:03:47 +0000
parents	bd3cc4d1df30
children

rev	line source
cannam@167	1 /*
cannam@167	2 * Copyright (c) 2003, 2007-14 Matteo Frigo
cannam@167	3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
cannam@167	4 *
cannam@167	5 * This program is free software; you can redistribute it and/or modify
cannam@167	6 * it under the terms of the GNU General Public License as published by
cannam@167	7 * the Free Software Foundation; either version 2 of the License, or
cannam@167	8 * (at your option) any later version.
cannam@167	9 *
cannam@167	10 * This program is distributed in the hope that it will be useful,
cannam@167	11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
cannam@167	12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cannam@167	13 * GNU General Public License for more details.
cannam@167	14 *
cannam@167	15 * You should have received a copy of the GNU General Public License
cannam@167	16 * along with this program; if not, write to the Free Software
cannam@167	17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
cannam@167	18 *
cannam@167	19 */
cannam@167	20
cannam@167	21
cannam@167	22 /* rank-0, vector-rank-3, non-square in-place transposition
cannam@167	23 (see rank0.c for square transposition) */
cannam@167	24
cannam@167	25 #include "rdft/rdft.h"
cannam@167	26
cannam@167	27 #ifdef HAVE_STRING_H
cannam@167	28 #include <string.h> /* for memcpy() */
cannam@167	29 #endif
cannam@167	30
cannam@167	31 struct P_s;
cannam@167	32
cannam@167	33 typedef struct {
cannam@167	34 rdftapply apply;
cannam@167	35 int (applicable)(const problem_rdft p, planner *plnr,
cannam@167	36 int dim0, int dim1, int dim2, INT *nbuf);
cannam@167	37 int (mkcldrn)(const problem_rdft p, planner plnr, struct P_s ego);
cannam@167	38 const char *nam;
cannam@167	39 } transpose_adt;
cannam@167	40
cannam@167	41 typedef struct {
cannam@167	42 solver super;
cannam@167	43 const transpose_adt *adt;
cannam@167	44 } S;
cannam@167	45
cannam@167	46 typedef struct P_s {
cannam@167	47 plan_rdft super;
cannam@167	48 INT n, m, vl; /* transpose n x m matrix of vl-tuples */
cannam@167	49 INT nbuf; /* buffer size */
cannam@167	50 INT nd, md, d; /* transpose-gcd params */
cannam@167	51 INT nc, mc; /* transpose-cut params */
cannam@167	52 plan cld1, cld2, cld3; / children, null if unused */
cannam@167	53 const S *slv;
cannam@167	54 } P;
cannam@167	55
cannam@167	56
cannam@167	57 /*************************************************************************/
cannam@167	58 /* some utilities for the solvers */
cannam@167	59
cannam@167	60 static INT gcd(INT a, INT b)
cannam@167	61 {
cannam@167	62 INT r;
cannam@167	63 do {
cannam@167	64 r = a % b;
cannam@167	65 a = b;
cannam@167	66 b = r;
cannam@167	67 } while (r != 0);
cannam@167	68
cannam@167	69 return a;
cannam@167	70 }
cannam@167	71
cannam@167	72 /* whether we can transpose with one of our routines expecting
cannam@167	73 contiguous Ntuples */
cannam@167	74 static int Ntuple_transposable(const iodim a, const iodim b, INT vl, INT vs)
cannam@167	75 {
cannam@167	76 return (vs == 1 && b->is == vl && a->os == vl &&
cannam@167	77 ((a->n == b->n && a->is == b->os
cannam@167	78 && a->is >= b->n && a->is % vl == 0)
cannam@167	79 \|\| (a->is == b->n * vl && b->os == a->n * vl)));
cannam@167	80 }
cannam@167	81
cannam@167	82 /* check whether a and b correspond to the first and second dimensions
cannam@167	83 of a transpose of tuples with vector length = vl, stride = vs. */
cannam@167	84 static int transposable(const iodim a, const iodim b, INT vl, INT vs)
cannam@167	85 {
cannam@167	86 return ((a->n == b->n && a->os == b->is && a->is == b->os)
cannam@167	87 \|\| Ntuple_transposable(a, b, vl, vs));
cannam@167	88 }
cannam@167	89
cannam@167	90 static int pickdim(const tensor s, int pdim0, int pdim1, int pdim2)
cannam@167	91 {
cannam@167	92 int dim0, dim1;
cannam@167	93
cannam@167	94 for (dim0 = 0; dim0 < s->rnk; ++dim0)
cannam@167	95 for (dim1 = 0; dim1 < s->rnk; ++dim1) {
cannam@167	96 int dim2 = 3 - dim0 - dim1;
cannam@167	97 if (dim0 == dim1) continue;
cannam@167	98 if ((s->rnk == 2 \|\| s->dims[dim2].is == s->dims[dim2].os)
cannam@167	99 && transposable(s->dims + dim0, s->dims + dim1,
cannam@167	100 s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
cannam@167	101 s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
cannam@167	102 *pdim0 = dim0;
cannam@167	103 *pdim1 = dim1;
cannam@167	104 *pdim2 = dim2;
cannam@167	105 return 1;
cannam@167	106 }
cannam@167	107 }
cannam@167	108 return 0;
cannam@167	109 }
cannam@167	110
cannam@167	111 #define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
cannam@167	112 #define MAXBUF 65536 /* maximum non-ugly buffer */
cannam@167	113
cannam@167	114 /* generic applicability function */
cannam@167	115 static int applicable(const solver ego_, const problem p_, planner *plnr,
cannam@167	116 int dim0, int dim1, int dim2, INT nbuf)
cannam@167	117 {
cannam@167	118 const S ego = (const S ) ego_;
cannam@167	119 const problem_rdft p = (const problem_rdft ) p_;
cannam@167	120
cannam@167	121 return (1
cannam@167	122 && p->I == p->O
cannam@167	123 && p->sz->rnk == 0
cannam@167	124 && (p->vecsz->rnk == 2 \|\| p->vecsz->rnk == 3)
cannam@167	125
cannam@167	126 && pickdim(p->vecsz, dim0, dim1, dim2)
cannam@167	127
cannam@167	128 /* UGLY if vecloop in wrong order for locality */
cannam@167	129 && (!NO_UGLYP(plnr) \|\|
cannam@167	130 p->vecsz->rnk == 2 \|\|
cannam@167	131 X(iabs)(p->vecsz->dims[*dim2].is)
cannam@167	132 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
cannam@167	133 X(iabs)(p->vecsz->dims[*dim0].os)))
cannam@167	134
cannam@167	135 /* SLOW if non-square */
cannam@167	136 && (!NO_SLOWP(plnr)
cannam@167	137 \|\| p->vecsz->dims[dim0].n == p->vecsz->dims[dim1].n)
cannam@167	138
cannam@167	139 && ego->adt->applicable(p, plnr, dim0,dim1,*dim2,nbuf)
cannam@167	140
cannam@167	141 /* buffers too big are UGLY */
cannam@167	142 && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
cannam@167	143 \|\| *nbuf <= MAXBUF
cannam@167	144 \|\| nbuf MINBUFDIV <= X(tensor_sz)(p->vecsz))
cannam@167	145 );
cannam@167	146 }
cannam@167	147
cannam@167	148 static void get_transpose_vec(const problem_rdft p, int dim2, INT vl,INT *vs)
cannam@167	149 {
cannam@167	150 if (p->vecsz->rnk == 2) {
cannam@167	151 vl = 1; vs = 1;
cannam@167	152 }
cannam@167	153 else {
cannam@167	154 *vl = p->vecsz->dims[dim2].n;
cannam@167	155 vs = p->vecsz->dims[dim2].is; / == os */
cannam@167	156 }
cannam@167	157 }
cannam@167	158
cannam@167	159 /*************************************************************************/
cannam@167	160 /* Cache-oblivious in-place transpose of non-square matrices, based
cannam@167	161 on transposes of blocks given by the gcd of the dimensions.
cannam@167	162
cannam@167	163 This algorithm is related to algorithm V5 from Murray Dow,
cannam@167	164 "Transposing a matrix on a vector computer," Parallel Computing 21
cannam@167	165 (12), 1997-2005 (1995), with the modification that we use
cannam@167	166 cache-oblivious recursive transpose subroutines (and we derived
cannam@167	167 it independently).
cannam@167	168
cannam@167	169 For a p x q matrix, this requires scratch space equal to the size
cannam@167	170 of the matrix divided by gcd(p,q). Alternatively, see also the
cannam@167	171 "cut" algorithm below, if \|p-q\| * gcd(p,q) < max(p,q). */
cannam@167	172
cannam@167	173 static void apply_gcd(const plan ego_, R I, R *O)
cannam@167	174 {
cannam@167	175 const P ego = (const P ) ego_;
cannam@167	176 INT n = ego->nd, m = ego->md, d = ego->d;
cannam@167	177 INT vl = ego->vl;
cannam@167	178 R buf = (R )MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@167	179 INT i, num_el = nmd*vl;
cannam@167	180
cannam@167	181 A(ego->n == n * d && ego->m == m * d);
cannam@167	182 UNUSED(O);
cannam@167	183
cannam@167	184 /* Transpose the matrix I in-place, where I is an (nd) x (md) matrix
cannam@167	185 of vl-tuples and buf contains nmd*vl elements.
cannam@167	186
cannam@167	187 In general, to transpose a p x q matrix, you should call this
cannam@167	188 routine with d = gcd(p, q), n = p/d, and m = q/d. */
cannam@167	189
cannam@167	190 A(n > 0 && m > 0 && vl > 0);
cannam@167	191 A(d > 1);
cannam@167	192
cannam@167	193 /* treat as (d x n) x (d' x m) matrix. (d' = d) */
cannam@167	194
cannam@167	195 /* First, transpose d x (n x d') x m to d x (d' x n) x m,
cannam@167	196 using the buf matrix. This consists of d transposes
cannam@167	197 of contiguous n x d' matrices of m-tuples. */
cannam@167	198 if (n > 1) {
cannam@167	199 rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
cannam@167	200 for (i = 0; i < d; ++i) {
cannam@167	201 cldapply(ego->cld1, I + i*num_el, buf);
cannam@167	202 memcpy(I + inum_el, buf, num_elsizeof(R));
cannam@167	203 }
cannam@167	204 }
cannam@167	205
cannam@167	206 /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
cannam@167	207 is a square in-place transpose of nm-tuples: /
cannam@167	208 {
cannam@167	209 rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
cannam@167	210 cldapply(ego->cld2, I, I);
cannam@167	211 }
cannam@167	212
cannam@167	213 /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
cannam@167	214 using the buf matrix. This consists of d' transposes
cannam@167	215 of contiguous dn x m matrices. /
cannam@167	216 if (m > 1) {
cannam@167	217 rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
cannam@167	218 for (i = 0; i < d; ++i) {
cannam@167	219 cldapply(ego->cld3, I + i*num_el, buf);
cannam@167	220 memcpy(I + inum_el, buf, num_elsizeof(R));
cannam@167	221 }
cannam@167	222 }
cannam@167	223
cannam@167	224 X(ifree)(buf);
cannam@167	225 }
cannam@167	226
cannam@167	227 static int applicable_gcd(const problem_rdft p, planner plnr,
cannam@167	228 int dim0, int dim1, int dim2, INT *nbuf)
cannam@167	229 {
cannam@167	230 INT n = p->vecsz->dims[dim0].n;
cannam@167	231 INT m = p->vecsz->dims[dim1].n;
cannam@167	232 INT d, vl, vs;
cannam@167	233 get_transpose_vec(p, dim2, &vl, &vs);
cannam@167	234 d = gcd(n, m);
cannam@167	235 nbuf = n (m / d) * vl;
cannam@167	236 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
cannam@167	237 && n != m
cannam@167	238 && d > 1
cannam@167	239 && Ntuple_transposable(p->vecsz->dims + dim0,
cannam@167	240 p->vecsz->dims + dim1,
cannam@167	241 vl, vs));
cannam@167	242 }
cannam@167	243
cannam@167	244 static int mkcldrn_gcd(const problem_rdft p, planner plnr, P *ego)
cannam@167	245 {
cannam@167	246 INT n = ego->nd, m = ego->md, d = ego->d;
cannam@167	247 INT vl = ego->vl;
cannam@167	248 R buf = (R )MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@167	249 INT num_el = nmd*vl;
cannam@167	250
cannam@167	251 if (n > 1) {
cannam@167	252 ego->cld1 = X(mkplan_d)(plnr,
cannam@167	253 X(mkproblem_rdft_0_d)(
cannam@167	254 X(mktensor_3d)(n, dmvl, m*vl,
cannam@167	255 d, mvl, nm*vl,
cannam@167	256 m*vl, 1, 1),
cannam@167	257 TAINT(p->I, num_el), buf));
cannam@167	258 if (!ego->cld1)
cannam@167	259 goto nada;
cannam@167	260 X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
cannam@167	261 &ego->super.super.ops);
cannam@167	262 ego->super.super.ops.other += num_el * d * 2;
cannam@167	263 }
cannam@167	264
cannam@167	265 ego->cld2 = X(mkplan_d)(plnr,
cannam@167	266 X(mkproblem_rdft_0_d)(
cannam@167	267 X(mktensor_3d)(d, dnmvl, nm*vl,
cannam@167	268 d, nmvl, dnm*vl,
cannam@167	269 nmvl, 1, 1),
cannam@167	270 p->I, p->I));
cannam@167	271 if (!ego->cld2)
cannam@167	272 goto nada;
cannam@167	273 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
cannam@167	274
cannam@167	275 if (m > 1) {
cannam@167	276 ego->cld3 = X(mkplan_d)(plnr,
cannam@167	277 X(mkproblem_rdft_0_d)(
cannam@167	278 X(mktensor_3d)(dn, mvl, vl,
cannam@167	279 m, vl, dnvl,
cannam@167	280 vl, 1, 1),
cannam@167	281 TAINT(p->I, num_el), buf));
cannam@167	282 if (!ego->cld3)
cannam@167	283 goto nada;
cannam@167	284 X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
cannam@167	285 ego->super.super.ops.other += num_el * d * 2;
cannam@167	286 }
cannam@167	287
cannam@167	288 X(ifree)(buf);
cannam@167	289 return 1;
cannam@167	290
cannam@167	291 nada:
cannam@167	292 X(ifree)(buf);
cannam@167	293 return 0;
cannam@167	294 }
cannam@167	295
cannam@167	296 static const transpose_adt adt_gcd =
cannam@167	297 {
cannam@167	298 apply_gcd, applicable_gcd, mkcldrn_gcd,
cannam@167	299 "rdft-transpose-gcd"
cannam@167	300 };
cannam@167	301
cannam@167	302 /*************************************************************************/
cannam@167	303 /* Cache-oblivious in-place transpose of non-square n x m matrices,
cannam@167	304 based on transposing a sub-matrix first and then transposing the
cannam@167	305 remainder(s) with the help of a buffer. See also transpose-gcd,
cannam@167	306 above, if gcd(n,m) is large.
cannam@167	307
cannam@167	308 This algorithm is related to algorithm V3 from Murray Dow,
cannam@167	309 "Transposing a matrix on a vector computer," Parallel Computing 21
cannam@167	310 (12), 1997-2005 (1995), with the modifications that we use
cannam@167	311 cache-oblivious recursive transpose subroutines and we have the
cannam@167	312 generalization for large \|n-m\| below.
cannam@167	313
cannam@167	314 The best case, and the one described by Dow, is for \|n-m\| small, in
cannam@167	315 which case we transpose a square sub-matrix of size min(n,m),
cannam@167	316 handling the remainder via a buffer. This requires scratch space
cannam@167	317 equal to the size of the matrix times \|n-m\| / max(n,m).
cannam@167	318
cannam@167	319 As a generalization when \|n-m\| is not small, we also support cutting
cannam@167	320 both dimensions to an nc x mc matrix which is not necessarily
cannam@167	321 square, but has a large gcd (and can therefore use transpose-gcd).
cannam@167	322 */
cannam@167	323
cannam@167	324 static void apply_cut(const plan ego_, R I, R *O)
cannam@167	325 {
cannam@167	326 const P ego = (const P ) ego_;
cannam@167	327 INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
cannam@167	328 INT i;
cannam@167	329 R buf1 = (R )MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@167	330 UNUSED(O);
cannam@167	331
cannam@167	332 if (m > mc) {
cannam@167	333 ((plan_rdft ) ego->cld1)->apply(ego->cld1, I + mcvl, buf1);
cannam@167	334 for (i = 0; i < nc; ++i)
cannam@167	335 memmove(I + (mcvl) i, I + (mvl) i, sizeof(R) * (mc*vl));
cannam@167	336 }
cannam@167	337
cannam@167	338 ((plan_rdft ) ego->cld2)->apply(ego->cld2, I, I); / nc x mc transpose */
cannam@167	339
cannam@167	340 if (n > nc) {
cannam@167	341 R buf2 = buf1 + (m-mc)(ncvl); / FIXME: force better alignment? */
cannam@167	342 memcpy(buf2, I + nc(mvl), (n-nc)(mvl)*sizeof(R));
cannam@167	343 for (i = mc-1; i >= 0; --i)
cannam@167	344 memmove(I + (nvl) i, I + (ncvl) i, sizeof(R) * (n*vl));
cannam@167	345 ((plan_rdft ) ego->cld3)->apply(ego->cld3, buf2, I + ncvl);
cannam@167	346 }
cannam@167	347
cannam@167	348 if (m > mc) {
cannam@167	349 if (n > nc)
cannam@167	350 for (i = mc; i < m; ++i)
cannam@167	351 memcpy(I + i(nvl), buf1 + (i-mc)(ncvl),
cannam@167	352 (ncvl)sizeof(R));
cannam@167	353 else
cannam@167	354 memcpy(I + mc(nvl), buf1, (m-mc)(nvl)*sizeof(R));
cannam@167	355 }
cannam@167	356
cannam@167	357 X(ifree)(buf1);
cannam@167	358 }
cannam@167	359
cannam@167	360 /* only cut one dimension if the resulting buffer is small enough */
cannam@167	361 static int cut1(INT n, INT m, INT vl)
cannam@167	362 {
cannam@167	363 return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
cannam@167	364 \|\| X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
cannam@167	365 }
cannam@167	366
cannam@167	367 #define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
cannam@167	368
cannam@167	369 static int applicable_cut(const problem_rdft p, planner plnr,
cannam@167	370 int dim0, int dim1, int dim2, INT *nbuf)
cannam@167	371 {
cannam@167	372 INT n = p->vecsz->dims[dim0].n;
cannam@167	373 INT m = p->vecsz->dims[dim1].n;
cannam@167	374 INT vl, vs;
cannam@167	375 get_transpose_vec(p, dim2, &vl, &vs);
cannam@167	376 nbuf = 0; / always small enough to be non-UGLY (?) */
cannam@167	377 A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
cannam@167	378 return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
cannam@167	379 && n != m
cannam@167	380
cannam@167	381 /* Don't call transpose-cut recursively (avoid inf. loops):
cannam@167	382 the non-square sub-transpose produced when !cut1
cannam@167	383 should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
cannam@167	384 for which transpose-gcd is applicable */
cannam@167	385 && (cut1(n, m, vl)
cannam@167	386 \|\| gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
cannam@167	387
cannam@167	388 && Ntuple_transposable(p->vecsz->dims + dim0,
cannam@167	389 p->vecsz->dims + dim1,
cannam@167	390 vl, vs));
cannam@167	391 }
cannam@167	392
cannam@167	393 static int mkcldrn_cut(const problem_rdft p, planner plnr, P *ego)
cannam@167	394 {
cannam@167	395 INT n = ego->n, m = ego->m, nc, mc;
cannam@167	396 INT vl = ego->vl;
cannam@167	397 R *buf;
cannam@167	398
cannam@167	399 /* pick the "best" cut */
cannam@167	400 if (cut1(n, m, vl)) {
cannam@167	401 nc = mc = X(imin)(n,m);
cannam@167	402 }
cannam@167	403 else {
cannam@167	404 INT dc, ns, ms;
cannam@167	405 dc = gcd(m, n); nc = n; mc = m;
cannam@167	406 /* search for cut with largest gcd
cannam@167	407 (TODO: different optimality criteria? different search range?) */
cannam@167	408 for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
cannam@167	409 for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
cannam@167	410 INT ds = gcd(ms, ns);
cannam@167	411 if (ds > dc) {
cannam@167	412 dc = ds; nc = ns; mc = ms;
cannam@167	413 if (dc == X(imin)(ns, ms))
cannam@167	414 break; /* cannot get larger than this */
cannam@167	415 }
cannam@167	416 }
cannam@167	417 if (dc == X(imin)(n, ms))
cannam@167	418 break; /* cannot get larger than this */
cannam@167	419 }
cannam@167	420 A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
cannam@167	421 }
cannam@167	422 ego->nc = nc;
cannam@167	423 ego->mc = mc;
cannam@167	424 ego->nbuf = (m-mc)(ncvl) + (n-nc)(mvl);
cannam@167	425
cannam@167	426 buf = (R )MALLOC(sizeof(R) ego->nbuf, BUFFERS);
cannam@167	427
cannam@167	428 if (m > mc) {
cannam@167	429 ego->cld1 = X(mkplan_d)(plnr,
cannam@167	430 X(mkproblem_rdft_0_d)(
cannam@167	431 X(mktensor_3d)(nc, m*vl, vl,
cannam@167	432 m-mc, vl, nc*vl,
cannam@167	433 vl, 1, 1),
cannam@167	434 p->I + mc*vl, buf));
cannam@167	435 if (!ego->cld1)
cannam@167	436 goto nada;
cannam@167	437 X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
cannam@167	438 }
cannam@167	439
cannam@167	440 ego->cld2 = X(mkplan_d)(plnr,
cannam@167	441 X(mkproblem_rdft_0_d)(
cannam@167	442 X(mktensor_3d)(nc, mc*vl, vl,
cannam@167	443 mc, vl, nc*vl,
cannam@167	444 vl, 1, 1),
cannam@167	445 p->I, p->I));
cannam@167	446 if (!ego->cld2)
cannam@167	447 goto nada;
cannam@167	448 X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
cannam@167	449
cannam@167	450 if (n > nc) {
cannam@167	451 ego->cld3 = X(mkplan_d)(plnr,
cannam@167	452 X(mkproblem_rdft_0_d)(
cannam@167	453 X(mktensor_3d)(n-nc, m*vl, vl,
cannam@167	454 m, vl, n*vl,
cannam@167	455 vl, 1, 1),
cannam@167	456 buf + (m-mc)(ncvl), p->I + nc*vl));
cannam@167	457 if (!ego->cld3)
cannam@167	458 goto nada;
cannam@167	459 X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
cannam@167	460 }
cannam@167	461
cannam@167	462 /* memcpy/memmove operations */
cannam@167	463 ego->super.super.ops.other += 2 * vl * (ncmc ((m > mc) + (n > nc))
cannam@167	464 + (n-nc)m + (m-mc)nc);
cannam@167	465
cannam@167	466 X(ifree)(buf);
cannam@167	467 return 1;
cannam@167	468
cannam@167	469 nada:
cannam@167	470 X(ifree)(buf);
cannam@167	471 return 0;
cannam@167	472 }
cannam@167	473
cannam@167	474 static const transpose_adt adt_cut =
cannam@167	475 {
cannam@167	476 apply_cut, applicable_cut, mkcldrn_cut,
cannam@167	477 "rdft-transpose-cut"
cannam@167	478 };
cannam@167	479
cannam@167	480 /*************************************************************************/
cannam@167	481 /* In-place transpose routine from TOMS, which follows the cycles of
cannam@167	482 the permutation so that it writes to each location only once.
cannam@167	483 Because of cache-line and other issues, however, this routine is
cannam@167	484 typically much slower than transpose-gcd or transpose-cut, even
cannam@167	485 though the latter do some extra writes. On the other hand, if the
cannam@167	486 vector length is large then the TOMS routine is best.
cannam@167	487
cannam@167	488 The TOMS routine also has the advantage of requiring less buffer
cannam@167	489 space for the case of gcd(nx,ny) small. However, in this case it
cannam@167	490 has been superseded by the combination of the generalized
cannam@167	491 transpose-cut method with the transpose-gcd method, which can
cannam@167	492 always transpose with buffers a small fraction of the array size
cannam@167	493 regardless of gcd(nx,ny). */
cannam@167	494
cannam@167	495 /*
cannam@167	496 * TOMS Transpose. Algorithm 513 (Revised version of algorithm 380).
cannam@167	497 *
cannam@167	498 * These routines do in-place transposes of arrays.
cannam@167	499 *
cannam@167	500 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software,
cannam@167	501 * vol. 3, no. 1, 104-110 (1977) ]
cannam@167	502 *
cannam@167	503 * C version by Steven G. Johnson (February 1997).
cannam@167	504 */
cannam@167	505
cannam@167	506 /*
cannam@167	507 * "a" is a 1D array of length nynxN which constains the nx x ny
cannam@167	508 * matrix of N-tuples to be transposed. "a" is stored in row-major
cannam@167	509 * order (last index varies fastest). move is a 1D array of length
cannam@167	510 * move_size used to store information to speed up the process. The
cannam@167	511 * value move_size=(ny+nx)/2 is recommended. buf should be an array
cannam@167	512 * of length 2*N.
cannam@167	513 *
cannam@167	514 */
cannam@167	515
cannam@167	516 static void transpose_toms513(R *a, INT nx, INT ny, INT N,
cannam@167	517 char move, INT move_size, R buf)
cannam@167	518 {
cannam@167	519 INT i, im, mn;
cannam@167	520 R b, c, *d;
cannam@167	521 INT ncount;
cannam@167	522 INT k;
cannam@167	523
cannam@167	524 /* check arguments and initialize: */
cannam@167	525 A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
cannam@167	526
cannam@167	527 b = buf;
cannam@167	528
cannam@167	529 /* Cate & Twigg have a special case for nx == ny, but we don't
cannam@167	530 bother, since we already have special code for this case elsewhere. */
cannam@167	531
cannam@167	532 c = buf + N;
cannam@167	533 ncount = 2; /* always at least 2 fixed points */
cannam@167	534 k = (mn = ny * nx) - 1;
cannam@167	535
cannam@167	536 for (i = 0; i < move_size; ++i)
cannam@167	537 move[i] = 0;
cannam@167	538
cannam@167	539 if (ny >= 3 && nx >= 3)
cannam@167	540 ncount += gcd(ny - 1, nx - 1) - 1; /* # fixed points */
cannam@167	541
cannam@167	542 i = 1;
cannam@167	543 im = ny;
cannam@167	544
cannam@167	545 while (1) {
cannam@167	546 INT i1, i2, i1c, i2c;
cannam@167	547 INT kmi;
cannam@167	548
cannam@167	549 /** Rearrange the elements of a loop
cannam@167	550 and its companion loop: **/
cannam@167	551
cannam@167	552 i1 = i;
cannam@167	553 kmi = k - i;
cannam@167	554 i1c = kmi;
cannam@167	555 switch (N) {
cannam@167	556 case 1:
cannam@167	557 b[0] = a[i1];
cannam@167	558 c[0] = a[i1c];
cannam@167	559 break;
cannam@167	560 case 2:
cannam@167	561 b[0] = a[2*i1];
cannam@167	562 b[1] = a[2*i1+1];
cannam@167	563 c[0] = a[2*i1c];
cannam@167	564 c[1] = a[2*i1c+1];
cannam@167	565 break;
cannam@167	566 default:
cannam@167	567 memcpy(b, &a[N * i1], N * sizeof(R));
cannam@167	568 memcpy(c, &a[N * i1c], N * sizeof(R));
cannam@167	569 }
cannam@167	570 while (1) {
cannam@167	571 i2 = ny * i1 - k * (i1 / nx);
cannam@167	572 i2c = k - i2;
cannam@167	573 if (i1 < move_size)
cannam@167	574 move[i1] = 1;
cannam@167	575 if (i1c < move_size)
cannam@167	576 move[i1c] = 1;
cannam@167	577 ncount += 2;
cannam@167	578 if (i2 == i)
cannam@167	579 break;
cannam@167	580 if (i2 == kmi) {
cannam@167	581 d = b;
cannam@167	582 b = c;
cannam@167	583 c = d;
cannam@167	584 break;
cannam@167	585 }
cannam@167	586 switch (N) {
cannam@167	587 case 1:
cannam@167	588 a[i1] = a[i2];
cannam@167	589 a[i1c] = a[i2c];
cannam@167	590 break;
cannam@167	591 case 2:
cannam@167	592 a[2i1] = a[2i2];
cannam@167	593 a[2i1+1] = a[2i2+1];
cannam@167	594 a[2i1c] = a[2i2c];
cannam@167	595 a[2i1c+1] = a[2i2c+1];
cannam@167	596 break;
cannam@167	597 default:
cannam@167	598 memcpy(&a[N * i1], &a[N * i2],
cannam@167	599 N * sizeof(R));
cannam@167	600 memcpy(&a[N * i1c], &a[N * i2c],
cannam@167	601 N * sizeof(R));
cannam@167	602 }
cannam@167	603 i1 = i2;
cannam@167	604 i1c = i2c;
cannam@167	605 }
cannam@167	606 switch (N) {
cannam@167	607 case 1:
cannam@167	608 a[i1] = b[0];
cannam@167	609 a[i1c] = c[0];
cannam@167	610 break;
cannam@167	611 case 2:
cannam@167	612 a[2*i1] = b[0];
cannam@167	613 a[2*i1+1] = b[1];
cannam@167	614 a[2*i1c] = c[0];
cannam@167	615 a[2*i1c+1] = c[1];
cannam@167	616 break;
cannam@167	617 default:
cannam@167	618 memcpy(&a[N * i1], b, N * sizeof(R));
cannam@167	619 memcpy(&a[N * i1c], c, N * sizeof(R));
cannam@167	620 }
cannam@167	621 if (ncount >= mn)
cannam@167	622 break; /* we've moved all elements */
cannam@167	623
cannam@167	624 / Search for loops to rearrange: /
cannam@167	625
cannam@167	626 while (1) {
cannam@167	627 INT max = k - i;
cannam@167	628 ++i;
cannam@167	629 A(i <= max);
cannam@167	630 im += ny;
cannam@167	631 if (im > k)
cannam@167	632 im -= k;
cannam@167	633 i2 = im;
cannam@167	634 if (i == i2)
cannam@167	635 continue;
cannam@167	636 if (i >= move_size) {
cannam@167	637 while (i2 > i && i2 < max) {
cannam@167	638 i1 = i2;
cannam@167	639 i2 = ny * i1 - k * (i1 / nx);
cannam@167	640 }
cannam@167	641 if (i2 == i)
cannam@167	642 break;
cannam@167	643 } else if (!move[i])
cannam@167	644 break;
cannam@167	645 }
cannam@167	646 }
cannam@167	647 }
cannam@167	648
cannam@167	649 static void apply_toms513(const plan ego_, R I, R *O)
cannam@167	650 {
cannam@167	651 const P ego = (const P ) ego_;
cannam@167	652 INT n = ego->n, m = ego->m;
cannam@167	653 INT vl = ego->vl;
cannam@167	654 R buf = (R )MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
cannam@167	655 UNUSED(O);
cannam@167	656 transpose_toms513(I, n, m, vl, (char ) (buf + 2vl), (n+m)/2, buf);
cannam@167	657 X(ifree)(buf);
cannam@167	658 }
cannam@167	659
cannam@167	660 static int applicable_toms513(const problem_rdft p, planner plnr,
cannam@167	661 int dim0, int dim1, int dim2, INT *nbuf)
cannam@167	662 {
cannam@167	663 INT n = p->vecsz->dims[dim0].n;
cannam@167	664 INT m = p->vecsz->dims[dim1].n;
cannam@167	665 INT vl, vs;
cannam@167	666 get_transpose_vec(p, dim2, &vl, &vs);
cannam@167	667 nbuf = 2vl
cannam@167	668 + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
cannam@167	669 return (!NO_SLOWP(plnr)
cannam@167	670 && (vl > 8 \|\| !NO_UGLYP(plnr)) /* UGLY for small vl */
cannam@167	671 && n != m
cannam@167	672 && Ntuple_transposable(p->vecsz->dims + dim0,
cannam@167	673 p->vecsz->dims + dim1,
cannam@167	674 vl, vs));
cannam@167	675 }
cannam@167	676
cannam@167	677 static int mkcldrn_toms513(const problem_rdft p, planner plnr, P *ego)
cannam@167	678 {
cannam@167	679 UNUSED(p); UNUSED(plnr);
cannam@167	680 /* heuristic so that TOMS algorithm is last resort for small vl */
cannam@167	681 ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
cannam@167	682 return 1;
cannam@167	683 }
cannam@167	684
cannam@167	685 static const transpose_adt adt_toms513 =
cannam@167	686 {
cannam@167	687 apply_toms513, applicable_toms513, mkcldrn_toms513,
cannam@167	688 "rdft-transpose-toms513"
cannam@167	689 };
cannam@167	690
cannam@167	691 /-----------------------------------------------------------------------/
cannam@167	692 /-----------------------------------------------------------------------/
cannam@167	693 /* generic stuff: */
cannam@167	694
cannam@167	695 static void awake(plan *ego_, enum wakefulness wakefulness)
cannam@167	696 {
cannam@167	697 P ego = (P ) ego_;
cannam@167	698 X(plan_awake)(ego->cld1, wakefulness);
cannam@167	699 X(plan_awake)(ego->cld2, wakefulness);
cannam@167	700 X(plan_awake)(ego->cld3, wakefulness);
cannam@167	701 }
cannam@167	702
cannam@167	703 static void print(const plan ego_, printer p)
cannam@167	704 {
cannam@167	705 const P ego = (const P ) ego_;
cannam@167	706 p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
cannam@167	707 ego->n, ego->m, ego->vl);
cannam@167	708 if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
cannam@167	709 if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
cannam@167	710 if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
cannam@167	711 p->print(p, ")");
cannam@167	712 }
cannam@167	713
cannam@167	714 static void destroy(plan *ego_)
cannam@167	715 {
cannam@167	716 P ego = (P ) ego_;
cannam@167	717 X(plan_destroy_internal)(ego->cld3);
cannam@167	718 X(plan_destroy_internal)(ego->cld2);
cannam@167	719 X(plan_destroy_internal)(ego->cld1);
cannam@167	720 }
cannam@167	721
cannam@167	722 static plan mkplan(const solver ego_, const problem p_, planner plnr)
cannam@167	723 {
cannam@167	724 const S ego = (const S ) ego_;
cannam@167	725 const problem_rdft *p;
cannam@167	726 int dim0, dim1, dim2;
cannam@167	727 INT nbuf, vs;
cannam@167	728 P *pln;
cannam@167	729
cannam@167	730 static const plan_adt padt = {
cannam@167	731 X(rdft_solve), awake, print, destroy
cannam@167	732 };
cannam@167	733
cannam@167	734 if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
cannam@167	735 return (plan *) 0;
cannam@167	736
cannam@167	737 p = (const problem_rdft *) p_;
cannam@167	738 pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
cannam@167	739
cannam@167	740 pln->n = p->vecsz->dims[dim0].n;
cannam@167	741 pln->m = p->vecsz->dims[dim1].n;
cannam@167	742 get_transpose_vec(p, dim2, &pln->vl, &vs);
cannam@167	743 pln->nbuf = nbuf;
cannam@167	744 pln->d = gcd(pln->n, pln->m);
cannam@167	745 pln->nd = pln->n / pln->d;
cannam@167	746 pln->md = pln->m / pln->d;
cannam@167	747 pln->slv = ego;
cannam@167	748
cannam@167	749 X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
cannam@167	750
cannam@167	751 pln->cld1 = pln->cld2 = pln->cld3 = 0;
cannam@167	752 if (!ego->adt->mkcldrn(p, plnr, pln)) {
cannam@167	753 X(plan_destroy_internal)(&(pln->super.super));
cannam@167	754 return 0;
cannam@167	755 }
cannam@167	756
cannam@167	757 return &(pln->super.super);
cannam@167	758 }
cannam@167	759
cannam@167	760 static solver mksolver(const transpose_adt adt)
cannam@167	761 {
cannam@167	762 static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
cannam@167	763 S *slv = MKSOLVER(S, &sadt);
cannam@167	764 slv->adt = adt;
cannam@167	765 return &(slv->super);
cannam@167	766 }
cannam@167	767
cannam@167	768 void X(rdft_vrank3_transpose_register)(planner *p)
cannam@167	769 {
cannam@167	770 unsigned i;
cannam@167	771 static const transpose_adt *const adts[] = {
cannam@167	772 &adt_gcd, &adt_cut,
cannam@167	773 &adt_toms513
cannam@167	774 };
cannam@167	775 for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
cannam@167	776 REGISTER_SOLVER(p, mksolver(adts[i]));
cannam@167	777 }

Mercurial > hg > sv-dependency-builds

annotate src/fftw-3.3.8/rdft/vrank3-transpose.c @ 169:223a55898ab9 tip default