batch-feature-extraction-tool: Lib/fftw-3.2.1/cell/dft-direct-cell.c annotate

annotate Lib/fftw-3.2.1/cell/dft-direct-cell.c @ 0:25bf17994ef1

First commit. VS2013, Codeblocks and Mac OSX configuration

author	Geogaddi\David <d.m.ronan@qmul.ac.uk>
date	Thu, 09 Jul 2015 01:12:16 +0100
parents
children

rev	line source
d@0	1 /*
d@0	2 * Copyright (c) 2007 Massachusetts Institute of Technology
d@0	3 *
d@0	4 * This program is free software; you can redistribute it and/or modify
d@0	5 * it under the terms of the GNU General Public License as published by
d@0	6 * the Free Software Foundation; either version 2 of the License, or
d@0	7 * (at your option) any later version.
d@0	8 *
d@0	9 * This program is distributed in the hope that it will be useful,
d@0	10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
d@0	11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
d@0	12 * GNU General Public License for more details.
d@0	13 *
d@0	14 * You should have received a copy of the GNU General Public License
d@0	15 * along with this program; if not, write to the Free Software
d@0	16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
d@0	17 *
d@0	18 */
d@0	19
d@0	20 /* direct DFT solver via cell library */
d@0	21
d@0	22 #include "dft.h"
d@0	23 #include "ct.h"
d@0	24
d@0	25 #if HAVE_CELL
d@0	26
d@0	27 #include "simd.h"
d@0	28 #include "fftw-cell.h"
d@0	29
d@0	30 typedef struct {
d@0	31 solver super;
d@0	32 int cutdim;
d@0	33 } S;
d@0	34
d@0	35 typedef struct {
d@0	36 plan_dft super;
d@0	37 struct spu_radices radices;
d@0	38 /* strides expressed in reals */
d@0	39 INT n, is, os;
d@0	40 struct cell_iodim v[2];
d@0	41 int cutdim;
d@0	42 int sign;
d@0	43 int Wsz;
d@0	44 R *W;
d@0	45
d@0	46 /* optional twiddle factors for dftw: */
d@0	47 INT rw, mw; /* rw == 0 indicates no twiddle factors */
d@0	48 twid *td;
d@0	49 } P;
d@0	50
d@0	51
d@0	52 /* op counts of SPU codelets */
d@0	53 static const opcnt n_ops[33] = {
d@0	54 [2] = {2, 0, 0, 0},
d@0	55 [3] = {3, 1, 3, 0},
d@0	56 [4] = {6, 0, 2, 0},
d@0	57 [5] = {7, 2, 9, 0},
d@0	58 [6] = {12, 2, 6, 0},
d@0	59 [7] = {9, 3, 21, 0},
d@0	60 [8] = {16, 0, 10, 0},
d@0	61 [9] = {12, 4, 34, 0},
d@0	62 [10] = {24, 4, 18, 0},
d@0	63 [11] = {15, 5, 55, 0},
d@0	64 [12] = {30, 2, 18, 0},
d@0	65 [13] = {31, 6, 57, 0},
d@0	66 [14] = {32, 6, 42, 0},
d@0	67 [15] = {36, 7, 42, 0},
d@0	68 [16] = {38, 0, 34, 0},
d@0	69 [32] = {88, 0, 98, 0},
d@0	70 };
d@0	71
d@0	72 static const opcnt t_ops[33] = {
d@0	73 [2] = {3, 2, 0, 0},
d@0	74 [3] = {5, 5, 3, 0},
d@0	75 [4] = {9, 6, 2, 0},
d@0	76 [5] = {11, 10, 9, 0},
d@0	77 [6] = {17, 12, 6, 0},
d@0	78 [7] = {15, 15, 21, 0},
d@0	79 [8] = {23, 14, 10, 0},
d@0	80 [9] = {20, 20, 34, 0},
d@0	81 [10] = {33, 22, 18, 0},
d@0	82 [12] = {41, 24, 18, 0},
d@0	83 [15] = {50, 35, 42, 0},
d@0	84 [16] = {53, 30, 34, 0},
d@0	85 [32] = {119, 62, 98, 0},
d@0	86 };
d@0	87
d@0	88 static void compute_opcnt(const struct spu_radices *p,
d@0	89 INT n, INT v, opcnt *ops)
d@0	90 {
d@0	91 INT r;
d@0	92 signed char *q;
d@0	93
d@0	94 X(ops_zero)(ops);
d@0	95
d@0	96 for (q = p->r; (r = *q) > 0; ++q)
d@0	97 X(ops_madd2)(v * (n / r) / VL, &t_ops[r], ops);
d@0	98
d@0	99 X(ops_madd2)(v * (n / (-r)) / VL, &n_ops[-r], ops);
d@0	100 }
d@0	101
d@0	102 static INT extent(struct cell_iodim *d)
d@0	103 {
d@0	104 return d->n1 - d->n0;
d@0	105 }
d@0	106
d@0	107 /* FIXME: this is totally broken */
d@0	108 static void cost_model(const P pln, opcnt ops)
d@0	109 {
d@0	110 INT r = pln->n;
d@0	111 INT v0 = extent(pln->v + 0);
d@0	112 INT v1 = extent(pln->v + 1);
d@0	113
d@0	114 compute_opcnt(&pln->radices, r, v0 * v1, ops);
d@0	115
d@0	116 /* penalize cuts across short dimensions */
d@0	117 if (extent(pln->v + pln->cutdim) < extent(pln->v + 1 - pln->cutdim))
d@0	118 ops->other += 3.14159;
d@0	119 }
d@0	120
d@0	121 /* expressed in real numbers */
d@0	122 static INT compute_twiddle_size(const struct spu_radices *p, INT n)
d@0	123 {
d@0	124 INT sz = 0;
d@0	125 INT r;
d@0	126 signed char *q;
d@0	127
d@0	128 for (q = p->r; (r = *q) > 0; ++q) {
d@0	129 n /= r;
d@0	130 sz += 2 * (r - 1) * n;
d@0	131 }
d@0	132
d@0	133 return sz;
d@0	134 }
d@0	135
d@0	136 /* FIXME: find a way to use the normal FFTW twiddle mechanisms for this */
d@0	137 static void fill_twiddles(enum wakefulness wakefulness,
d@0	138 R W, const signed char q, INT n)
d@0	139 {
d@0	140 INT r;
d@0	141
d@0	142 for ( ; (r = *q) > 0; ++q) {
d@0	143 triggen *t = X(mktriggen)(wakefulness, n);
d@0	144 INT i, j, v, m = n / r;
d@0	145
d@0	146 for (j = 0; j < m; j += VL) {
d@0	147 for (i = 1; i < r; ++i) {
d@0	148 for (v = 0; v < VL; ++v) {
d@0	149 t->cexp(t, i * (j + v), W);
d@0	150 W += 2;
d@0	151 }
d@0	152 }
d@0	153 }
d@0	154 X(triggen_destroy)(t);
d@0	155 n = m;
d@0	156 }
d@0	157 }
d@0	158
d@0	159 static R *make_twiddles(enum wakefulness wakefulness,
d@0	160 const struct spu_radices p, INT n, int Wsz)
d@0	161 {
d@0	162 INT sz = compute_twiddle_size(p, n);
d@0	163 R W = X(cell_aligned_malloc)(sz sizeof(R));
d@0	164 A(FITS_IN_INT(sz));
d@0	165 *Wsz = sz;
d@0	166 fill_twiddles(wakefulness, W, p->r, n);
d@0	167 return W;
d@0	168 }
d@0	169
d@0	170 static int fits_in_local_store(INT n, INT v)
d@0	171 {
d@0	172 /* the SPU has space for 3 * MAX_N complex numbers. We need
d@0	173 n(v+1) for data plus n for twiddle factors. /
d@0	174 return n * (v+2) <= 3 * MAX_N;
d@0	175 }
d@0	176
d@0	177 static void apply(const plan ego_, R ri, R ii, R ro, R *io)
d@0	178 {
d@0	179 const P ego = (const P ) ego_;
d@0	180 R xi, xo;
d@0	181 int i, v;
d@0	182 int nspe = X(cell_nspe)();
d@0	183 int cutdim = ego->cutdim;
d@0	184 int contiguous_r = ((ego->is == 2) && (ego->os == 2));
d@0	185
d@0	186 /* find pointer to beginning of data, depending on sign */
d@0	187 if (ego->sign == FFT_SIGN) {
d@0	188 xi = ri; xo = ro;
d@0	189 } else {
d@0	190 xi = ii; xo = io;
d@0	191 }
d@0	192
d@0	193 /* fill contexts */
d@0	194 v = ego->v[cutdim].n1;
d@0	195
d@0	196 for (i = 0; i < nspe; ++i) {
d@0	197 int chunk;
d@0	198 struct spu_context *ctx = X(cell_get_ctx)(i);
d@0	199 struct dft_context *dft = &ctx->u.dft;
d@0	200
d@0	201 ctx->op = FFTW_SPE_DFT;
d@0	202
d@0	203 dft->r = ego->radices;
d@0	204 dft->n = ego->n;
d@0	205 dft->is_bytes = ego->is * sizeof(R);
d@0	206 dft->os_bytes = ego->os * sizeof(R);
d@0	207 dft->v[0] = ego->v[0];
d@0	208 dft->v[1] = ego->v[1];
d@0	209 dft->sign = ego->sign;
d@0	210 A(FITS_IN_INT(ego->Wsz * sizeof(R)));
d@0	211 dft->Wsz_bytes = ego->Wsz * sizeof(R);
d@0	212 dft->W = (uintptr_t)ego->W;
d@0	213 dft->xi = (uintptr_t)xi;
d@0	214 dft->xo = (uintptr_t)xo;
d@0	215
d@0	216 /* partition v into pieces of equal size, subject to alignment
d@0	217 constraints */
d@0	218 if (cutdim == 0 && !contiguous_r) {
d@0	219 /* CUTDIM = 0 and the SPU uses transposed DMA.
d@0	220 We must preserve the alignment of the dimension 0 in the
d@0	221 cut */
d@0	222 chunk = VL * ((v - ego->v[cutdim].n0) / (VL * (nspe - i)));
d@0	223 } else {
d@0	224 chunk = (v - ego->v[cutdim].n0) / (nspe - i);
d@0	225 }
d@0	226
d@0	227 dft->v[cutdim].n1 = v;
d@0	228 v -= chunk;
d@0	229 dft->v[cutdim].n0 = v;
d@0	230
d@0	231 /* optional dftw twiddles */
d@0	232 if (ego->rw)
d@0	233 dft->Ww = (uintptr_t)ego->td->W;
d@0	234 }
d@0	235
d@0	236 A(v == ego->v[cutdim].n0);
d@0	237
d@0	238 /* activate spe's */
d@0	239 X(cell_spe_awake_all)();
d@0	240
d@0	241 /* wait for completion */
d@0	242 X(cell_spe_wait_all)();
d@0	243 }
d@0	244
d@0	245 static void print(const plan ego_, printer p)
d@0	246 {
d@0	247 const P ego = (const P ) ego_;
d@0	248 int i;
d@0	249 p->print(p, "(dft-direct-cell-%D/%d", ego->n, ego->cutdim);
d@0	250 for (i = 0; i < 2; ++i)
d@0	251 p->print(p, "%v", (INT)ego->v[i].n1);
d@0	252 p->print(p, ")");
d@0	253 }
d@0	254
d@0	255 static void awake(plan *ego_, enum wakefulness wakefulness)
d@0	256 {
d@0	257 P ego = (P ) ego_;
d@0	258
d@0	259 /* awake the optional dftw twiddles */
d@0	260 if (ego->rw) {
d@0	261 static const tw_instr tw[] = {
d@0	262 { TW_CEXP, 0, 0 },
d@0	263 { TW_FULL, 0, 0 },
d@0	264 { TW_NEXT, 1, 0 }
d@0	265 };
d@0	266 X(twiddle_awake)(wakefulness, &ego->td, tw,
d@0	267 ego->rw * ego->mw, ego->rw, ego->mw);
d@0	268 }
d@0	269
d@0	270 /* awake the twiddles for the dft part */
d@0	271 switch (wakefulness) {
d@0	272 case SLEEPY:
d@0	273 free(ego->W);
d@0	274 ego->W = 0;
d@0	275 break;
d@0	276 default:
d@0	277 ego->W = make_twiddles(wakefulness, &ego->radices,
d@0	278 ego->n, &ego->Wsz);
d@0	279 break;
d@0	280 }
d@0	281 }
d@0	282
d@0	283 static int contiguous_or_aligned_p(INT s_bytes)
d@0	284 {
d@0	285 return (s_bytes == 2 * sizeof(R)) \|\| ((s_bytes % ALIGNMENTA) == 0);
d@0	286 }
d@0	287
d@0	288 static int build_vdim(int inplacep,
d@0	289 INT r, INT irs, INT ors,
d@0	290 INT m, INT ims, INT oms, int dm,
d@0	291 INT v, INT ivs, INT ovs,
d@0	292 struct cell_iodim vd[2], int cutdim)
d@0	293 {
d@0	294 int vm, vv;
d@0	295 int contiguous_r = ((irs == 2) && (ors == 2));
d@0	296
d@0	297 /* 32-bit overflow? */
d@0	298 if (!(1
d@0	299 && FITS_IN_INT(r)
d@0	300 && FITS_IN_INT(irs * sizeof(R))
d@0	301 && FITS_IN_INT(ors * sizeof(R))
d@0	302 && FITS_IN_INT(m)
d@0	303 && FITS_IN_INT(ims * sizeof(R))
d@0	304 && FITS_IN_INT(oms * sizeof(R))
d@0	305 && FITS_IN_INT(v)
d@0	306 && FITS_IN_INT(ivs * sizeof(R))
d@0	307 && FITS_IN_INT(ovs * sizeof(R))))
d@0	308 return 0;
d@0	309
d@0	310 /* R dimension must be aligned in all cases */
d@0	311 if (!(1
d@0	312 && r % VL == 0 /* REDUNDANT */
d@0	313 && contiguous_or_aligned_p(irs * sizeof(R))
d@0	314 && contiguous_or_aligned_p(ors * sizeof(R))))
d@0	315 return 0;
d@0	316
d@0	317 if ((irs == 2 \|\| ims == 2) && (ors == 2 \|\| oms == 2)) {
d@0	318 /* Case 1: in SPU, let N=R, V0=M, V1=V */
d@0	319 vm = 0;
d@0	320 vv = 1;
d@0	321 } else if ((irs == 2 \|\| ivs == 2) && (ors == 2 \|\| ovs == 2)) {
d@0	322 /* Case 2: in SPU, let N=R, V0=V, V1=M */
d@0	323 vm = 1;
d@0	324 vv = 0;
d@0	325 } else {
d@0	326 /* can't do it */
d@0	327 return 0;
d@0	328 }
d@0	329
d@0	330 vd[vm].n0 = 0; vd[vm].n1 = m;
d@0	331 vd[vm].is_bytes = ims * sizeof(R); vd[vm].os_bytes = oms * sizeof(R);
d@0	332 vd[vm].dm = dm;
d@0	333
d@0	334 vd[vv].n0 = 0; vd[vv].n1 = v;
d@0	335 vd[vv].is_bytes = ivs * sizeof(R); vd[vv].os_bytes = ovs * sizeof(R);
d@0	336 vd[vv].dm = 0;
d@0	337
d@0	338 /* Restrictions on the size of the SPU local store: */
d@0	339 if (!(0
d@0	340 /* for contiguous I/O, one array of size R must fit into
d@0	341 local store. (The fits_in_local_store() check is
d@0	342 redundant because R <= MAX_N holds, but we check anyway
d@0	343 for clarity */
d@0	344 \|\| (contiguous_r && fits_in_local_store(r, 1))
d@0	345
d@0	346 /* for noncontiguous I/O, VL arrays of size R must fit into
d@0	347 local store because of transposed DMA */
d@0	348 \|\| fits_in_local_store(r, VL)))
d@0	349 return 0;
d@0	350
d@0	351 /* SPU DMA restrictions: */
d@0	352 if (!(1
d@0	353 /* If R is noncontiguous, then the SPU uses transposed DMA
d@0	354 and therefore dimension 0 must be aligned */
d@0	355 && (contiguous_r \|\| vd[0].n1 % VL == 0)
d@0	356
d@0	357 /* dimension 1 is arbitrary */
d@0	358
d@0	359 /* dimension-0 strides must be either contiguous or aligned */
d@0	360 && contiguous_or_aligned_p((INT)vd[0].is_bytes)
d@0	361 && contiguous_or_aligned_p((INT)vd[0].os_bytes)
d@0	362
d@0	363 /* dimension-1 strides must be aligned */
d@0	364 && ((vd[1].is_bytes % ALIGNMENTA) == 0)
d@0	365 && ((vd[1].os_bytes % ALIGNMENTA) == 0)
d@0	366 ))
d@0	367 return 0;
d@0	368
d@0	369 /* see if we can do it without overwriting the input with itself */
d@0	370 if (!(0
d@0	371 /* can operate out-of-place */
d@0	372 \|\| !inplacep
d@0	373
d@0	374 /* all strides are in-place */
d@0	375 \|\| (1
d@0	376 && irs == ors
d@0	377 && ims == oms
d@0	378 && ivs == ovs)
d@0	379
d@0	380 /* we cut across in-place dimension 1, and dimension 0 fits
d@0	381 into local store */
d@0	382 \|\| (1
d@0	383 && cutdim == 1
d@0	384 && vd[cutdim].is_bytes == vd[cutdim].os_bytes
d@0	385 && fits_in_local_store(r, extent(vd + 0)))
d@0	386 ))
d@0	387 return 0;
d@0	388
d@0	389 return 1;
d@0	390 }
d@0	391
d@0	392 static
d@0	393 const struct spu_radices find_radices(R ri, R ii, R ro, R *io,
d@0	394 INT n, int *sign)
d@0	395 {
d@0	396 const struct spu_radices *p;
d@0	397 R xi, xo;
d@0	398
d@0	399 /* 32-bit overflow? */
d@0	400 if (!FITS_IN_INT(n))
d@0	401 return 0;
d@0	402
d@0	403 /* valid n? */
d@0	404 if (n <= 0 \|\| n > MAX_N \|\| ((n % REQUIRE_N_MULTIPLE_OF) != 0))
d@0	405 return 0;
d@0	406
d@0	407 /* see if we have a plan for this N */
d@0	408 p = X(spu_radices) + n / REQUIRE_N_MULTIPLE_OF;
d@0	409 if (!p->r[0])
d@0	410 return 0;
d@0	411
d@0	412 /* check whether the data format is supported */
d@0	413 if (ii == ri + 1 && io == ro + 1) { /* R I R I ... format */
d@0	414 *sign = FFT_SIGN;
d@0	415 xi = ri; xo = ro;
d@0	416 } else if (ri == ii + 1 && ro == io + 1) { /* I R I R ... format */
d@0	417 *sign = -FFT_SIGN;
d@0	418 xi = ii; xo = io;
d@0	419 } else
d@0	420 return 0; /* can't do it */
d@0	421
d@0	422 if (!ALIGNEDA(xi) \|\| !ALIGNEDA(xo))
d@0	423 return 0;
d@0	424
d@0	425 return p;
d@0	426 }
d@0	427
d@0	428 static const plan_adt padt = {
d@0	429 X(dft_solve), awake, print, X(plan_null_destroy)
d@0	430 };
d@0	431
d@0	432 static plan mkplan(const solver ego_, const problem p_, planner plnr)
d@0	433 {
d@0	434 P *pln;
d@0	435 const S ego = (const S )ego_;
d@0	436 const problem_dft p = (const problem_dft ) p_;
d@0	437 int sign;
d@0	438 const struct spu_radices *radices;
d@0	439 struct cell_iodim vd[2];
d@0	440 INT m, ims, oms, v, ivs, ovs;
d@0	441
d@0	442 /* basic conditions */
d@0	443 if (!(1
d@0	444 && X(cell_nspe)() > 0
d@0	445 && p->sz->rnk == 1
d@0	446 && p->vecsz->rnk <= 2
d@0	447 && !NO_SIMDP(plnr)
d@0	448 ))
d@0	449 return 0;
d@0	450
d@0	451 /* see if SPU supports N */
d@0	452 {
d@0	453 iodim *d = p->sz->dims;
d@0	454 radices = find_radices(p->ri, p->ii, p->ro, p->io, d[0].n, &sign);
d@0	455 if (!radices)
d@0	456 return 0;
d@0	457 }
d@0	458
d@0	459 /* canonicalize to vrank 2 */
d@0	460 if (p->vecsz->rnk >= 1) {
d@0	461 iodim *d = p->vecsz->dims + 0;
d@0	462 m = d->n; ims = d->is; oms = d->os;
d@0	463 } else {
d@0	464 m = 1; ims = oms = 0;
d@0	465 }
d@0	466
d@0	467 if (p->vecsz->rnk >= 2) {
d@0	468 iodim *d = p->vecsz->dims + 1;
d@0	469 v = d->n; ivs = d->is; ovs = d->os;
d@0	470 } else {
d@0	471 v = 1; ivs = ovs = 0;
d@0	472 }
d@0	473
d@0	474 /* see if strides are supported by the SPU DMA routine */
d@0	475 {
d@0	476 iodim *d = p->sz->dims + 0;
d@0	477 if (!build_vdim(p->ri == p->ro,
d@0	478 d->n, d->is, d->os,
d@0	479 m, ims, oms, 0,
d@0	480 v, ivs, ovs,
d@0	481 vd, ego->cutdim))
d@0	482 return 0;
d@0	483 }
d@0	484
d@0	485 pln = MKPLAN_DFT(P, &padt, apply);
d@0	486
d@0	487 pln->radices = *radices;
d@0	488 {
d@0	489 iodim *d = p->sz->dims + 0;
d@0	490 pln->n = d[0].n;
d@0	491 pln->is = d[0].is;
d@0	492 pln->os = d[0].os;
d@0	493 }
d@0	494 pln->sign = sign;
d@0	495 pln->v[0] = vd[0];
d@0	496 pln->v[1] = vd[1];
d@0	497 pln->cutdim = ego->cutdim;
d@0	498 pln->W = 0;
d@0	499
d@0	500 pln->rw = 0;
d@0	501
d@0	502 cost_model(pln, &pln->super.super.ops);
d@0	503
d@0	504 return &(pln->super.super);
d@0	505 }
d@0	506
d@0	507 static void solver_destroy(solver *ego)
d@0	508 {
d@0	509 UNUSED(ego);
d@0	510 X(cell_deactivate_spes)();
d@0	511 }
d@0	512
d@0	513 static solver *mksolver(int cutdim)
d@0	514 {
d@0	515 static const solver_adt sadt = { PROBLEM_DFT, mkplan, solver_destroy };
d@0	516 S *slv = MKSOLVER(S, &sadt);
d@0	517 slv->cutdim = cutdim;
d@0	518 X(cell_activate_spes)();
d@0	519 return &(slv->super);
d@0	520 }
d@0	521
d@0	522 void X(dft_direct_cell_register)(planner *p)
d@0	523 {
d@0	524 REGISTER_SOLVER(p, mksolver(0));
d@0	525 REGISTER_SOLVER(p, mksolver(1));
d@0	526 }
d@0	527
d@0	528 /**************************************************************/
d@0	529 /* solvers with twiddle factors: */
d@0	530
d@0	531 typedef struct {
d@0	532 plan_dftw super;
d@0	533 plan *cld;
d@0	534 } Pw;
d@0	535
d@0	536 typedef struct {
d@0	537 ct_solver super;
d@0	538 int cutdim;
d@0	539 } Sw;
d@0	540
d@0	541 static void destroyw(plan *ego_)
d@0	542 {
d@0	543 Pw ego = (Pw ) ego_;
d@0	544 X(plan_destroy_internal)(ego->cld);
d@0	545 }
d@0	546
d@0	547 static void printw(const plan ego_, printer p)
d@0	548 {
d@0	549 const Pw ego = (const Pw ) ego_;
d@0	550 const P cld = (const P ) ego->cld;
d@0	551 p->print(p, "(dftw-direct-cell-%D-%D%v%(%p%))",
d@0	552 cld->rw, cld->mw, cld->v[1].n1,
d@0	553 ego->cld);
d@0	554 }
d@0	555
d@0	556 static void awakew(plan *ego_, enum wakefulness wakefulness)
d@0	557 {
d@0	558 Pw ego = (Pw ) ego_;
d@0	559 X(plan_awake)(ego->cld, wakefulness);
d@0	560 }
d@0	561
d@0	562 static void applyw(const plan ego_, R rio, R *iio)
d@0	563 {
d@0	564 const Pw ego = (const Pw ) ego_;
d@0	565 dftapply cldapply = ((plan_dft *) ego->cld)->apply;
d@0	566 cldapply(ego->cld, rio, iio, rio, iio);
d@0	567 }
d@0	568
d@0	569 static plan mkcldw(const ct_solver ego_,
d@0	570 INT r, INT irs, INT ors,
d@0	571 INT m, INT ms,
d@0	572 INT v, INT ivs, INT ovs,
d@0	573 INT mstart, INT mcount,
d@0	574 R rio, R iio,
d@0	575 planner *plnr)
d@0	576 {
d@0	577 const Sw ego = (const Sw )ego_;
d@0	578 const struct spu_radices *radices;
d@0	579 int sign;
d@0	580 Pw *pln;
d@0	581 P *cld;
d@0	582 struct cell_iodim vd[2];
d@0	583 int dm = 0;
d@0	584
d@0	585 static const plan_adt padtw = {
d@0	586 0, awakew, printw, destroyw
d@0	587 };
d@0	588
d@0	589 /* use only if cell is enabled */
d@0	590 if (NO_SIMDP(plnr) \|\| X(cell_nspe)() <= 0)
d@0	591 return 0;
d@0	592
d@0	593 /* no way in hell this SPU stuff is going to work with pthreads */
d@0	594 if (mstart != 0 \|\| mcount != m)
d@0	595 return 0;
d@0	596
d@0	597 /* don't bother for small N */
d@0	598 if (r * m * v <= MAX_N / 16 /* ARBITRARY */)
d@0	599 return 0;
d@0	600
d@0	601 /* check whether the R dimension is supported */
d@0	602 radices = find_radices(rio, iio, rio, iio, r, &sign);
d@0	603
d@0	604 if (!radices)
d@0	605 return 0;
d@0	606
d@0	607 /* encode decimation in DM */
d@0	608 switch (ego->super.dec) {
d@0	609 case DECDIT:
d@0	610 case DECDIT+TRANSPOSE:
d@0	611 dm = 1;
d@0	612 break;
d@0	613 case DECDIF:
d@0	614 case DECDIF+TRANSPOSE:
d@0	615 dm = -1;
d@0	616 break;
d@0	617 }
d@0	618
d@0	619 if (!build_vdim(1,
d@0	620 r, irs, ors,
d@0	621 m, ms, ms, dm,
d@0	622 v, ivs, ovs,
d@0	623 vd, ego->cutdim))
d@0	624 return 0;
d@0	625
d@0	626 cld = MKPLAN_DFT(P, &padt, apply);
d@0	627
d@0	628 cld->radices = *radices;
d@0	629 cld->n = r;
d@0	630 cld->is = irs;
d@0	631 cld->os = ors;
d@0	632 cld->sign = sign;
d@0	633 cld->W = 0;
d@0	634
d@0	635 cld->rw = r; cld->mw = m; cld->td = 0;
d@0	636
d@0	637 cld->v[0] = vd[0];
d@0	638 cld->v[1] = vd[1];
d@0	639 cld->cutdim = ego->cutdim;
d@0	640
d@0	641 pln = MKPLAN_DFTW(Pw, &padtw, applyw);
d@0	642 pln->cld = &cld->super.super;
d@0	643
d@0	644 cost_model(cld, &pln->super.super.ops);
d@0	645
d@0	646 /* for twiddle factors: one mul and one fma per complex point */
d@0	647 pln->super.super.ops.fma += (r * m * v) / VL;
d@0	648 pln->super.super.ops.mul += (r * m * v) / VL;
d@0	649
d@0	650 /* FIXME: heuristics */
d@0	651 /* pay penalty for large radices: */
d@0	652 if (r > MAX_N / 16)
d@0	653 pln->super.super.ops.other += ((r - (MAX_N / 16)) * m * v);
d@0	654
d@0	655 return &(pln->super.super);
d@0	656 }
d@0	657
d@0	658 /* heuristic to enable vector recursion */
d@0	659 static int force_vrecur(const ct_solver ego, const problem_dft p)
d@0	660 {
d@0	661 iodim *d;
d@0	662 INT n, r, m;
d@0	663 INT cutoff = 128;
d@0	664
d@0	665 A(p->vecsz->rnk == 1);
d@0	666 A(p->sz->rnk == 1);
d@0	667
d@0	668 n = p->sz->dims[0].n;
d@0	669 r = X(choose_radix)(ego->r, n);
d@0	670 m = n / r;
d@0	671
d@0	672 d = p->vecsz->dims + 0;
d@0	673 return (1
d@0	674 /* some vector dimension is contiguous */
d@0	675 && (d->is == 2 \|\| d->os == 2)
d@0	676
d@0	677 /* vector is sufficiently long */
d@0	678 && d->n >= cutoff
d@0	679
d@0	680 /* transform is sufficiently long */
d@0	681 && m >= cutoff
d@0	682 && r >= cutoff);
d@0	683 }
d@0	684
d@0	685 static void regsolverw(planner *plnr, INT r, int dec, int cutdim)
d@0	686 {
d@0	687 Sw slv = (Sw )X(mksolver_ct)(sizeof(Sw), r, dec, mkcldw, force_vrecur);
d@0	688 slv->cutdim = cutdim;
d@0	689 REGISTER_SOLVER(plnr, &(slv->super.super));
d@0	690 }
d@0	691
d@0	692 void X(ct_cell_direct_register)(planner *p)
d@0	693 {
d@0	694 INT n;
d@0	695
d@0	696 for (n = 0; n <= MAX_N; n += REQUIRE_N_MULTIPLE_OF) {
d@0	697 const struct spu_radices *r =
d@0	698 X(spu_radices) + n / REQUIRE_N_MULTIPLE_OF;
d@0	699 if (r->r[0]) {
d@0	700 regsolverw(p, n, DECDIT, 0);
d@0	701 regsolverw(p, n, DECDIT, 1);
d@0	702 regsolverw(p, n, DECDIF+TRANSPOSE, 0);
d@0	703 regsolverw(p, n, DECDIF+TRANSPOSE, 1);
d@0	704 }
d@0	705 }
d@0	706 }
d@0	707
d@0	708
d@0	709 #endif /* HAVE_CELL */
d@0	710
d@0	711

Mercurial > hg > batch-feature-extraction-tool

annotate Lib/fftw-3.2.1/cell/dft-direct-cell.c @ 0:25bf17994ef1