Mercurial > hg > batch-feature-extraction-tool
comparison Lib/fftw-3.2.1/cell/dft-direct-cell.c @ 15:585caf503ef5 tip
Tidy up for ROLI
author | Geogaddi\David <d.m.ronan@qmul.ac.uk> |
---|---|
date | Tue, 17 May 2016 18:50:19 +0100 |
parents | 636c989477e7 |
children |
comparison
equal
deleted
inserted
replaced
14:636c989477e7 | 15:585caf503ef5 |
---|---|
1 /* | |
2 * Copyright (c) 2007 Massachusetts Institute of Technology | |
3 * | |
4 * This program is free software; you can redistribute it and/or modify | |
5 * it under the terms of the GNU General Public License as published by | |
6 * the Free Software Foundation; either version 2 of the License, or | |
7 * (at your option) any later version. | |
8 * | |
9 * This program is distributed in the hope that it will be useful, | |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 * GNU General Public License for more details. | |
13 * | |
14 * You should have received a copy of the GNU General Public License | |
15 * along with this program; if not, write to the Free Software | |
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 * | |
18 */ | |
19 | |
20 /* direct DFT solver via cell library */ | |
21 | |
22 #include "dft.h" | |
23 #include "ct.h" | |
24 | |
25 #if HAVE_CELL | |
26 | |
27 #include "simd.h" | |
28 #include "fftw-cell.h" | |
29 | |
30 typedef struct { | |
31 solver super; | |
32 int cutdim; | |
33 } S; | |
34 | |
35 typedef struct { | |
36 plan_dft super; | |
37 struct spu_radices radices; | |
38 /* strides expressed in reals */ | |
39 INT n, is, os; | |
40 struct cell_iodim v[2]; | |
41 int cutdim; | |
42 int sign; | |
43 int Wsz; | |
44 R *W; | |
45 | |
46 /* optional twiddle factors for dftw: */ | |
47 INT rw, mw; /* rw == 0 indicates no twiddle factors */ | |
48 twid *td; | |
49 } P; | |
50 | |
51 | |
52 /* op counts of SPU codelets */ | |
53 static const opcnt n_ops[33] = { | |
54 [2] = {2, 0, 0, 0}, | |
55 [3] = {3, 1, 3, 0}, | |
56 [4] = {6, 0, 2, 0}, | |
57 [5] = {7, 2, 9, 0}, | |
58 [6] = {12, 2, 6, 0}, | |
59 [7] = {9, 3, 21, 0}, | |
60 [8] = {16, 0, 10, 0}, | |
61 [9] = {12, 4, 34, 0}, | |
62 [10] = {24, 4, 18, 0}, | |
63 [11] = {15, 5, 55, 0}, | |
64 [12] = {30, 2, 18, 0}, | |
65 [13] = {31, 6, 57, 0}, | |
66 [14] = {32, 6, 42, 0}, | |
67 [15] = {36, 7, 42, 0}, | |
68 [16] = {38, 0, 34, 0}, | |
69 [32] = {88, 0, 98, 0}, | |
70 }; | |
71 | |
72 static const opcnt t_ops[33] = { | |
73 [2] = {3, 2, 0, 0}, | |
74 [3] = {5, 5, 3, 0}, | |
75 [4] = {9, 6, 2, 0}, | |
76 [5] = {11, 10, 9, 0}, | |
77 [6] = {17, 12, 6, 0}, | |
78 [7] = {15, 15, 21, 0}, | |
79 [8] = {23, 14, 10, 0}, | |
80 [9] = {20, 20, 34, 0}, | |
81 [10] = {33, 22, 18, 0}, | |
82 [12] = {41, 24, 18, 0}, | |
83 [15] = {50, 35, 42, 0}, | |
84 [16] = {53, 30, 34, 0}, | |
85 [32] = {119, 62, 98, 0}, | |
86 }; | |
87 | |
88 static void compute_opcnt(const struct spu_radices *p, | |
89 INT n, INT v, opcnt *ops) | |
90 { | |
91 INT r; | |
92 signed char *q; | |
93 | |
94 X(ops_zero)(ops); | |
95 | |
96 for (q = p->r; (r = *q) > 0; ++q) | |
97 X(ops_madd2)(v * (n / r) / VL, &t_ops[r], ops); | |
98 | |
99 X(ops_madd2)(v * (n / (-r)) / VL, &n_ops[-r], ops); | |
100 } | |
101 | |
102 static INT extent(struct cell_iodim *d) | |
103 { | |
104 return d->n1 - d->n0; | |
105 } | |
106 | |
107 /* FIXME: this is totally broken */ | |
108 static void cost_model(const P *pln, opcnt *ops) | |
109 { | |
110 INT r = pln->n; | |
111 INT v0 = extent(pln->v + 0); | |
112 INT v1 = extent(pln->v + 1); | |
113 | |
114 compute_opcnt(&pln->radices, r, v0 * v1, ops); | |
115 | |
116 /* penalize cuts across short dimensions */ | |
117 if (extent(pln->v + pln->cutdim) < extent(pln->v + 1 - pln->cutdim)) | |
118 ops->other += 3.14159; | |
119 } | |
120 | |
121 /* expressed in real numbers */ | |
122 static INT compute_twiddle_size(const struct spu_radices *p, INT n) | |
123 { | |
124 INT sz = 0; | |
125 INT r; | |
126 signed char *q; | |
127 | |
128 for (q = p->r; (r = *q) > 0; ++q) { | |
129 n /= r; | |
130 sz += 2 * (r - 1) * n; | |
131 } | |
132 | |
133 return sz; | |
134 } | |
135 | |
136 /* FIXME: find a way to use the normal FFTW twiddle mechanisms for this */ | |
137 static void fill_twiddles(enum wakefulness wakefulness, | |
138 R *W, const signed char *q, INT n) | |
139 { | |
140 INT r; | |
141 | |
142 for ( ; (r = *q) > 0; ++q) { | |
143 triggen *t = X(mktriggen)(wakefulness, n); | |
144 INT i, j, v, m = n / r; | |
145 | |
146 for (j = 0; j < m; j += VL) { | |
147 for (i = 1; i < r; ++i) { | |
148 for (v = 0; v < VL; ++v) { | |
149 t->cexp(t, i * (j + v), W); | |
150 W += 2; | |
151 } | |
152 } | |
153 } | |
154 X(triggen_destroy)(t); | |
155 n = m; | |
156 } | |
157 } | |
158 | |
159 static R *make_twiddles(enum wakefulness wakefulness, | |
160 const struct spu_radices *p, INT n, int *Wsz) | |
161 { | |
162 INT sz = compute_twiddle_size(p, n); | |
163 R *W = X(cell_aligned_malloc)(sz * sizeof(R)); | |
164 A(FITS_IN_INT(sz)); | |
165 *Wsz = sz; | |
166 fill_twiddles(wakefulness, W, p->r, n); | |
167 return W; | |
168 } | |
169 | |
170 static int fits_in_local_store(INT n, INT v) | |
171 { | |
172 /* the SPU has space for 3 * MAX_N complex numbers. We need | |
173 n*(v+1) for data plus n for twiddle factors. */ | |
174 return n * (v+2) <= 3 * MAX_N; | |
175 } | |
176 | |
177 static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io) | |
178 { | |
179 const P *ego = (const P *) ego_; | |
180 R *xi, *xo; | |
181 int i, v; | |
182 int nspe = X(cell_nspe)(); | |
183 int cutdim = ego->cutdim; | |
184 int contiguous_r = ((ego->is == 2) && (ego->os == 2)); | |
185 | |
186 /* find pointer to beginning of data, depending on sign */ | |
187 if (ego->sign == FFT_SIGN) { | |
188 xi = ri; xo = ro; | |
189 } else { | |
190 xi = ii; xo = io; | |
191 } | |
192 | |
193 /* fill contexts */ | |
194 v = ego->v[cutdim].n1; | |
195 | |
196 for (i = 0; i < nspe; ++i) { | |
197 int chunk; | |
198 struct spu_context *ctx = X(cell_get_ctx)(i); | |
199 struct dft_context *dft = &ctx->u.dft; | |
200 | |
201 ctx->op = FFTW_SPE_DFT; | |
202 | |
203 dft->r = ego->radices; | |
204 dft->n = ego->n; | |
205 dft->is_bytes = ego->is * sizeof(R); | |
206 dft->os_bytes = ego->os * sizeof(R); | |
207 dft->v[0] = ego->v[0]; | |
208 dft->v[1] = ego->v[1]; | |
209 dft->sign = ego->sign; | |
210 A(FITS_IN_INT(ego->Wsz * sizeof(R))); | |
211 dft->Wsz_bytes = ego->Wsz * sizeof(R); | |
212 dft->W = (uintptr_t)ego->W; | |
213 dft->xi = (uintptr_t)xi; | |
214 dft->xo = (uintptr_t)xo; | |
215 | |
216 /* partition v into pieces of equal size, subject to alignment | |
217 constraints */ | |
218 if (cutdim == 0 && !contiguous_r) { | |
219 /* CUTDIM = 0 and the SPU uses transposed DMA. | |
220 We must preserve the alignment of the dimension 0 in the | |
221 cut */ | |
222 chunk = VL * ((v - ego->v[cutdim].n0) / (VL * (nspe - i))); | |
223 } else { | |
224 chunk = (v - ego->v[cutdim].n0) / (nspe - i); | |
225 } | |
226 | |
227 dft->v[cutdim].n1 = v; | |
228 v -= chunk; | |
229 dft->v[cutdim].n0 = v; | |
230 | |
231 /* optional dftw twiddles */ | |
232 if (ego->rw) | |
233 dft->Ww = (uintptr_t)ego->td->W; | |
234 } | |
235 | |
236 A(v == ego->v[cutdim].n0); | |
237 | |
238 /* activate spe's */ | |
239 X(cell_spe_awake_all)(); | |
240 | |
241 /* wait for completion */ | |
242 X(cell_spe_wait_all)(); | |
243 } | |
244 | |
245 static void print(const plan *ego_, printer *p) | |
246 { | |
247 const P *ego = (const P *) ego_; | |
248 int i; | |
249 p->print(p, "(dft-direct-cell-%D/%d", ego->n, ego->cutdim); | |
250 for (i = 0; i < 2; ++i) | |
251 p->print(p, "%v", (INT)ego->v[i].n1); | |
252 p->print(p, ")"); | |
253 } | |
254 | |
255 static void awake(plan *ego_, enum wakefulness wakefulness) | |
256 { | |
257 P *ego = (P *) ego_; | |
258 | |
259 /* awake the optional dftw twiddles */ | |
260 if (ego->rw) { | |
261 static const tw_instr tw[] = { | |
262 { TW_CEXP, 0, 0 }, | |
263 { TW_FULL, 0, 0 }, | |
264 { TW_NEXT, 1, 0 } | |
265 }; | |
266 X(twiddle_awake)(wakefulness, &ego->td, tw, | |
267 ego->rw * ego->mw, ego->rw, ego->mw); | |
268 } | |
269 | |
270 /* awake the twiddles for the dft part */ | |
271 switch (wakefulness) { | |
272 case SLEEPY: | |
273 free(ego->W); | |
274 ego->W = 0; | |
275 break; | |
276 default: | |
277 ego->W = make_twiddles(wakefulness, &ego->radices, | |
278 ego->n, &ego->Wsz); | |
279 break; | |
280 } | |
281 } | |
282 | |
283 static int contiguous_or_aligned_p(INT s_bytes) | |
284 { | |
285 return (s_bytes == 2 * sizeof(R)) || ((s_bytes % ALIGNMENTA) == 0); | |
286 } | |
287 | |
288 static int build_vdim(int inplacep, | |
289 INT r, INT irs, INT ors, | |
290 INT m, INT ims, INT oms, int dm, | |
291 INT v, INT ivs, INT ovs, | |
292 struct cell_iodim vd[2], int cutdim) | |
293 { | |
294 int vm, vv; | |
295 int contiguous_r = ((irs == 2) && (ors == 2)); | |
296 | |
297 /* 32-bit overflow? */ | |
298 if (!(1 | |
299 && FITS_IN_INT(r) | |
300 && FITS_IN_INT(irs * sizeof(R)) | |
301 && FITS_IN_INT(ors * sizeof(R)) | |
302 && FITS_IN_INT(m) | |
303 && FITS_IN_INT(ims * sizeof(R)) | |
304 && FITS_IN_INT(oms * sizeof(R)) | |
305 && FITS_IN_INT(v) | |
306 && FITS_IN_INT(ivs * sizeof(R)) | |
307 && FITS_IN_INT(ovs * sizeof(R)))) | |
308 return 0; | |
309 | |
310 /* R dimension must be aligned in all cases */ | |
311 if (!(1 | |
312 && r % VL == 0 /* REDUNDANT */ | |
313 && contiguous_or_aligned_p(irs * sizeof(R)) | |
314 && contiguous_or_aligned_p(ors * sizeof(R)))) | |
315 return 0; | |
316 | |
317 if ((irs == 2 || ims == 2) && (ors == 2 || oms == 2)) { | |
318 /* Case 1: in SPU, let N=R, V0=M, V1=V */ | |
319 vm = 0; | |
320 vv = 1; | |
321 } else if ((irs == 2 || ivs == 2) && (ors == 2 || ovs == 2)) { | |
322 /* Case 2: in SPU, let N=R, V0=V, V1=M */ | |
323 vm = 1; | |
324 vv = 0; | |
325 } else { | |
326 /* can't do it */ | |
327 return 0; | |
328 } | |
329 | |
330 vd[vm].n0 = 0; vd[vm].n1 = m; | |
331 vd[vm].is_bytes = ims * sizeof(R); vd[vm].os_bytes = oms * sizeof(R); | |
332 vd[vm].dm = dm; | |
333 | |
334 vd[vv].n0 = 0; vd[vv].n1 = v; | |
335 vd[vv].is_bytes = ivs * sizeof(R); vd[vv].os_bytes = ovs * sizeof(R); | |
336 vd[vv].dm = 0; | |
337 | |
338 /* Restrictions on the size of the SPU local store: */ | |
339 if (!(0 | |
340 /* for contiguous I/O, one array of size R must fit into | |
341 local store. (The fits_in_local_store() check is | |
342 redundant because R <= MAX_N holds, but we check anyway | |
343 for clarity */ | |
344 || (contiguous_r && fits_in_local_store(r, 1)) | |
345 | |
346 /* for noncontiguous I/O, VL arrays of size R must fit into | |
347 local store because of transposed DMA */ | |
348 || fits_in_local_store(r, VL))) | |
349 return 0; | |
350 | |
351 /* SPU DMA restrictions: */ | |
352 if (!(1 | |
353 /* If R is noncontiguous, then the SPU uses transposed DMA | |
354 and therefore dimension 0 must be aligned */ | |
355 && (contiguous_r || vd[0].n1 % VL == 0) | |
356 | |
357 /* dimension 1 is arbitrary */ | |
358 | |
359 /* dimension-0 strides must be either contiguous or aligned */ | |
360 && contiguous_or_aligned_p((INT)vd[0].is_bytes) | |
361 && contiguous_or_aligned_p((INT)vd[0].os_bytes) | |
362 | |
363 /* dimension-1 strides must be aligned */ | |
364 && ((vd[1].is_bytes % ALIGNMENTA) == 0) | |
365 && ((vd[1].os_bytes % ALIGNMENTA) == 0) | |
366 )) | |
367 return 0; | |
368 | |
369 /* see if we can do it without overwriting the input with itself */ | |
370 if (!(0 | |
371 /* can operate out-of-place */ | |
372 || !inplacep | |
373 | |
374 /* all strides are in-place */ | |
375 || (1 | |
376 && irs == ors | |
377 && ims == oms | |
378 && ivs == ovs) | |
379 | |
380 /* we cut across in-place dimension 1, and dimension 0 fits | |
381 into local store */ | |
382 || (1 | |
383 && cutdim == 1 | |
384 && vd[cutdim].is_bytes == vd[cutdim].os_bytes | |
385 && fits_in_local_store(r, extent(vd + 0))) | |
386 )) | |
387 return 0; | |
388 | |
389 return 1; | |
390 } | |
391 | |
392 static | |
393 const struct spu_radices *find_radices(R *ri, R *ii, R *ro, R *io, | |
394 INT n, int *sign) | |
395 { | |
396 const struct spu_radices *p; | |
397 R *xi, *xo; | |
398 | |
399 /* 32-bit overflow? */ | |
400 if (!FITS_IN_INT(n)) | |
401 return 0; | |
402 | |
403 /* valid n? */ | |
404 if (n <= 0 || n > MAX_N || ((n % REQUIRE_N_MULTIPLE_OF) != 0)) | |
405 return 0; | |
406 | |
407 /* see if we have a plan for this N */ | |
408 p = X(spu_radices) + n / REQUIRE_N_MULTIPLE_OF; | |
409 if (!p->r[0]) | |
410 return 0; | |
411 | |
412 /* check whether the data format is supported */ | |
413 if (ii == ri + 1 && io == ro + 1) { /* R I R I ... format */ | |
414 *sign = FFT_SIGN; | |
415 xi = ri; xo = ro; | |
416 } else if (ri == ii + 1 && ro == io + 1) { /* I R I R ... format */ | |
417 *sign = -FFT_SIGN; | |
418 xi = ii; xo = io; | |
419 } else | |
420 return 0; /* can't do it */ | |
421 | |
422 if (!ALIGNEDA(xi) || !ALIGNEDA(xo)) | |
423 return 0; | |
424 | |
425 return p; | |
426 } | |
427 | |
428 static const plan_adt padt = { | |
429 X(dft_solve), awake, print, X(plan_null_destroy) | |
430 }; | |
431 | |
432 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr) | |
433 { | |
434 P *pln; | |
435 const S *ego = (const S *)ego_; | |
436 const problem_dft *p = (const problem_dft *) p_; | |
437 int sign; | |
438 const struct spu_radices *radices; | |
439 struct cell_iodim vd[2]; | |
440 INT m, ims, oms, v, ivs, ovs; | |
441 | |
442 /* basic conditions */ | |
443 if (!(1 | |
444 && X(cell_nspe)() > 0 | |
445 && p->sz->rnk == 1 | |
446 && p->vecsz->rnk <= 2 | |
447 && !NO_SIMDP(plnr) | |
448 )) | |
449 return 0; | |
450 | |
451 /* see if SPU supports N */ | |
452 { | |
453 iodim *d = p->sz->dims; | |
454 radices = find_radices(p->ri, p->ii, p->ro, p->io, d[0].n, &sign); | |
455 if (!radices) | |
456 return 0; | |
457 } | |
458 | |
459 /* canonicalize to vrank 2 */ | |
460 if (p->vecsz->rnk >= 1) { | |
461 iodim *d = p->vecsz->dims + 0; | |
462 m = d->n; ims = d->is; oms = d->os; | |
463 } else { | |
464 m = 1; ims = oms = 0; | |
465 } | |
466 | |
467 if (p->vecsz->rnk >= 2) { | |
468 iodim *d = p->vecsz->dims + 1; | |
469 v = d->n; ivs = d->is; ovs = d->os; | |
470 } else { | |
471 v = 1; ivs = ovs = 0; | |
472 } | |
473 | |
474 /* see if strides are supported by the SPU DMA routine */ | |
475 { | |
476 iodim *d = p->sz->dims + 0; | |
477 if (!build_vdim(p->ri == p->ro, | |
478 d->n, d->is, d->os, | |
479 m, ims, oms, 0, | |
480 v, ivs, ovs, | |
481 vd, ego->cutdim)) | |
482 return 0; | |
483 } | |
484 | |
485 pln = MKPLAN_DFT(P, &padt, apply); | |
486 | |
487 pln->radices = *radices; | |
488 { | |
489 iodim *d = p->sz->dims + 0; | |
490 pln->n = d[0].n; | |
491 pln->is = d[0].is; | |
492 pln->os = d[0].os; | |
493 } | |
494 pln->sign = sign; | |
495 pln->v[0] = vd[0]; | |
496 pln->v[1] = vd[1]; | |
497 pln->cutdim = ego->cutdim; | |
498 pln->W = 0; | |
499 | |
500 pln->rw = 0; | |
501 | |
502 cost_model(pln, &pln->super.super.ops); | |
503 | |
504 return &(pln->super.super); | |
505 } | |
506 | |
507 static void solver_destroy(solver *ego) | |
508 { | |
509 UNUSED(ego); | |
510 X(cell_deactivate_spes)(); | |
511 } | |
512 | |
513 static solver *mksolver(int cutdim) | |
514 { | |
515 static const solver_adt sadt = { PROBLEM_DFT, mkplan, solver_destroy }; | |
516 S *slv = MKSOLVER(S, &sadt); | |
517 slv->cutdim = cutdim; | |
518 X(cell_activate_spes)(); | |
519 return &(slv->super); | |
520 } | |
521 | |
522 void X(dft_direct_cell_register)(planner *p) | |
523 { | |
524 REGISTER_SOLVER(p, mksolver(0)); | |
525 REGISTER_SOLVER(p, mksolver(1)); | |
526 } | |
527 | |
528 /**************************************************************/ | |
529 /* solvers with twiddle factors: */ | |
530 | |
531 typedef struct { | |
532 plan_dftw super; | |
533 plan *cld; | |
534 } Pw; | |
535 | |
536 typedef struct { | |
537 ct_solver super; | |
538 int cutdim; | |
539 } Sw; | |
540 | |
541 static void destroyw(plan *ego_) | |
542 { | |
543 Pw *ego = (Pw *) ego_; | |
544 X(plan_destroy_internal)(ego->cld); | |
545 } | |
546 | |
547 static void printw(const plan *ego_, printer *p) | |
548 { | |
549 const Pw *ego = (const Pw *) ego_; | |
550 const P *cld = (const P *) ego->cld; | |
551 p->print(p, "(dftw-direct-cell-%D-%D%v%(%p%))", | |
552 cld->rw, cld->mw, cld->v[1].n1, | |
553 ego->cld); | |
554 } | |
555 | |
556 static void awakew(plan *ego_, enum wakefulness wakefulness) | |
557 { | |
558 Pw *ego = (Pw *) ego_; | |
559 X(plan_awake)(ego->cld, wakefulness); | |
560 } | |
561 | |
562 static void applyw(const plan *ego_, R *rio, R *iio) | |
563 { | |
564 const Pw *ego = (const Pw *) ego_; | |
565 dftapply cldapply = ((plan_dft *) ego->cld)->apply; | |
566 cldapply(ego->cld, rio, iio, rio, iio); | |
567 } | |
568 | |
569 static plan *mkcldw(const ct_solver *ego_, | |
570 INT r, INT irs, INT ors, | |
571 INT m, INT ms, | |
572 INT v, INT ivs, INT ovs, | |
573 INT mstart, INT mcount, | |
574 R *rio, R *iio, | |
575 planner *plnr) | |
576 { | |
577 const Sw *ego = (const Sw *)ego_; | |
578 const struct spu_radices *radices; | |
579 int sign; | |
580 Pw *pln; | |
581 P *cld; | |
582 struct cell_iodim vd[2]; | |
583 int dm = 0; | |
584 | |
585 static const plan_adt padtw = { | |
586 0, awakew, printw, destroyw | |
587 }; | |
588 | |
589 /* use only if cell is enabled */ | |
590 if (NO_SIMDP(plnr) || X(cell_nspe)() <= 0) | |
591 return 0; | |
592 | |
593 /* no way in hell this SPU stuff is going to work with pthreads */ | |
594 if (mstart != 0 || mcount != m) | |
595 return 0; | |
596 | |
597 /* don't bother for small N */ | |
598 if (r * m * v <= MAX_N / 16 /* ARBITRARY */) | |
599 return 0; | |
600 | |
601 /* check whether the R dimension is supported */ | |
602 radices = find_radices(rio, iio, rio, iio, r, &sign); | |
603 | |
604 if (!radices) | |
605 return 0; | |
606 | |
607 /* encode decimation in DM */ | |
608 switch (ego->super.dec) { | |
609 case DECDIT: | |
610 case DECDIT+TRANSPOSE: | |
611 dm = 1; | |
612 break; | |
613 case DECDIF: | |
614 case DECDIF+TRANSPOSE: | |
615 dm = -1; | |
616 break; | |
617 } | |
618 | |
619 if (!build_vdim(1, | |
620 r, irs, ors, | |
621 m, ms, ms, dm, | |
622 v, ivs, ovs, | |
623 vd, ego->cutdim)) | |
624 return 0; | |
625 | |
626 cld = MKPLAN_DFT(P, &padt, apply); | |
627 | |
628 cld->radices = *radices; | |
629 cld->n = r; | |
630 cld->is = irs; | |
631 cld->os = ors; | |
632 cld->sign = sign; | |
633 cld->W = 0; | |
634 | |
635 cld->rw = r; cld->mw = m; cld->td = 0; | |
636 | |
637 cld->v[0] = vd[0]; | |
638 cld->v[1] = vd[1]; | |
639 cld->cutdim = ego->cutdim; | |
640 | |
641 pln = MKPLAN_DFTW(Pw, &padtw, applyw); | |
642 pln->cld = &cld->super.super; | |
643 | |
644 cost_model(cld, &pln->super.super.ops); | |
645 | |
646 /* for twiddle factors: one mul and one fma per complex point */ | |
647 pln->super.super.ops.fma += (r * m * v) / VL; | |
648 pln->super.super.ops.mul += (r * m * v) / VL; | |
649 | |
650 /* FIXME: heuristics */ | |
651 /* pay penalty for large radices: */ | |
652 if (r > MAX_N / 16) | |
653 pln->super.super.ops.other += ((r - (MAX_N / 16)) * m * v); | |
654 | |
655 return &(pln->super.super); | |
656 } | |
657 | |
658 /* heuristic to enable vector recursion */ | |
659 static int force_vrecur(const ct_solver *ego, const problem_dft *p) | |
660 { | |
661 iodim *d; | |
662 INT n, r, m; | |
663 INT cutoff = 128; | |
664 | |
665 A(p->vecsz->rnk == 1); | |
666 A(p->sz->rnk == 1); | |
667 | |
668 n = p->sz->dims[0].n; | |
669 r = X(choose_radix)(ego->r, n); | |
670 m = n / r; | |
671 | |
672 d = p->vecsz->dims + 0; | |
673 return (1 | |
674 /* some vector dimension is contiguous */ | |
675 && (d->is == 2 || d->os == 2) | |
676 | |
677 /* vector is sufficiently long */ | |
678 && d->n >= cutoff | |
679 | |
680 /* transform is sufficiently long */ | |
681 && m >= cutoff | |
682 && r >= cutoff); | |
683 } | |
684 | |
685 static void regsolverw(planner *plnr, INT r, int dec, int cutdim) | |
686 { | |
687 Sw *slv = (Sw *)X(mksolver_ct)(sizeof(Sw), r, dec, mkcldw, force_vrecur); | |
688 slv->cutdim = cutdim; | |
689 REGISTER_SOLVER(plnr, &(slv->super.super)); | |
690 } | |
691 | |
692 void X(ct_cell_direct_register)(planner *p) | |
693 { | |
694 INT n; | |
695 | |
696 for (n = 0; n <= MAX_N; n += REQUIRE_N_MULTIPLE_OF) { | |
697 const struct spu_radices *r = | |
698 X(spu_radices) + n / REQUIRE_N_MULTIPLE_OF; | |
699 if (r->r[0]) { | |
700 regsolverw(p, n, DECDIT, 0); | |
701 regsolverw(p, n, DECDIT, 1); | |
702 regsolverw(p, n, DECDIF+TRANSPOSE, 0); | |
703 regsolverw(p, n, DECDIF+TRANSPOSE, 1); | |
704 } | |
705 } | |
706 } | |
707 | |
708 | |
709 #endif /* HAVE_CELL */ | |
710 | |
711 |