d@0
|
1 /*
|
d@0
|
2 * Copyright (c) 2007 Massachusetts Institute of Technology
|
d@0
|
3 *
|
d@0
|
4 * This program is free software; you can redistribute it and/or modify
|
d@0
|
5 * it under the terms of the GNU General Public License as published by
|
d@0
|
6 * the Free Software Foundation; either version 2 of the License, or
|
d@0
|
7 * (at your option) any later version.
|
d@0
|
8 *
|
d@0
|
9 * This program is distributed in the hope that it will be useful,
|
d@0
|
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
d@0
|
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
d@0
|
12 * GNU General Public License for more details.
|
d@0
|
13 *
|
d@0
|
14 * You should have received a copy of the GNU General Public License
|
d@0
|
15 * along with this program; if not, write to the Free Software
|
d@0
|
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
d@0
|
17 *
|
d@0
|
18 */
|
d@0
|
19
|
d@0
|
20 /* direct DFT solver via cell library */
|
d@0
|
21
|
d@0
|
22 #include "dft.h"
|
d@0
|
23 #include "ct.h"
|
d@0
|
24
|
d@0
|
25 #if HAVE_CELL
|
d@0
|
26
|
d@0
|
27 #include "simd.h"
|
d@0
|
28 #include "fftw-cell.h"
|
d@0
|
29
|
d@0
|
30 typedef struct {
|
d@0
|
31 solver super;
|
d@0
|
32 int cutdim;
|
d@0
|
33 } S;
|
d@0
|
34
|
d@0
|
35 typedef struct {
|
d@0
|
36 plan_dft super;
|
d@0
|
37 struct spu_radices radices;
|
d@0
|
38 /* strides expressed in reals */
|
d@0
|
39 INT n, is, os;
|
d@0
|
40 struct cell_iodim v[2];
|
d@0
|
41 int cutdim;
|
d@0
|
42 int sign;
|
d@0
|
43 int Wsz;
|
d@0
|
44 R *W;
|
d@0
|
45
|
d@0
|
46 /* optional twiddle factors for dftw: */
|
d@0
|
47 INT rw, mw; /* rw == 0 indicates no twiddle factors */
|
d@0
|
48 twid *td;
|
d@0
|
49 } P;
|
d@0
|
50
|
d@0
|
51
|
d@0
|
52 /* op counts of SPU codelets */
|
d@0
|
53 static const opcnt n_ops[33] = {
|
d@0
|
54 [2] = {2, 0, 0, 0},
|
d@0
|
55 [3] = {3, 1, 3, 0},
|
d@0
|
56 [4] = {6, 0, 2, 0},
|
d@0
|
57 [5] = {7, 2, 9, 0},
|
d@0
|
58 [6] = {12, 2, 6, 0},
|
d@0
|
59 [7] = {9, 3, 21, 0},
|
d@0
|
60 [8] = {16, 0, 10, 0},
|
d@0
|
61 [9] = {12, 4, 34, 0},
|
d@0
|
62 [10] = {24, 4, 18, 0},
|
d@0
|
63 [11] = {15, 5, 55, 0},
|
d@0
|
64 [12] = {30, 2, 18, 0},
|
d@0
|
65 [13] = {31, 6, 57, 0},
|
d@0
|
66 [14] = {32, 6, 42, 0},
|
d@0
|
67 [15] = {36, 7, 42, 0},
|
d@0
|
68 [16] = {38, 0, 34, 0},
|
d@0
|
69 [32] = {88, 0, 98, 0},
|
d@0
|
70 };
|
d@0
|
71
|
d@0
|
72 static const opcnt t_ops[33] = {
|
d@0
|
73 [2] = {3, 2, 0, 0},
|
d@0
|
74 [3] = {5, 5, 3, 0},
|
d@0
|
75 [4] = {9, 6, 2, 0},
|
d@0
|
76 [5] = {11, 10, 9, 0},
|
d@0
|
77 [6] = {17, 12, 6, 0},
|
d@0
|
78 [7] = {15, 15, 21, 0},
|
d@0
|
79 [8] = {23, 14, 10, 0},
|
d@0
|
80 [9] = {20, 20, 34, 0},
|
d@0
|
81 [10] = {33, 22, 18, 0},
|
d@0
|
82 [12] = {41, 24, 18, 0},
|
d@0
|
83 [15] = {50, 35, 42, 0},
|
d@0
|
84 [16] = {53, 30, 34, 0},
|
d@0
|
85 [32] = {119, 62, 98, 0},
|
d@0
|
86 };
|
d@0
|
87
|
d@0
|
88 static void compute_opcnt(const struct spu_radices *p,
|
d@0
|
89 INT n, INT v, opcnt *ops)
|
d@0
|
90 {
|
d@0
|
91 INT r;
|
d@0
|
92 signed char *q;
|
d@0
|
93
|
d@0
|
94 X(ops_zero)(ops);
|
d@0
|
95
|
d@0
|
96 for (q = p->r; (r = *q) > 0; ++q)
|
d@0
|
97 X(ops_madd2)(v * (n / r) / VL, &t_ops[r], ops);
|
d@0
|
98
|
d@0
|
99 X(ops_madd2)(v * (n / (-r)) / VL, &n_ops[-r], ops);
|
d@0
|
100 }
|
d@0
|
101
|
d@0
|
102 static INT extent(struct cell_iodim *d)
|
d@0
|
103 {
|
d@0
|
104 return d->n1 - d->n0;
|
d@0
|
105 }
|
d@0
|
106
|
d@0
|
107 /* FIXME: this is totally broken */
|
d@0
|
108 static void cost_model(const P *pln, opcnt *ops)
|
d@0
|
109 {
|
d@0
|
110 INT r = pln->n;
|
d@0
|
111 INT v0 = extent(pln->v + 0);
|
d@0
|
112 INT v1 = extent(pln->v + 1);
|
d@0
|
113
|
d@0
|
114 compute_opcnt(&pln->radices, r, v0 * v1, ops);
|
d@0
|
115
|
d@0
|
116 /* penalize cuts across short dimensions */
|
d@0
|
117 if (extent(pln->v + pln->cutdim) < extent(pln->v + 1 - pln->cutdim))
|
d@0
|
118 ops->other += 3.14159;
|
d@0
|
119 }
|
d@0
|
120
|
d@0
|
121 /* expressed in real numbers */
|
d@0
|
122 static INT compute_twiddle_size(const struct spu_radices *p, INT n)
|
d@0
|
123 {
|
d@0
|
124 INT sz = 0;
|
d@0
|
125 INT r;
|
d@0
|
126 signed char *q;
|
d@0
|
127
|
d@0
|
128 for (q = p->r; (r = *q) > 0; ++q) {
|
d@0
|
129 n /= r;
|
d@0
|
130 sz += 2 * (r - 1) * n;
|
d@0
|
131 }
|
d@0
|
132
|
d@0
|
133 return sz;
|
d@0
|
134 }
|
d@0
|
135
|
d@0
|
136 /* FIXME: find a way to use the normal FFTW twiddle mechanisms for this */
|
d@0
|
137 static void fill_twiddles(enum wakefulness wakefulness,
|
d@0
|
138 R *W, const signed char *q, INT n)
|
d@0
|
139 {
|
d@0
|
140 INT r;
|
d@0
|
141
|
d@0
|
142 for ( ; (r = *q) > 0; ++q) {
|
d@0
|
143 triggen *t = X(mktriggen)(wakefulness, n);
|
d@0
|
144 INT i, j, v, m = n / r;
|
d@0
|
145
|
d@0
|
146 for (j = 0; j < m; j += VL) {
|
d@0
|
147 for (i = 1; i < r; ++i) {
|
d@0
|
148 for (v = 0; v < VL; ++v) {
|
d@0
|
149 t->cexp(t, i * (j + v), W);
|
d@0
|
150 W += 2;
|
d@0
|
151 }
|
d@0
|
152 }
|
d@0
|
153 }
|
d@0
|
154 X(triggen_destroy)(t);
|
d@0
|
155 n = m;
|
d@0
|
156 }
|
d@0
|
157 }
|
d@0
|
158
|
d@0
|
159 static R *make_twiddles(enum wakefulness wakefulness,
|
d@0
|
160 const struct spu_radices *p, INT n, int *Wsz)
|
d@0
|
161 {
|
d@0
|
162 INT sz = compute_twiddle_size(p, n);
|
d@0
|
163 R *W = X(cell_aligned_malloc)(sz * sizeof(R));
|
d@0
|
164 A(FITS_IN_INT(sz));
|
d@0
|
165 *Wsz = sz;
|
d@0
|
166 fill_twiddles(wakefulness, W, p->r, n);
|
d@0
|
167 return W;
|
d@0
|
168 }
|
d@0
|
169
|
d@0
|
170 static int fits_in_local_store(INT n, INT v)
|
d@0
|
171 {
|
d@0
|
172 /* the SPU has space for 3 * MAX_N complex numbers. We need
|
d@0
|
173 n*(v+1) for data plus n for twiddle factors. */
|
d@0
|
174 return n * (v+2) <= 3 * MAX_N;
|
d@0
|
175 }
|
d@0
|
176
|
d@0
|
177 static void apply(const plan *ego_, R *ri, R *ii, R *ro, R *io)
|
d@0
|
178 {
|
d@0
|
179 const P *ego = (const P *) ego_;
|
d@0
|
180 R *xi, *xo;
|
d@0
|
181 int i, v;
|
d@0
|
182 int nspe = X(cell_nspe)();
|
d@0
|
183 int cutdim = ego->cutdim;
|
d@0
|
184 int contiguous_r = ((ego->is == 2) && (ego->os == 2));
|
d@0
|
185
|
d@0
|
186 /* find pointer to beginning of data, depending on sign */
|
d@0
|
187 if (ego->sign == FFT_SIGN) {
|
d@0
|
188 xi = ri; xo = ro;
|
d@0
|
189 } else {
|
d@0
|
190 xi = ii; xo = io;
|
d@0
|
191 }
|
d@0
|
192
|
d@0
|
193 /* fill contexts */
|
d@0
|
194 v = ego->v[cutdim].n1;
|
d@0
|
195
|
d@0
|
196 for (i = 0; i < nspe; ++i) {
|
d@0
|
197 int chunk;
|
d@0
|
198 struct spu_context *ctx = X(cell_get_ctx)(i);
|
d@0
|
199 struct dft_context *dft = &ctx->u.dft;
|
d@0
|
200
|
d@0
|
201 ctx->op = FFTW_SPE_DFT;
|
d@0
|
202
|
d@0
|
203 dft->r = ego->radices;
|
d@0
|
204 dft->n = ego->n;
|
d@0
|
205 dft->is_bytes = ego->is * sizeof(R);
|
d@0
|
206 dft->os_bytes = ego->os * sizeof(R);
|
d@0
|
207 dft->v[0] = ego->v[0];
|
d@0
|
208 dft->v[1] = ego->v[1];
|
d@0
|
209 dft->sign = ego->sign;
|
d@0
|
210 A(FITS_IN_INT(ego->Wsz * sizeof(R)));
|
d@0
|
211 dft->Wsz_bytes = ego->Wsz * sizeof(R);
|
d@0
|
212 dft->W = (uintptr_t)ego->W;
|
d@0
|
213 dft->xi = (uintptr_t)xi;
|
d@0
|
214 dft->xo = (uintptr_t)xo;
|
d@0
|
215
|
d@0
|
216 /* partition v into pieces of equal size, subject to alignment
|
d@0
|
217 constraints */
|
d@0
|
218 if (cutdim == 0 && !contiguous_r) {
|
d@0
|
219 /* CUTDIM = 0 and the SPU uses transposed DMA.
|
d@0
|
220 We must preserve the alignment of the dimension 0 in the
|
d@0
|
221 cut */
|
d@0
|
222 chunk = VL * ((v - ego->v[cutdim].n0) / (VL * (nspe - i)));
|
d@0
|
223 } else {
|
d@0
|
224 chunk = (v - ego->v[cutdim].n0) / (nspe - i);
|
d@0
|
225 }
|
d@0
|
226
|
d@0
|
227 dft->v[cutdim].n1 = v;
|
d@0
|
228 v -= chunk;
|
d@0
|
229 dft->v[cutdim].n0 = v;
|
d@0
|
230
|
d@0
|
231 /* optional dftw twiddles */
|
d@0
|
232 if (ego->rw)
|
d@0
|
233 dft->Ww = (uintptr_t)ego->td->W;
|
d@0
|
234 }
|
d@0
|
235
|
d@0
|
236 A(v == ego->v[cutdim].n0);
|
d@0
|
237
|
d@0
|
238 /* activate spe's */
|
d@0
|
239 X(cell_spe_awake_all)();
|
d@0
|
240
|
d@0
|
241 /* wait for completion */
|
d@0
|
242 X(cell_spe_wait_all)();
|
d@0
|
243 }
|
d@0
|
244
|
d@0
|
245 static void print(const plan *ego_, printer *p)
|
d@0
|
246 {
|
d@0
|
247 const P *ego = (const P *) ego_;
|
d@0
|
248 int i;
|
d@0
|
249 p->print(p, "(dft-direct-cell-%D/%d", ego->n, ego->cutdim);
|
d@0
|
250 for (i = 0; i < 2; ++i)
|
d@0
|
251 p->print(p, "%v", (INT)ego->v[i].n1);
|
d@0
|
252 p->print(p, ")");
|
d@0
|
253 }
|
d@0
|
254
|
d@0
|
255 static void awake(plan *ego_, enum wakefulness wakefulness)
|
d@0
|
256 {
|
d@0
|
257 P *ego = (P *) ego_;
|
d@0
|
258
|
d@0
|
259 /* awake the optional dftw twiddles */
|
d@0
|
260 if (ego->rw) {
|
d@0
|
261 static const tw_instr tw[] = {
|
d@0
|
262 { TW_CEXP, 0, 0 },
|
d@0
|
263 { TW_FULL, 0, 0 },
|
d@0
|
264 { TW_NEXT, 1, 0 }
|
d@0
|
265 };
|
d@0
|
266 X(twiddle_awake)(wakefulness, &ego->td, tw,
|
d@0
|
267 ego->rw * ego->mw, ego->rw, ego->mw);
|
d@0
|
268 }
|
d@0
|
269
|
d@0
|
270 /* awake the twiddles for the dft part */
|
d@0
|
271 switch (wakefulness) {
|
d@0
|
272 case SLEEPY:
|
d@0
|
273 free(ego->W);
|
d@0
|
274 ego->W = 0;
|
d@0
|
275 break;
|
d@0
|
276 default:
|
d@0
|
277 ego->W = make_twiddles(wakefulness, &ego->radices,
|
d@0
|
278 ego->n, &ego->Wsz);
|
d@0
|
279 break;
|
d@0
|
280 }
|
d@0
|
281 }
|
d@0
|
282
|
d@0
|
283 static int contiguous_or_aligned_p(INT s_bytes)
|
d@0
|
284 {
|
d@0
|
285 return (s_bytes == 2 * sizeof(R)) || ((s_bytes % ALIGNMENTA) == 0);
|
d@0
|
286 }
|
d@0
|
287
|
d@0
|
288 static int build_vdim(int inplacep,
|
d@0
|
289 INT r, INT irs, INT ors,
|
d@0
|
290 INT m, INT ims, INT oms, int dm,
|
d@0
|
291 INT v, INT ivs, INT ovs,
|
d@0
|
292 struct cell_iodim vd[2], int cutdim)
|
d@0
|
293 {
|
d@0
|
294 int vm, vv;
|
d@0
|
295 int contiguous_r = ((irs == 2) && (ors == 2));
|
d@0
|
296
|
d@0
|
297 /* 32-bit overflow? */
|
d@0
|
298 if (!(1
|
d@0
|
299 && FITS_IN_INT(r)
|
d@0
|
300 && FITS_IN_INT(irs * sizeof(R))
|
d@0
|
301 && FITS_IN_INT(ors * sizeof(R))
|
d@0
|
302 && FITS_IN_INT(m)
|
d@0
|
303 && FITS_IN_INT(ims * sizeof(R))
|
d@0
|
304 && FITS_IN_INT(oms * sizeof(R))
|
d@0
|
305 && FITS_IN_INT(v)
|
d@0
|
306 && FITS_IN_INT(ivs * sizeof(R))
|
d@0
|
307 && FITS_IN_INT(ovs * sizeof(R))))
|
d@0
|
308 return 0;
|
d@0
|
309
|
d@0
|
310 /* R dimension must be aligned in all cases */
|
d@0
|
311 if (!(1
|
d@0
|
312 && r % VL == 0 /* REDUNDANT */
|
d@0
|
313 && contiguous_or_aligned_p(irs * sizeof(R))
|
d@0
|
314 && contiguous_or_aligned_p(ors * sizeof(R))))
|
d@0
|
315 return 0;
|
d@0
|
316
|
d@0
|
317 if ((irs == 2 || ims == 2) && (ors == 2 || oms == 2)) {
|
d@0
|
318 /* Case 1: in SPU, let N=R, V0=M, V1=V */
|
d@0
|
319 vm = 0;
|
d@0
|
320 vv = 1;
|
d@0
|
321 } else if ((irs == 2 || ivs == 2) && (ors == 2 || ovs == 2)) {
|
d@0
|
322 /* Case 2: in SPU, let N=R, V0=V, V1=M */
|
d@0
|
323 vm = 1;
|
d@0
|
324 vv = 0;
|
d@0
|
325 } else {
|
d@0
|
326 /* can't do it */
|
d@0
|
327 return 0;
|
d@0
|
328 }
|
d@0
|
329
|
d@0
|
330 vd[vm].n0 = 0; vd[vm].n1 = m;
|
d@0
|
331 vd[vm].is_bytes = ims * sizeof(R); vd[vm].os_bytes = oms * sizeof(R);
|
d@0
|
332 vd[vm].dm = dm;
|
d@0
|
333
|
d@0
|
334 vd[vv].n0 = 0; vd[vv].n1 = v;
|
d@0
|
335 vd[vv].is_bytes = ivs * sizeof(R); vd[vv].os_bytes = ovs * sizeof(R);
|
d@0
|
336 vd[vv].dm = 0;
|
d@0
|
337
|
d@0
|
338 /* Restrictions on the size of the SPU local store: */
|
d@0
|
339 if (!(0
|
d@0
|
340 /* for contiguous I/O, one array of size R must fit into
|
d@0
|
341 local store. (The fits_in_local_store() check is
|
d@0
|
342 redundant because R <= MAX_N holds, but we check anyway
|
d@0
|
343 for clarity */
|
d@0
|
344 || (contiguous_r && fits_in_local_store(r, 1))
|
d@0
|
345
|
d@0
|
346 /* for noncontiguous I/O, VL arrays of size R must fit into
|
d@0
|
347 local store because of transposed DMA */
|
d@0
|
348 || fits_in_local_store(r, VL)))
|
d@0
|
349 return 0;
|
d@0
|
350
|
d@0
|
351 /* SPU DMA restrictions: */
|
d@0
|
352 if (!(1
|
d@0
|
353 /* If R is noncontiguous, then the SPU uses transposed DMA
|
d@0
|
354 and therefore dimension 0 must be aligned */
|
d@0
|
355 && (contiguous_r || vd[0].n1 % VL == 0)
|
d@0
|
356
|
d@0
|
357 /* dimension 1 is arbitrary */
|
d@0
|
358
|
d@0
|
359 /* dimension-0 strides must be either contiguous or aligned */
|
d@0
|
360 && contiguous_or_aligned_p((INT)vd[0].is_bytes)
|
d@0
|
361 && contiguous_or_aligned_p((INT)vd[0].os_bytes)
|
d@0
|
362
|
d@0
|
363 /* dimension-1 strides must be aligned */
|
d@0
|
364 && ((vd[1].is_bytes % ALIGNMENTA) == 0)
|
d@0
|
365 && ((vd[1].os_bytes % ALIGNMENTA) == 0)
|
d@0
|
366 ))
|
d@0
|
367 return 0;
|
d@0
|
368
|
d@0
|
369 /* see if we can do it without overwriting the input with itself */
|
d@0
|
370 if (!(0
|
d@0
|
371 /* can operate out-of-place */
|
d@0
|
372 || !inplacep
|
d@0
|
373
|
d@0
|
374 /* all strides are in-place */
|
d@0
|
375 || (1
|
d@0
|
376 && irs == ors
|
d@0
|
377 && ims == oms
|
d@0
|
378 && ivs == ovs)
|
d@0
|
379
|
d@0
|
380 /* we cut across in-place dimension 1, and dimension 0 fits
|
d@0
|
381 into local store */
|
d@0
|
382 || (1
|
d@0
|
383 && cutdim == 1
|
d@0
|
384 && vd[cutdim].is_bytes == vd[cutdim].os_bytes
|
d@0
|
385 && fits_in_local_store(r, extent(vd + 0)))
|
d@0
|
386 ))
|
d@0
|
387 return 0;
|
d@0
|
388
|
d@0
|
389 return 1;
|
d@0
|
390 }
|
d@0
|
391
|
d@0
|
392 static
|
d@0
|
393 const struct spu_radices *find_radices(R *ri, R *ii, R *ro, R *io,
|
d@0
|
394 INT n, int *sign)
|
d@0
|
395 {
|
d@0
|
396 const struct spu_radices *p;
|
d@0
|
397 R *xi, *xo;
|
d@0
|
398
|
d@0
|
399 /* 32-bit overflow? */
|
d@0
|
400 if (!FITS_IN_INT(n))
|
d@0
|
401 return 0;
|
d@0
|
402
|
d@0
|
403 /* valid n? */
|
d@0
|
404 if (n <= 0 || n > MAX_N || ((n % REQUIRE_N_MULTIPLE_OF) != 0))
|
d@0
|
405 return 0;
|
d@0
|
406
|
d@0
|
407 /* see if we have a plan for this N */
|
d@0
|
408 p = X(spu_radices) + n / REQUIRE_N_MULTIPLE_OF;
|
d@0
|
409 if (!p->r[0])
|
d@0
|
410 return 0;
|
d@0
|
411
|
d@0
|
412 /* check whether the data format is supported */
|
d@0
|
413 if (ii == ri + 1 && io == ro + 1) { /* R I R I ... format */
|
d@0
|
414 *sign = FFT_SIGN;
|
d@0
|
415 xi = ri; xo = ro;
|
d@0
|
416 } else if (ri == ii + 1 && ro == io + 1) { /* I R I R ... format */
|
d@0
|
417 *sign = -FFT_SIGN;
|
d@0
|
418 xi = ii; xo = io;
|
d@0
|
419 } else
|
d@0
|
420 return 0; /* can't do it */
|
d@0
|
421
|
d@0
|
422 if (!ALIGNEDA(xi) || !ALIGNEDA(xo))
|
d@0
|
423 return 0;
|
d@0
|
424
|
d@0
|
425 return p;
|
d@0
|
426 }
|
d@0
|
427
|
d@0
|
428 static const plan_adt padt = {
|
d@0
|
429 X(dft_solve), awake, print, X(plan_null_destroy)
|
d@0
|
430 };
|
d@0
|
431
|
d@0
|
432 static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
|
d@0
|
433 {
|
d@0
|
434 P *pln;
|
d@0
|
435 const S *ego = (const S *)ego_;
|
d@0
|
436 const problem_dft *p = (const problem_dft *) p_;
|
d@0
|
437 int sign;
|
d@0
|
438 const struct spu_radices *radices;
|
d@0
|
439 struct cell_iodim vd[2];
|
d@0
|
440 INT m, ims, oms, v, ivs, ovs;
|
d@0
|
441
|
d@0
|
442 /* basic conditions */
|
d@0
|
443 if (!(1
|
d@0
|
444 && X(cell_nspe)() > 0
|
d@0
|
445 && p->sz->rnk == 1
|
d@0
|
446 && p->vecsz->rnk <= 2
|
d@0
|
447 && !NO_SIMDP(plnr)
|
d@0
|
448 ))
|
d@0
|
449 return 0;
|
d@0
|
450
|
d@0
|
451 /* see if SPU supports N */
|
d@0
|
452 {
|
d@0
|
453 iodim *d = p->sz->dims;
|
d@0
|
454 radices = find_radices(p->ri, p->ii, p->ro, p->io, d[0].n, &sign);
|
d@0
|
455 if (!radices)
|
d@0
|
456 return 0;
|
d@0
|
457 }
|
d@0
|
458
|
d@0
|
459 /* canonicalize to vrank 2 */
|
d@0
|
460 if (p->vecsz->rnk >= 1) {
|
d@0
|
461 iodim *d = p->vecsz->dims + 0;
|
d@0
|
462 m = d->n; ims = d->is; oms = d->os;
|
d@0
|
463 } else {
|
d@0
|
464 m = 1; ims = oms = 0;
|
d@0
|
465 }
|
d@0
|
466
|
d@0
|
467 if (p->vecsz->rnk >= 2) {
|
d@0
|
468 iodim *d = p->vecsz->dims + 1;
|
d@0
|
469 v = d->n; ivs = d->is; ovs = d->os;
|
d@0
|
470 } else {
|
d@0
|
471 v = 1; ivs = ovs = 0;
|
d@0
|
472 }
|
d@0
|
473
|
d@0
|
474 /* see if strides are supported by the SPU DMA routine */
|
d@0
|
475 {
|
d@0
|
476 iodim *d = p->sz->dims + 0;
|
d@0
|
477 if (!build_vdim(p->ri == p->ro,
|
d@0
|
478 d->n, d->is, d->os,
|
d@0
|
479 m, ims, oms, 0,
|
d@0
|
480 v, ivs, ovs,
|
d@0
|
481 vd, ego->cutdim))
|
d@0
|
482 return 0;
|
d@0
|
483 }
|
d@0
|
484
|
d@0
|
485 pln = MKPLAN_DFT(P, &padt, apply);
|
d@0
|
486
|
d@0
|
487 pln->radices = *radices;
|
d@0
|
488 {
|
d@0
|
489 iodim *d = p->sz->dims + 0;
|
d@0
|
490 pln->n = d[0].n;
|
d@0
|
491 pln->is = d[0].is;
|
d@0
|
492 pln->os = d[0].os;
|
d@0
|
493 }
|
d@0
|
494 pln->sign = sign;
|
d@0
|
495 pln->v[0] = vd[0];
|
d@0
|
496 pln->v[1] = vd[1];
|
d@0
|
497 pln->cutdim = ego->cutdim;
|
d@0
|
498 pln->W = 0;
|
d@0
|
499
|
d@0
|
500 pln->rw = 0;
|
d@0
|
501
|
d@0
|
502 cost_model(pln, &pln->super.super.ops);
|
d@0
|
503
|
d@0
|
504 return &(pln->super.super);
|
d@0
|
505 }
|
d@0
|
506
|
d@0
|
507 static void solver_destroy(solver *ego)
|
d@0
|
508 {
|
d@0
|
509 UNUSED(ego);
|
d@0
|
510 X(cell_deactivate_spes)();
|
d@0
|
511 }
|
d@0
|
512
|
d@0
|
513 static solver *mksolver(int cutdim)
|
d@0
|
514 {
|
d@0
|
515 static const solver_adt sadt = { PROBLEM_DFT, mkplan, solver_destroy };
|
d@0
|
516 S *slv = MKSOLVER(S, &sadt);
|
d@0
|
517 slv->cutdim = cutdim;
|
d@0
|
518 X(cell_activate_spes)();
|
d@0
|
519 return &(slv->super);
|
d@0
|
520 }
|
d@0
|
521
|
d@0
|
522 void X(dft_direct_cell_register)(planner *p)
|
d@0
|
523 {
|
d@0
|
524 REGISTER_SOLVER(p, mksolver(0));
|
d@0
|
525 REGISTER_SOLVER(p, mksolver(1));
|
d@0
|
526 }
|
d@0
|
527
|
d@0
|
528 /**************************************************************/
|
d@0
|
529 /* solvers with twiddle factors: */
|
d@0
|
530
|
d@0
|
531 typedef struct {
|
d@0
|
532 plan_dftw super;
|
d@0
|
533 plan *cld;
|
d@0
|
534 } Pw;
|
d@0
|
535
|
d@0
|
536 typedef struct {
|
d@0
|
537 ct_solver super;
|
d@0
|
538 int cutdim;
|
d@0
|
539 } Sw;
|
d@0
|
540
|
d@0
|
541 static void destroyw(plan *ego_)
|
d@0
|
542 {
|
d@0
|
543 Pw *ego = (Pw *) ego_;
|
d@0
|
544 X(plan_destroy_internal)(ego->cld);
|
d@0
|
545 }
|
d@0
|
546
|
d@0
|
547 static void printw(const plan *ego_, printer *p)
|
d@0
|
548 {
|
d@0
|
549 const Pw *ego = (const Pw *) ego_;
|
d@0
|
550 const P *cld = (const P *) ego->cld;
|
d@0
|
551 p->print(p, "(dftw-direct-cell-%D-%D%v%(%p%))",
|
d@0
|
552 cld->rw, cld->mw, cld->v[1].n1,
|
d@0
|
553 ego->cld);
|
d@0
|
554 }
|
d@0
|
555
|
d@0
|
556 static void awakew(plan *ego_, enum wakefulness wakefulness)
|
d@0
|
557 {
|
d@0
|
558 Pw *ego = (Pw *) ego_;
|
d@0
|
559 X(plan_awake)(ego->cld, wakefulness);
|
d@0
|
560 }
|
d@0
|
561
|
d@0
|
562 static void applyw(const plan *ego_, R *rio, R *iio)
|
d@0
|
563 {
|
d@0
|
564 const Pw *ego = (const Pw *) ego_;
|
d@0
|
565 dftapply cldapply = ((plan_dft *) ego->cld)->apply;
|
d@0
|
566 cldapply(ego->cld, rio, iio, rio, iio);
|
d@0
|
567 }
|
d@0
|
568
|
d@0
|
569 static plan *mkcldw(const ct_solver *ego_,
|
d@0
|
570 INT r, INT irs, INT ors,
|
d@0
|
571 INT m, INT ms,
|
d@0
|
572 INT v, INT ivs, INT ovs,
|
d@0
|
573 INT mstart, INT mcount,
|
d@0
|
574 R *rio, R *iio,
|
d@0
|
575 planner *plnr)
|
d@0
|
576 {
|
d@0
|
577 const Sw *ego = (const Sw *)ego_;
|
d@0
|
578 const struct spu_radices *radices;
|
d@0
|
579 int sign;
|
d@0
|
580 Pw *pln;
|
d@0
|
581 P *cld;
|
d@0
|
582 struct cell_iodim vd[2];
|
d@0
|
583 int dm = 0;
|
d@0
|
584
|
d@0
|
585 static const plan_adt padtw = {
|
d@0
|
586 0, awakew, printw, destroyw
|
d@0
|
587 };
|
d@0
|
588
|
d@0
|
589 /* use only if cell is enabled */
|
d@0
|
590 if (NO_SIMDP(plnr) || X(cell_nspe)() <= 0)
|
d@0
|
591 return 0;
|
d@0
|
592
|
d@0
|
593 /* no way in hell this SPU stuff is going to work with pthreads */
|
d@0
|
594 if (mstart != 0 || mcount != m)
|
d@0
|
595 return 0;
|
d@0
|
596
|
d@0
|
597 /* don't bother for small N */
|
d@0
|
598 if (r * m * v <= MAX_N / 16 /* ARBITRARY */)
|
d@0
|
599 return 0;
|
d@0
|
600
|
d@0
|
601 /* check whether the R dimension is supported */
|
d@0
|
602 radices = find_radices(rio, iio, rio, iio, r, &sign);
|
d@0
|
603
|
d@0
|
604 if (!radices)
|
d@0
|
605 return 0;
|
d@0
|
606
|
d@0
|
607 /* encode decimation in DM */
|
d@0
|
608 switch (ego->super.dec) {
|
d@0
|
609 case DECDIT:
|
d@0
|
610 case DECDIT+TRANSPOSE:
|
d@0
|
611 dm = 1;
|
d@0
|
612 break;
|
d@0
|
613 case DECDIF:
|
d@0
|
614 case DECDIF+TRANSPOSE:
|
d@0
|
615 dm = -1;
|
d@0
|
616 break;
|
d@0
|
617 }
|
d@0
|
618
|
d@0
|
619 if (!build_vdim(1,
|
d@0
|
620 r, irs, ors,
|
d@0
|
621 m, ms, ms, dm,
|
d@0
|
622 v, ivs, ovs,
|
d@0
|
623 vd, ego->cutdim))
|
d@0
|
624 return 0;
|
d@0
|
625
|
d@0
|
626 cld = MKPLAN_DFT(P, &padt, apply);
|
d@0
|
627
|
d@0
|
628 cld->radices = *radices;
|
d@0
|
629 cld->n = r;
|
d@0
|
630 cld->is = irs;
|
d@0
|
631 cld->os = ors;
|
d@0
|
632 cld->sign = sign;
|
d@0
|
633 cld->W = 0;
|
d@0
|
634
|
d@0
|
635 cld->rw = r; cld->mw = m; cld->td = 0;
|
d@0
|
636
|
d@0
|
637 cld->v[0] = vd[0];
|
d@0
|
638 cld->v[1] = vd[1];
|
d@0
|
639 cld->cutdim = ego->cutdim;
|
d@0
|
640
|
d@0
|
641 pln = MKPLAN_DFTW(Pw, &padtw, applyw);
|
d@0
|
642 pln->cld = &cld->super.super;
|
d@0
|
643
|
d@0
|
644 cost_model(cld, &pln->super.super.ops);
|
d@0
|
645
|
d@0
|
646 /* for twiddle factors: one mul and one fma per complex point */
|
d@0
|
647 pln->super.super.ops.fma += (r * m * v) / VL;
|
d@0
|
648 pln->super.super.ops.mul += (r * m * v) / VL;
|
d@0
|
649
|
d@0
|
650 /* FIXME: heuristics */
|
d@0
|
651 /* pay penalty for large radices: */
|
d@0
|
652 if (r > MAX_N / 16)
|
d@0
|
653 pln->super.super.ops.other += ((r - (MAX_N / 16)) * m * v);
|
d@0
|
654
|
d@0
|
655 return &(pln->super.super);
|
d@0
|
656 }
|
d@0
|
657
|
d@0
|
658 /* heuristic to enable vector recursion */
|
d@0
|
659 static int force_vrecur(const ct_solver *ego, const problem_dft *p)
|
d@0
|
660 {
|
d@0
|
661 iodim *d;
|
d@0
|
662 INT n, r, m;
|
d@0
|
663 INT cutoff = 128;
|
d@0
|
664
|
d@0
|
665 A(p->vecsz->rnk == 1);
|
d@0
|
666 A(p->sz->rnk == 1);
|
d@0
|
667
|
d@0
|
668 n = p->sz->dims[0].n;
|
d@0
|
669 r = X(choose_radix)(ego->r, n);
|
d@0
|
670 m = n / r;
|
d@0
|
671
|
d@0
|
672 d = p->vecsz->dims + 0;
|
d@0
|
673 return (1
|
d@0
|
674 /* some vector dimension is contiguous */
|
d@0
|
675 && (d->is == 2 || d->os == 2)
|
d@0
|
676
|
d@0
|
677 /* vector is sufficiently long */
|
d@0
|
678 && d->n >= cutoff
|
d@0
|
679
|
d@0
|
680 /* transform is sufficiently long */
|
d@0
|
681 && m >= cutoff
|
d@0
|
682 && r >= cutoff);
|
d@0
|
683 }
|
d@0
|
684
|
d@0
|
685 static void regsolverw(planner *plnr, INT r, int dec, int cutdim)
|
d@0
|
686 {
|
d@0
|
687 Sw *slv = (Sw *)X(mksolver_ct)(sizeof(Sw), r, dec, mkcldw, force_vrecur);
|
d@0
|
688 slv->cutdim = cutdim;
|
d@0
|
689 REGISTER_SOLVER(plnr, &(slv->super.super));
|
d@0
|
690 }
|
d@0
|
691
|
d@0
|
692 void X(ct_cell_direct_register)(planner *p)
|
d@0
|
693 {
|
d@0
|
694 INT n;
|
d@0
|
695
|
d@0
|
696 for (n = 0; n <= MAX_N; n += REQUIRE_N_MULTIPLE_OF) {
|
d@0
|
697 const struct spu_radices *r =
|
d@0
|
698 X(spu_radices) + n / REQUIRE_N_MULTIPLE_OF;
|
d@0
|
699 if (r->r[0]) {
|
d@0
|
700 regsolverw(p, n, DECDIT, 0);
|
d@0
|
701 regsolverw(p, n, DECDIT, 1);
|
d@0
|
702 regsolverw(p, n, DECDIF+TRANSPOSE, 0);
|
d@0
|
703 regsolverw(p, n, DECDIF+TRANSPOSE, 1);
|
d@0
|
704 }
|
d@0
|
705 }
|
d@0
|
706 }
|
d@0
|
707
|
d@0
|
708
|
d@0
|
709 #endif /* HAVE_CELL */
|
d@0
|
710
|
d@0
|
711
|