To check out this repository please hg clone the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

The primary repository for this project is hosted at https://github.com/sonic-visualiser/sv-dependency-builds .
This repository is a read-only copy which is updated automatically every hour.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / src / fftw-3.3.8 / dft / dftw-direct.c @ 167:bd3cc4d1df30

History | View | Annotate | Download (8.96 KB)

1
/*
2
 * Copyright (c) 2003, 2007-14 Matteo Frigo
3
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4
 *
5
 * This program is free software; you can redistribute it and/or modify
6
 * it under the terms of the GNU General Public License as published by
7
 * the Free Software Foundation; either version 2 of the License, or
8
 * (at your option) any later version.
9
 *
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU General Public License
16
 * along with this program; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18
 *
19
 */
20

    
21

    
22
#include "dft/ct.h"
23

    
24
typedef struct {
25
     ct_solver super;
26
     const ct_desc *desc;
27
     int bufferedp;
28
     kdftw k;
29
} S;
30

    
31
typedef struct {
32
     plan_dftw super;
33
     kdftw k;
34
     INT r;
35
     stride rs;
36
     INT m, ms, v, vs, mb, me, extra_iter;
37
     stride brs;
38
     twid *td;
39
     const S *slv;
40
} P;
41

    
42

    
43
/*************************************************************
44
  Nonbuffered code
45
 *************************************************************/
46
static void apply(const plan *ego_, R *rio, R *iio)
47
{
48
     const P *ego = (const P *) ego_;
49
     INT i;
50
     ASSERT_ALIGNED_DOUBLE;
51
     for (i = 0; i < ego->v; ++i, rio += ego->vs, iio += ego->vs) {
52
          INT  mb = ego->mb, ms = ego->ms;
53
          ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, 
54
                 ego->rs, mb, ego->me, ms);
55
     }
56
}
57

    
58
static void apply_extra_iter(const plan *ego_, R *rio, R *iio)
59
{
60
     const P *ego = (const P *) ego_;
61
     INT i, v = ego->v, vs = ego->vs;
62
     INT mb = ego->mb, me = ego->me, mm = me - 1, ms = ego->ms;
63
     ASSERT_ALIGNED_DOUBLE;
64
     for (i = 0; i < v; ++i, rio += vs, iio += vs) {
65
          ego->k(rio + mb*ms, iio + mb*ms, ego->td->W, 
66
                 ego->rs, mb, mm, ms);
67
          ego->k(rio + mm*ms, iio + mm*ms, ego->td->W, 
68
                 ego->rs, mm, mm+2, 0);
69
     }
70
}
71

    
72
/*************************************************************
73
  Buffered code
74
 *************************************************************/
75
static void dobatch(const P *ego, R *rA, R *iA, INT mb, INT me, R *buf)
76
{
77
     INT brs = WS(ego->brs, 1);
78
     INT rs = WS(ego->rs, 1);
79
     INT ms = ego->ms;
80

    
81
     X(cpy2d_pair_ci)(rA + mb*ms, iA + mb*ms, buf, buf + 1,
82
                      ego->r, rs, brs,
83
                      me - mb, ms, 2);
84
     ego->k(buf, buf + 1, ego->td->W, ego->brs, mb, me, 2);
85
     X(cpy2d_pair_co)(buf, buf + 1, rA + mb*ms, iA + mb*ms,
86
                      ego->r, brs, rs,
87
                      me - mb, 2, ms);
88
}
89

    
90
/* must be even for SIMD alignment; should not be 2^k to avoid
91
   associativity conflicts */
92
static INT compute_batchsize(INT radix)
93
{
94
     /* round up to multiple of 4 */
95
     radix += 3;
96
     radix &= -4;
97

    
98
     return (radix + 2);
99
}
100

    
101
static void apply_buf(const plan *ego_, R *rio, R *iio)
102
{
103
     const P *ego = (const P *) ego_;
104
     INT i, j, v = ego->v, r = ego->r;
105
     INT batchsz = compute_batchsize(r);
106
     R *buf;
107
     INT mb = ego->mb, me = ego->me;
108
     size_t bufsz = r * batchsz * 2 * sizeof(R);
109

    
110
     BUF_ALLOC(R *, buf, bufsz);
111

    
112
     for (i = 0; i < v; ++i, rio += ego->vs, iio += ego->vs) {
113
          for (j = mb; j + batchsz < me; j += batchsz) 
114
               dobatch(ego, rio, iio, j, j + batchsz, buf);
115

    
116
          dobatch(ego, rio, iio, j, me, buf);
117
     }
118

    
119
     BUF_FREE(buf, bufsz);
120
}
121

    
122
/*************************************************************
123
  common code
124
 *************************************************************/
125
static void awake(plan *ego_, enum wakefulness wakefulness)
126
{
127
     P *ego = (P *) ego_;
128

    
129
     X(twiddle_awake)(wakefulness, &ego->td, ego->slv->desc->tw,
130
                      ego->r * ego->m, ego->r, ego->m + ego->extra_iter);
131
}
132

    
133
static void destroy(plan *ego_)
134
{
135
     P *ego = (P *) ego_;
136
     X(stride_destroy)(ego->brs);
137
     X(stride_destroy)(ego->rs);
138
}
139

    
140
static void print(const plan *ego_, printer *p)
141
{
142
     const P *ego = (const P *) ego_;
143
     const S *slv = ego->slv;
144
     const ct_desc *e = slv->desc;
145

    
146
     if (slv->bufferedp)
147
          p->print(p, "(dftw-directbuf/%D-%D/%D%v \"%s\")",
148
                   compute_batchsize(ego->r), ego->r,
149
                   X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
150
     else
151
          p->print(p, "(dftw-direct-%D/%D%v \"%s\")",
152
                   ego->r, X(twiddle_length)(ego->r, e->tw), ego->v, e->nam);
153
}
154

    
155
static int applicable0(const S *ego,
156
                       INT r, INT irs, INT ors,
157
                       INT m, INT ms,
158
                       INT v, INT ivs, INT ovs,
159
                       INT mb, INT me,
160
                       R *rio, R *iio,
161
                       const planner *plnr, INT *extra_iter)
162
{
163
     const ct_desc *e = ego->desc;
164
     UNUSED(v);
165

    
166
     return (
167
          1
168
          && r == e->radix
169
          && irs == ors /* in-place along R */
170
          && ivs == ovs /* in-place along V */
171

    
172
          /* check for alignment/vector length restrictions */
173
          && ((*extra_iter = 0,
174
               e->genus->okp(e, rio, iio, irs, ivs, m, mb, me, ms, plnr))
175
              ||
176
              (*extra_iter = 1,
177
               (1
178
                /* FIXME: require full array, otherwise some threads
179
                   may be extra_iter and other threads won't be.
180
                   Generating the proper twiddle factors is a pain in
181
                   this case */
182
                && mb == 0 && me == m
183
                && e->genus->okp(e, rio, iio, irs, ivs,
184
                                 m, mb, me - 1, ms, plnr)
185
                && e->genus->okp(e, rio, iio, irs, ivs,
186
                                 m, me - 1, me + 1, ms, plnr))))
187

    
188
          && (e->genus->okp(e, rio + ivs, iio + ivs, irs, ivs,
189
                            m, mb, me - *extra_iter, ms, plnr))
190

    
191
          );
192
}
193

    
194
static int applicable0_buf(const S *ego,
195
                           INT r, INT irs, INT ors,
196
                           INT m, INT ms,
197
                           INT v, INT ivs, INT ovs,
198
                           INT mb, INT me,
199
                           R *rio, R *iio,
200
                           const planner *plnr)
201
{
202
     const ct_desc *e = ego->desc;
203
     INT batchsz;
204
     UNUSED(v); UNUSED(ms); UNUSED(rio); UNUSED(iio);
205

    
206
     return (
207
          1
208
          && r == e->radix
209
          && irs == ors /* in-place along R */
210
          && ivs == ovs /* in-place along V */
211

    
212
          /* check for alignment/vector length restrictions, both for
213
             batchsize and for the remainder */
214
          && (batchsz = compute_batchsize(r), 1)
215
          && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
216
                            m, mb, mb + batchsz, 2, plnr))
217
          && (e->genus->okp(e, 0, ((const R *)0) + 1, 2 * batchsz, 0,
218
                            m, mb, me, 2, plnr))
219
          );
220
}
221

    
222
static int applicable(const S *ego,
223
                      INT r, INT irs, INT ors,
224
                      INT m, INT ms,
225
                      INT v, INT ivs, INT ovs,
226
                      INT mb, INT me,
227
                      R *rio, R *iio,
228
                      const planner *plnr, INT *extra_iter)
229
{
230
     if (ego->bufferedp) {
231
          *extra_iter = 0;
232
          if (!applicable0_buf(ego,
233
                               r, irs, ors, m, ms, v, ivs, ovs, mb, me,
234
                               rio, iio, plnr))
235
               return 0;
236
     } else {
237
          if (!applicable0(ego,
238
                           r, irs, ors, m, ms, v, ivs, ovs, mb, me,
239
                           rio, iio, plnr, extra_iter))
240
               return 0;
241
     }
242

    
243
     if (NO_UGLYP(plnr) && X(ct_uglyp)((ego->bufferedp? (INT)512 : (INT)16),
244
                                       v, m * r, r))
245
          return 0;
246

    
247
     if (m * r > 262144 && NO_FIXED_RADIX_LARGE_NP(plnr))
248
          return 0;
249

    
250
     return 1;
251
}
252

    
253
static plan *mkcldw(const ct_solver *ego_,
254
                    INT r, INT irs, INT ors,
255
                    INT m, INT ms,
256
                    INT v, INT ivs, INT ovs,
257
                    INT mstart, INT mcount,
258
                    R *rio, R *iio,
259
                    planner *plnr)
260
{
261
     const S *ego = (const S *) ego_;
262
     P *pln;
263
     const ct_desc *e = ego->desc;
264
     INT extra_iter;
265

    
266
     static const plan_adt padt = {
267
          0, awake, print, destroy
268
     };
269

    
270
     A(mstart >= 0 && mstart + mcount <= m);
271
     if (!applicable(ego,
272
                     r, irs, ors, m, ms, v, ivs, ovs, mstart, mstart + mcount,
273
                     rio, iio, plnr, &extra_iter))
274
          return (plan *)0;
275

    
276
     if (ego->bufferedp) {
277
          pln = MKPLAN_DFTW(P, &padt, apply_buf);
278
     } else {
279
          pln = MKPLAN_DFTW(P, &padt, extra_iter ? apply_extra_iter : apply);
280
     }
281

    
282
     pln->k = ego->k;
283
     pln->rs = X(mkstride)(r, irs);
284
     pln->td = 0;
285
     pln->r = r;
286
     pln->m = m;
287
     pln->ms = ms;
288
     pln->v = v;
289
     pln->vs = ivs;
290
     pln->mb = mstart;
291
     pln->me = mstart + mcount;
292
     pln->slv = ego;
293
     pln->brs = X(mkstride)(r, 2 * compute_batchsize(r));
294
     pln->extra_iter = extra_iter;
295

    
296
     X(ops_zero)(&pln->super.super.ops);
297
     X(ops_madd2)(v * (mcount/e->genus->vl), &e->ops, &pln->super.super.ops);
298

    
299
     if (ego->bufferedp) {
300
          /* 8 load/stores * N * V */
301
          pln->super.super.ops.other += 8 * r * mcount * v;
302
     }
303

    
304
     pln->super.super.could_prune_now_p =
305
          (!ego->bufferedp && r >= 5 && r < 64 && m >= r);
306
     return &(pln->super.super);
307
}
308

    
309
static void regone(planner *plnr, kdftw codelet,
310
                   const ct_desc *desc, int dec, int bufferedp)
311
{
312
     S *slv = (S *)X(mksolver_ct)(sizeof(S), desc->radix, dec, mkcldw, 0);
313
     slv->k = codelet;
314
     slv->desc = desc;
315
     slv->bufferedp = bufferedp;
316
     REGISTER_SOLVER(plnr, &(slv->super.super));
317
     if (X(mksolver_ct_hook)) {
318
          slv = (S *)X(mksolver_ct_hook)(sizeof(S), desc->radix,
319
                                         dec, mkcldw, 0);
320
          slv->k = codelet;
321
          slv->desc = desc;
322
          slv->bufferedp = bufferedp;
323
          REGISTER_SOLVER(plnr, &(slv->super.super));
324
     }
325
}
326

    
327
void X(regsolver_ct_directw)(planner *plnr, kdftw codelet,
328
                             const ct_desc *desc, int dec)
329
{
330
     regone(plnr, codelet, desc, dec, /* bufferedp */ 0);
331
     regone(plnr, codelet, desc, dec, /* bufferedp */ 1);
332
}