To check out this repository, please `hg clone` the following URL, or open the URL using EasyMercurial or your preferred Mercurial client.

The primary repository for this project is hosted at https://github.com/sonic-visualiser/sv-dependency-builds .
This repository is a read-only copy which is updated automatically every hour.

Statistics Download as Zip
| Branch: | Tag: | Revision:

root / src / fftw-3.3.8 / dft / scalar / codelets / t2_8.c @ 167:bd3cc4d1df30

History | View | Annotate | Download (9.92 KB)

1
/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
20

    
21
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu May 24 08:04:19 EDT 2018 */
23

    
24
#include "dft/codelet-dft.h"
25

    
26
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27

    
28
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
29

    
30
/*
 * This function contains 74 FP additions, 50 FP multiplications,
 * (or, 44 additions, 20 multiplications, 30 fused multiply/add),
 * 48 stack variables, 1 constants, and 32 memory accesses
 */
35
#include "dft/scalar/t.h"
36

    
37
/*
 * t2_8 (FMA-preferring variant): in-place size-8 butterfly with twiddle
 * multiplication, emitted by genfft (gen_twiddle -n 8 -twiddle-log3).
 * Generated code: statement order and operand pairing are deliberate and
 * should not be hand-edited.
 *
 * Parameters (as used by the code below):
 *   ri, ii  - arrays read and overwritten at offsets 0 and WS(rs, 1..7);
 *             the arithmetic treats ri[k]/ii[k] as the real/imaginary
 *             parts of element k.
 *   W       - twiddle table; six values W[0..5] are consumed per iteration
 *             as three (re, im) pairs, starting at W + mb * 6.
 *   rs      - stride handed to WS() to locate element k.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) == a*b + c and
 * FNMS(a, b, c) == c - a*b, as defined in the FFTW headers (not visible in
 * this file) -- confirm there.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* 0.7071... == sqrt(1/2), the only numeric constant this kernel needs. */
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               /* Stored pairs: (T2, T5) = W[0..1], (T3, T6) = W[2..3], (Tl, Tn) = W[4..5].
                  Derived pairs: (T7, Tb), (Tf, Ti), (To, Ts), (TC, TG). */
               E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
               {
                    /* Build the four remaining factor pairs from the three stored
                       ones by products of the stored values. */
                    E T4, Tm, Tr, Ta, TB, TF;
                    T2 = W[0];
                    T3 = W[2];
                    T4 = T2 * T3;
                    Tl = W[4];
                    Tm = T2 * Tl;
                    Tn = W[5];
                    Tr = T2 * Tn;
                    T5 = W[1];
                    T6 = W[3];
                    Ta = T2 * T6;
                    Tf = FMA(T5, T6, T4);     /* T2*T3 + T5*T6 : re part, used for element 2 */
                    T7 = FNMS(T5, T6, T4);    /* T2*T3 - T5*T6 : re part, used for element 4 */
                    Ts = FNMS(T5, Tl, Tr);    /* T2*Tn - T5*Tl : im part, used for element 6 */
                    Tb = FMA(T5, T3, Ta);     /* T2*T6 + T5*T3 : im part, used for element 4 */
                    To = FMA(T5, Tn, Tm);     /* T2*Tl + T5*Tn : re part, used for element 6 */
                    TB = Tf * Tl;
                    TF = Tf * Tn;
                    Ti = FNMS(T5, T3, Ta);    /* T2*T6 - T5*T3 : im part, used for element 2 */
                    TC = FMA(Ti, Tn, TB);     /* Tf*Tl + Ti*Tn : re part, used for element 5 */
                    TG = FNMS(Ti, Tl, TF);    /* Tf*Tn - Ti*Tl : im part, used for element 5 */
               }
               {
                    E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
                    E TI, T11, T13, T15, T16;
                    /* Element 0 is used as loaded; no factor is applied to it. */
                    T1 = ri[0];
                    T1s = ii[0];
                    /* Each block below loads element k = (xr, xi) and forms
                       (re*xr + im*xi, re*xi - im*xr) with that element's factor pair. */
                    {
                         /* element 4 with (T7, Tb) */
                         E T8, T9, Tc, T1q;
                         T8 = ri[WS(rs, 4)];
                         T9 = T7 * T8;
                         Tc = ii[WS(rs, 4)];
                         T1q = T7 * Tc;
                         Td = FMA(Tb, Tc, T9);
                         T1r = FNMS(Tb, T8, T1q);
                    }
                    {
                         /* element 6 with (To, Ts) */
                         E Tp, Tq, Tt, TX;
                         Tp = ri[WS(rs, 6)];
                         Tq = To * Tp;
                         Tt = ii[WS(rs, 6)];
                         TX = To * Tt;
                         Tu = FMA(Ts, Tt, Tq);
                         TY = FNMS(Ts, Tp, TX);
                    }
                    {
                         /* element 2 with (Tf, Ti) */
                         E Tg, Th, Tj, TV;
                         Tg = ri[WS(rs, 2)];
                         Th = Tf * Tg;
                         Tj = ii[WS(rs, 2)];
                         TV = Tf * Tj;
                         Tk = FMA(Ti, Tj, Th);
                         TW = FNMS(Ti, Tg, TV);
                    }
                    {
                         /* elements 7 with (Tl, Tn) and 3 with (T3, T6); only their
                            differences T18 (re) and T1d (im) are formed here. */
                         E TK, TL, TM, T19, TO, TP, TQ, T1b;
                         TK = ri[WS(rs, 7)];
                         TL = Tl * TK;
                         TM = ii[WS(rs, 7)];
                         T19 = Tl * TM;
                         TO = ri[WS(rs, 3)];
                         TP = T3 * TO;
                         TQ = ii[WS(rs, 3)];
                         T1b = T3 * TQ;
                         TN = FMA(Tn, TM, TL);
                         TR = FMA(T6, TQ, TP);
                         T18 = TN - TR;
                         T1a = FNMS(Tn, TK, T19);
                         T1c = FNMS(T6, TO, T1b);
                         T1d = T1a - T1c;
                    }
                    {
                         /* elements 1 with (T2, T5) and 5 with (TC, TG); only their
                            differences T11 (re) and T16 (im) are formed here. */
                         E Tx, Ty, Tz, T12, TD, TE, TH, T14;
                         Tx = ri[WS(rs, 1)];
                         Ty = T2 * Tx;
                         Tz = ii[WS(rs, 1)];
                         T12 = T2 * Tz;
                         TD = ri[WS(rs, 5)];
                         TE = TC * TD;
                         TH = ii[WS(rs, 5)];
                         T14 = TC * TH;
                         TA = FMA(T5, Tz, Ty);
                         TI = FMA(TG, TH, TE);
                         T11 = TA - TI;
                         T13 = FNMS(T5, Tx, T12);
                         T15 = FNMS(TG, TD, T14);
                         T16 = T13 - T15;
                    }
                    {
                         /* Odd-indexed outputs (1, 3, 5, 7): combine the difference
                            terms, scaling one operand by KP707106781. */
                         E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
                         {
                              E TU, TZ, T1x, T1y;
                              TU = T1 - Td;
                              TZ = TW - TY;
                              T10 = TU + TZ;
                              T1g = TU - TZ;
                              T1x = T1s - T1r;
                              T1y = Tk - Tu;
                              T1z = T1x - T1y;
                              T1B = T1y + T1x;
                         }
                         {
                              E T17, T1e, T1h, T1i;
                              T17 = T11 + T16;
                              T1e = T18 - T1d;
                              T1f = T17 + T1e;
                              T1C = T1e - T17;
                              T1h = T16 - T11;
                              T1i = T18 + T1d;
                              T1j = T1h - T1i;
                              T1A = T1h + T1i;
                         }
                         ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
                         ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
                         ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
                         ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
                         ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
                         ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
                         ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
                         ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
                    }
                    {
                         /* Even-indexed outputs (0, 2, 4, 6): plain sums and
                            differences, no constant multiplication. */
                         E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
                         {
                              E Te, Tv, T1p, T1t;
                              Te = T1 + Td;
                              Tv = Tk + Tu;
                              Tw = Te + Tv;
                              T1k = Te - Tv;
                              T1p = TW + TY;
                              T1t = T1r + T1s;
                              T1u = T1p + T1t;
                              T1w = T1t - T1p;
                         }
                         {
                              E TJ, TS, T1l, T1m;
                              TJ = TA + TI;
                              TS = TN + TR;
                              TT = TJ + TS;
                              T1v = TS - TJ;
                              T1l = T13 + T15;
                              T1m = T1a + T1c;
                              T1n = T1l - T1m;
                              T1o = T1l + T1m;
                         }
                         ri[WS(rs, 4)] = Tw - TT;
                         ii[WS(rs, 4)] = T1u - T1o;
                         ri[0] = Tw + TT;
                         ii[0] = T1o + T1u;
                         ri[WS(rs, 6)] = T1k - T1n;
                         ii[WS(rs, 6)] = T1w - T1v;
                         ri[WS(rs, 2)] = T1k + T1n;
                         ii[WS(rs, 2)] = T1v + T1w;
                    }
               }
          }
     }
}
203

    
204
static const tw_instr twinstr[] = {
205
     {TW_CEXP, 0, 1},
206
     {TW_CEXP, 0, 3},
207
     {TW_CEXP, 0, 7},
208
     {TW_NEXT, 1, 0}
209
};
210

    
211
/*
 * Descriptor handed to the planner together with t2_8 (FMA variant).
 * NOTE(review): field meanings are inferred from the values only; confirm
 * against the ct_desc definition.
 */
static const ct_desc desc = {
     8,                  /* same value as the generator's "-n 8" */
     "t2_8",             /* codelet name */
     twinstr,            /* twiddle instruction list */
     &GENUS,
     {44, 20, 30, 0},    /* matches the counts in the generator comment:
                            44 additions, 20 multiplications, 30 fused multiply/add */
     0,
     0,
     0
};
212

    
213
/*
 * Registration entry point: passes the t2_8 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 */
void X(codelet_t2_8) (planner *p) {
     X(kdft_dit_register) (p, t2_8, &desc);
}
216
#else
217

    
218
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
219

    
220
/*
 * This function contains 74 FP additions, 44 FP multiplications,
 * (or, 56 additions, 26 multiplications, 18 fused multiply/add),
 * 42 stack variables, 1 constants, and 32 memory accesses
 */
225
#include "dft/scalar/t.h"
226

    
227
/*
 * t2_8 (non-FMA-preferring variant): in-place size-8 butterfly with twiddle
 * multiplication, emitted by genfft (gen_twiddle -n 8 -twiddle-log3).
 * Generated code: statement order and operand pairing are deliberate and
 * should not be hand-edited.
 *
 * Parameters (as used by the code below):
 *   ri, ii  - arrays read and overwritten at offsets 0 and WS(rs, 1..7);
 *             the arithmetic treats ri[k]/ii[k] as the real/imaginary
 *             parts of element k.
 *   W       - twiddle table; six values W[0..5] are consumed per iteration
 *             as three (re, im) pairs, starting at W + mb * 6.
 *   rs      - stride handed to WS() to locate element k.
 *   mb, me  - half-open iteration range [mb, me).
 *   ms      - amount added to ri and ii after every iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) == a*b + c and
 * FNMS(a, b, c) == c - a*b, as defined in the FFTW headers (not visible in
 * this file) -- confirm there.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* 0.7071... == sqrt(1/2), the only numeric constant this kernel needs. */
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;
          for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               /* Stored pairs: (T2, T5) = W[0..1], (T3, T6) = W[2..3], (Tl, Tm) = W[4..5].
                  Derived pairs: (T8, Tc), (Tg, Ti), (Tn, Tp), (Tx, Tz). */
               E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
               {
                    /* Build the four remaining factor pairs from the three stored
                       ones by products of the stored values. */
                    E T4, Tb, T7, Ta;
                    T2 = W[0];
                    T5 = W[1];
                    T3 = W[2];
                    T6 = W[3];
                    T4 = T2 * T3;
                    Tb = T5 * T3;
                    T7 = T5 * T6;
                    Ta = T2 * T6;
                    T8 = T4 - T7;                   /* re part, used for element 4 */
                    Tc = Ta + Tb;                   /* im part, used for element 4 */
                    Tg = T4 + T7;                   /* re part, used for element 2 */
                    Ti = Ta - Tb;                   /* im part, used for element 2 */
                    Tl = W[4];
                    Tm = W[5];
                    Tn = FMA(T2, Tl, T5 * Tm);      /* re part, used for element 6 */
                    Tz = FNMS(Ti, Tl, Tg * Tm);     /* im part, used for element 5 */
                    Tp = FNMS(T5, Tl, T2 * Tm);     /* im part, used for element 6 */
                    Tx = FMA(Tg, Tl, Ti * Tm);      /* re part, used for element 5 */
               }
               {
                    E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
                    E TT;
                    /* Each block below loads two elements, forms
                       (re*xr + im*xi, re*xi - im*xr) with the element's factor pair
                       where one is applied, and keeps the pair's sum and difference. */
                    {
                         /* elements 0 (used as loaded) and 4 with (T8, Tc) */
                         E T1, T1c, Te, T1b, T9, Td;
                         T1 = ri[0];
                         T1c = ii[0];
                         T9 = ri[WS(rs, 4)];
                         Td = ii[WS(rs, 4)];
                         Te = FMA(T8, T9, Tc * Td);
                         T1b = FNMS(Tc, T9, T8 * Td);
                         Tf = T1 + Te;
                         T1i = T1c - T1b;
                         TL = T1 - Te;
                         T1d = T1b + T1c;
                    }
                    {
                         /* elements 7 with (Tl, Tm) and 3 with (T3, T6) */
                         E TF, TW, TI, TX;
                         {
                              E TD, TE, TG, TH;
                              TD = ri[WS(rs, 7)];
                              TE = ii[WS(rs, 7)];
                              TF = FMA(Tl, TD, Tm * TE);
                              TW = FNMS(Tm, TD, Tl * TE);
                              TG = ri[WS(rs, 3)];
                              TH = ii[WS(rs, 3)];
                              TI = FMA(T3, TG, T6 * TH);
                              TX = FNMS(T6, TG, T3 * TH);
                         }
                         TJ = TF + TI;
                         T17 = TW + TX;
                         TV = TF - TI;
                         TY = TW - TX;
                    }
                    {
                         /* elements 2 with (Tg, Ti) and 6 with (Tn, Tp) */
                         E Tk, TM, Tr, TN;
                         {
                              E Th, Tj, To, Tq;
                              Th = ri[WS(rs, 2)];
                              Tj = ii[WS(rs, 2)];
                              Tk = FMA(Tg, Th, Ti * Tj);
                              TM = FNMS(Ti, Th, Tg * Tj);
                              To = ri[WS(rs, 6)];
                              Tq = ii[WS(rs, 6)];
                              Tr = FMA(Tn, To, Tp * Tq);
                              TN = FNMS(Tp, To, Tn * Tq);
                         }
                         Ts = Tk + Tr;
                         T1j = Tk - Tr;
                         TO = TM - TN;
                         T1a = TM + TN;
                    }
                    {
                         /* elements 1 with (T2, T5) and 5 with (Tx, Tz) */
                         E Tw, TR, TB, TS;
                         {
                              E Tu, Tv, Ty, TA;
                              Tu = ri[WS(rs, 1)];
                              Tv = ii[WS(rs, 1)];
                              Tw = FMA(T2, Tu, T5 * Tv);
                              TR = FNMS(T5, Tu, T2 * Tv);
                              Ty = ri[WS(rs, 5)];
                              TA = ii[WS(rs, 5)];
                              TB = FMA(Tx, Ty, Tz * TA);
                              TS = FNMS(Tz, Ty, Tx * TA);
                         }
                         TC = Tw + TB;
                         T16 = TR + TS;
                         TQ = Tw - TB;
                         TT = TR - TS;
                    }
                    {
                         /* Even-indexed outputs (0, 2, 4, 6) come from the pair sums
                            with no constant multiplication. */
                         E Tt, TK, T1f, T1g;
                         Tt = Tf + Ts;
                         TK = TC + TJ;
                         ri[WS(rs, 4)] = Tt - TK;
                         ri[0] = Tt + TK;
                         {
                              E T19, T1e, T15, T18;
                              T19 = T16 + T17;
                              T1e = T1a + T1d;
                              ii[0] = T19 + T1e;
                              ii[WS(rs, 4)] = T1e - T19;
                              T15 = Tf - Ts;
                              T18 = T16 - T17;
                              ri[WS(rs, 6)] = T15 - T18;
                              ri[WS(rs, 2)] = T15 + T18;
                         }
                         T1f = TJ - TC;
                         T1g = T1d - T1a;
                         ii[WS(rs, 2)] = T1f + T1g;
                         ii[WS(rs, 6)] = T1g - T1f;
                         /* Odd-indexed outputs (1, 3, 5, 7) come from the pair
                            differences, with one operand scaled by KP707106781. */
                         {
                              E T11, T1k, T14, T1h, T12, T13;
                              T11 = TL - TO;
                              T1k = T1i - T1j;
                              T12 = TT - TQ;
                              T13 = TV + TY;
                              T14 = KP707106781 * (T12 - T13);
                              T1h = KP707106781 * (T12 + T13);
                              ri[WS(rs, 7)] = T11 - T14;
                              ii[WS(rs, 5)] = T1k - T1h;
                              ri[WS(rs, 3)] = T11 + T14;
                              ii[WS(rs, 1)] = T1h + T1k;
                         }
                         {
                              E TP, T1m, T10, T1l, TU, TZ;
                              TP = TL + TO;
                              T1m = T1j + T1i;
                              TU = TQ + TT;
                              TZ = TV - TY;
                              T10 = KP707106781 * (TU + TZ);
                              T1l = KP707106781 * (TZ - TU);
                              ri[WS(rs, 5)] = TP - T10;
                              ii[WS(rs, 7)] = T1m - T1l;
                              ri[WS(rs, 1)] = TP + T10;
                              ii[WS(rs, 3)] = T1l + T1m;
                         }
                    }
               }
          }
     }
}
377

    
378
static const tw_instr twinstr[] = {
379
     {TW_CEXP, 0, 1},
380
     {TW_CEXP, 0, 3},
381
     {TW_CEXP, 0, 7},
382
     {TW_NEXT, 1, 0}
383
};
384

    
385
/*
 * Descriptor handed to the planner together with t2_8 (non-FMA variant).
 * NOTE(review): field meanings are inferred from the values only; confirm
 * against the ct_desc definition.
 */
static const ct_desc desc = {
     8,                  /* same value as the generator's "-n 8" */
     "t2_8",             /* codelet name */
     twinstr,            /* twiddle instruction list */
     &GENUS,
     {56, 26, 18, 0},    /* matches the counts in the generator comment:
                            56 additions, 26 multiplications, 18 fused multiply/add */
     0,
     0,
     0
};
386

    
387
/*
 * Registration entry point: passes the t2_8 kernel and its descriptor to
 * X(kdft_dit_register) for planner p.
 */
void X(codelet_t2_8) (planner *p) {
     X(kdft_dit_register) (p, t2_8, &desc);
}
390
#endif