annotate src/fftw-3.3.5/dft/scalar/codelets/t1_12.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:36:15 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include t.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 118 FP additions, 68 FP multiplications,
Chris@42 32 * (or, 72 additions, 22 multiplications, 46 fused multiply/add),
Chris@42 33 * 84 stack variables, 2 constants, and 48 memory accesses
Chris@42 34 */
Chris@42 35 #include "t.h"
Chris@42 36
/*
 * t1_12 (HAVE_FMA variant): in-place 12-point twiddle kernel produced by
 * genfft (see the "Generated by" comment above); regenerate rather than
 * hand-edit the arithmetic.
 *
 * For every m in [mb, me) the loop body
 *   - reads ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..11,
 *   - combines each input k >= 1 with the pair W[2k-2], W[2k-1]
 *     (input 0 is used as loaded),
 *   - writes 12 results back to the same ri[] slots and 12 to the same ii[]
 *     slots. All loads happen before the first store.
 *
 * Parameters:
 *   ri, ii - data arrays that are read and then overwritten; presumably the
 *            real and imaginary parts (TODO confirm against caller).
 *   W      - read-only table; 22 values are consumed per iteration, starting
 *            at W + mb * 22.
 *   rs     - stride handed to WS() to locate element k.
 *   mb, me - half-open range of the loop counter m.
 *   ms     - amount added to ri and ii after each iteration.
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file. The comments below assume
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
Chris@42 37 static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/* The two literal constants used below: 0.866025403... (numerically sqrt(3)/2) and 0.5. */
Chris@42 39 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 40 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 41 {
Chris@42 42 INT m;
/* W is pre-advanced by mb * 22 and then stepped by 22 per pass; ri and ii step by ms. */
Chris@42 43 for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
/* T2B and T2C live at loop scope because the last two stores (ii[11], ii[5]) use them after the inner blocks close. */
Chris@42 44 E T2B, T2C;
Chris@42 45 {
Chris@42 46 E T1, T2i, T2e, Tl, T1Y, T10, T1S, TG, T2f, T1s, T2r, Ty, T1Z, T1H, T21;
Chris@42 47 E T1d, TI, TL, T2h, T1l, T2o, Te, TJ, T1w, TO, TR, TN, TK, TQ;
Chris@42 48 {
Chris@42 49 E TW, TZ, TY, T1X, TX;
/* Input 0 is taken as-is; no W factor is applied to it. */
Chris@42 50 T1 = ri[0];
Chris@42 51 T2i = ii[0];
Chris@42 52 {
Chris@42 53 E Th, Tk, Tg, Tj, T2d, Ti, TV;
/* Input 6 is combined with W[10], W[11]; input 9 with W[16], W[17]. */
Chris@42 54 Th = ri[WS(rs, 6)];
Chris@42 55 Tk = ii[WS(rs, 6)];
Chris@42 56 Tg = W[10];
Chris@42 57 Tj = W[11];
Chris@42 58 TW = ri[WS(rs, 9)];
Chris@42 59 TZ = ii[WS(rs, 9)];
Chris@42 60 T2d = Tg * Tk;
Chris@42 61 Ti = Tg * Th;
Chris@42 62 TV = W[16];
Chris@42 63 TY = W[17];
Chris@42 64 T2e = FNMS(Tj, Th, T2d);
Chris@42 65 Tl = FMA(Tj, Tk, Ti);
Chris@42 66 T1X = TV * TZ;
Chris@42 67 TX = TV * TW;
Chris@42 68 }
Chris@42 69 {
Chris@42 70 E Tn, Tq, Tt, T1o, To, Tw, Ts, Tp, Tv;
Chris@42 71 {
Chris@42 72 E TC, TF, TB, TE, T1R, TD, Tm;
/* Input 3 with W[4], W[5]; loads for inputs 10 (W[18], W[19]) and 2 (W[2], W[3]) are interleaved here. */
Chris@42 73 TC = ri[WS(rs, 3)];
Chris@42 74 TF = ii[WS(rs, 3)];
Chris@42 75 T1Y = FNMS(TY, TW, T1X);
Chris@42 76 T10 = FMA(TY, TZ, TX);
Chris@42 77 TB = W[4];
Chris@42 78 TE = W[5];
Chris@42 79 Tn = ri[WS(rs, 10)];
Chris@42 80 Tq = ii[WS(rs, 10)];
Chris@42 81 T1R = TB * TF;
Chris@42 82 TD = TB * TC;
Chris@42 83 Tm = W[18];
Chris@42 84 Tt = ri[WS(rs, 2)];
Chris@42 85 T1S = FNMS(TE, TC, T1R);
Chris@42 86 TG = FMA(TE, TF, TD);
Chris@42 87 T1o = Tm * Tq;
Chris@42 88 To = Tm * Tn;
Chris@42 89 Tw = ii[WS(rs, 2)];
Chris@42 90 Ts = W[2];
Chris@42 91 Tp = W[19];
Chris@42 92 Tv = W[3];
Chris@42 93 }
Chris@42 94 {
Chris@42 95 E T12, T15, T13, T1D, T18, T1b, T17, T14, T1a;
Chris@42 96 {
Chris@42 97 E T1p, Tr, T1r, Tx, T1q, Tu, T11;
/* Finish inputs 10 and 2, form their sum/difference pairs, and start input 1 (W[0], W[1]). */
Chris@42 98 T12 = ri[WS(rs, 1)];
Chris@42 99 T1q = Ts * Tw;
Chris@42 100 Tu = Ts * Tt;
Chris@42 101 T1p = FNMS(Tp, Tn, T1o);
Chris@42 102 Tr = FMA(Tp, Tq, To);
Chris@42 103 T1r = FNMS(Tv, Tt, T1q);
Chris@42 104 Tx = FMA(Tv, Tw, Tu);
Chris@42 105 T15 = ii[WS(rs, 1)];
Chris@42 106 T11 = W[0];
Chris@42 107 T2f = T1p + T1r;
Chris@42 108 T1s = T1p - T1r;
Chris@42 109 T2r = Tx - Tr;
Chris@42 110 Ty = Tr + Tx;
Chris@42 111 T13 = T11 * T12;
Chris@42 112 T1D = T11 * T15;
Chris@42 113 }
/* Input 5 is combined with W[8], W[9]. */
Chris@42 114 T18 = ri[WS(rs, 5)];
Chris@42 115 T1b = ii[WS(rs, 5)];
Chris@42 116 T17 = W[8];
Chris@42 117 T14 = W[1];
Chris@42 118 T1a = W[9];
Chris@42 119 {
Chris@42 120 E T3, T6, T4, T1h, T9, Tc, T8, T5, Tb;
Chris@42 121 {
Chris@42 122 E T1E, T16, T1G, T1c, T1F, T19, T2;
/* Finish inputs 1 and 5, form their sum/difference pairs, and start input 4 (W[6], W[7]). */
Chris@42 123 T3 = ri[WS(rs, 4)];
Chris@42 124 T1F = T17 * T1b;
Chris@42 125 T19 = T17 * T18;
Chris@42 126 T1E = FNMS(T14, T12, T1D);
Chris@42 127 T16 = FMA(T14, T15, T13);
Chris@42 128 T1G = FNMS(T1a, T18, T1F);
Chris@42 129 T1c = FMA(T1a, T1b, T19);
Chris@42 130 T6 = ii[WS(rs, 4)];
Chris@42 131 T2 = W[6];
Chris@42 132 T1Z = T1E + T1G;
Chris@42 133 T1H = T1E - T1G;
Chris@42 134 T21 = T1c - T16;
Chris@42 135 T1d = T16 + T1c;
Chris@42 136 T4 = T2 * T3;
Chris@42 137 T1h = T2 * T6;
Chris@42 138 }
/* Input 8 is combined with W[14], W[15]. */
Chris@42 139 T9 = ri[WS(rs, 8)];
Chris@42 140 Tc = ii[WS(rs, 8)];
Chris@42 141 T8 = W[14];
Chris@42 142 T5 = W[7];
Chris@42 143 Tb = W[15];
Chris@42 144 {
Chris@42 145 E T1i, T7, T1k, Td, T1j, Ta, TH;
/* Finish inputs 4 and 8, form their sum/difference pairs, and start input 7 (W[12], W[13]). */
Chris@42 146 TI = ri[WS(rs, 7)];
Chris@42 147 T1j = T8 * Tc;
Chris@42 148 Ta = T8 * T9;
Chris@42 149 T1i = FNMS(T5, T3, T1h);
Chris@42 150 T7 = FMA(T5, T6, T4);
Chris@42 151 T1k = FNMS(Tb, T9, T1j);
Chris@42 152 Td = FMA(Tb, Tc, Ta);
Chris@42 153 TL = ii[WS(rs, 7)];
Chris@42 154 TH = W[12];
Chris@42 155 T2h = T1i + T1k;
Chris@42 156 T1l = T1i - T1k;
Chris@42 157 T2o = Td - T7;
Chris@42 158 Te = T7 + Td;
Chris@42 159 TJ = TH * TI;
Chris@42 160 T1w = TH * TL;
Chris@42 161 }
/* Input 11 is combined with W[20], W[21]; these are the last loads of the pass. */
Chris@42 162 TO = ri[WS(rs, 11)];
Chris@42 163 TR = ii[WS(rs, 11)];
Chris@42 164 TN = W[20];
Chris@42 165 TK = W[13];
Chris@42 166 TQ = W[21];
Chris@42 167 }
Chris@42 168 }
Chris@42 169 }
Chris@42 170 }
Chris@42 171 {
Chris@42 172 E T1g, T1n, T2q, T1A, T1V, T28, TA, T2n, T1v, T1C, T1U, T29, T2m, T2k, T2l;
Chris@42 173 E T1f, T2a, T20;
Chris@42 174 {
Chris@42 175 E T2g, T1T, TT, T2j, TU, T1e;
Chris@42 176 {
Chris@42 177 E Tf, T1x, TM, T1z, TS, Tz, T1y, TP;
/* Finish inputs 7 and 11, then build the sums and the KP500000000-scaled combinations used by the stores below. */
Chris@42 178 T1g = FNMS(KP500000000, Te, T1);
Chris@42 179 Tf = T1 + Te;
Chris@42 180 T1y = TN * TR;
Chris@42 181 TP = TN * TO;
Chris@42 182 T1x = FNMS(TK, TI, T1w);
Chris@42 183 TM = FMA(TK, TL, TJ);
Chris@42 184 T1z = FNMS(TQ, TO, T1y);
Chris@42 185 TS = FMA(TQ, TR, TP);
Chris@42 186 Tz = Tl + Ty;
Chris@42 187 T1n = FNMS(KP500000000, Ty, Tl);
Chris@42 188 T2q = FNMS(KP500000000, T2f, T2e);
Chris@42 189 T2g = T2e + T2f;
Chris@42 190 T1T = T1x + T1z;
Chris@42 191 T1A = T1x - T1z;
Chris@42 192 T1V = TS - TM;
Chris@42 193 TT = TM + TS;
Chris@42 194 T28 = Tf - Tz;
Chris@42 195 TA = Tf + Tz;
Chris@42 196 T2j = T2h + T2i;
Chris@42 197 T2n = FNMS(KP500000000, T2h, T2i);
Chris@42 198 }
Chris@42 199 T1v = FNMS(KP500000000, TT, TG);
Chris@42 200 TU = TG + TT;
Chris@42 201 T1e = T10 + T1d;
Chris@42 202 T1C = FNMS(KP500000000, T1d, T10);
Chris@42 203 T1U = FNMS(KP500000000, T1T, T1S);
Chris@42 204 T29 = T1S + T1T;
Chris@42 205 T2m = T2j - T2g;
Chris@42 206 T2k = T2g + T2j;
Chris@42 207 T2l = TU - T1e;
Chris@42 208 T1f = TU + T1e;
Chris@42 209 T2a = T1Y + T1Z;
Chris@42 210 T20 = FNMS(KP500000000, T1Z, T1Y);
Chris@42 211 }
Chris@42 212 {
Chris@42 213 E T1m, T1K, T2y, T2p, T2x, T2s, T1L, T1t, T1B, T1N, T2c, T2b;
/* Stores start here (slots 0, 3, 6, 9 first), interleaved with the KP866025403 combinations for the remaining slots. */
Chris@42 214 ii[WS(rs, 9)] = T2m - T2l;
Chris@42 215 ii[WS(rs, 3)] = T2l + T2m;
Chris@42 216 ri[0] = TA + T1f;
Chris@42 217 ri[WS(rs, 6)] = TA - T1f;
Chris@42 218 T2c = T29 + T2a;
Chris@42 219 T2b = T29 - T2a;
Chris@42 220 T1m = FNMS(KP866025403, T1l, T1g);
Chris@42 221 T1K = FMA(KP866025403, T1l, T1g);
Chris@42 222 ii[0] = T2c + T2k;
Chris@42 223 ii[WS(rs, 6)] = T2k - T2c;
Chris@42 224 ri[WS(rs, 9)] = T28 + T2b;
Chris@42 225 ri[WS(rs, 3)] = T28 - T2b;
Chris@42 226 T2y = FNMS(KP866025403, T2o, T2n);
Chris@42 227 T2p = FMA(KP866025403, T2o, T2n);
Chris@42 228 T2x = FNMS(KP866025403, T2r, T2q);
Chris@42 229 T2s = FMA(KP866025403, T2r, T2q);
Chris@42 230 T1L = FMA(KP866025403, T1s, T1n);
Chris@42 231 T1t = FNMS(KP866025403, T1s, T1n);
Chris@42 232 T1B = FNMS(KP866025403, T1A, T1v);
Chris@42 233 T1N = FMA(KP866025403, T1A, T1v);
Chris@42 234 {
Chris@42 235 E T24, T27, T1Q, T2u, T23, T2v, T2w, T2t;
Chris@42 236 {
Chris@42 237 E T1u, T1W, T22, T1O, T1I, T2z, T2A, T25, T26, T1M, T1J, T1P;
Chris@42 238 T24 = T1m - T1t;
Chris@42 239 T1u = T1m + T1t;
Chris@42 240 T25 = FNMS(KP866025403, T1V, T1U);
Chris@42 241 T1W = FMA(KP866025403, T1V, T1U);
Chris@42 242 T26 = FNMS(KP866025403, T21, T20);
Chris@42 243 T22 = FMA(KP866025403, T21, T20);
Chris@42 244 T1O = FMA(KP866025403, T1H, T1C);
Chris@42 245 T1I = FNMS(KP866025403, T1H, T1C);
Chris@42 246 T2z = T2x + T2y;
Chris@42 247 T2B = T2y - T2x;
Chris@42 248 T27 = T25 - T26;
Chris@42 249 T2A = T25 + T26;
Chris@42 250 T1M = T1K + T1L;
Chris@42 251 T1Q = T1K - T1L;
Chris@42 252 T2C = T1B - T1I;
Chris@42 253 T1J = T1B + T1I;
Chris@42 254 T1P = T1N + T1O;
Chris@42 255 T2u = T1N - T1O;
Chris@42 256 ii[WS(rs, 8)] = T2A + T2z;
Chris@42 257 ii[WS(rs, 2)] = T2z - T2A;
Chris@42 258 ri[WS(rs, 8)] = T1u + T1J;
Chris@42 259 ri[WS(rs, 2)] = T1u - T1J;
Chris@42 260 ri[WS(rs, 10)] = T1M - T1P;
Chris@42 261 ri[WS(rs, 4)] = T1M + T1P;
Chris@42 262 T23 = T1W - T22;
Chris@42 263 T2v = T1W + T22;
Chris@42 264 T2w = T2s + T2p;
Chris@42 265 T2t = T2p - T2s;
Chris@42 266 }
Chris@42 267 ii[WS(rs, 10)] = T2w - T2v;
Chris@42 268 ii[WS(rs, 4)] = T2v + T2w;
Chris@42 269 ri[WS(rs, 1)] = T1Q + T23;
Chris@42 270 ri[WS(rs, 7)] = T1Q - T23;
Chris@42 271 ii[WS(rs, 7)] = T2u + T2t;
Chris@42 272 ii[WS(rs, 1)] = T2t - T2u;
Chris@42 273 ri[WS(rs, 5)] = T24 + T27;
Chris@42 274 ri[WS(rs, 11)] = T24 - T27;
Chris@42 275 }
Chris@42 276 }
Chris@42 277 }
Chris@42 278 }
/* Final two stores, using the loop-scope temporaries set inside the blocks above. */
Chris@42 279 ii[WS(rs, 11)] = T2C + T2B;
Chris@42 280 ii[WS(rs, 5)] = T2B - T2C;
Chris@42 281 }
Chris@42 282 }
Chris@42 283 }
Chris@42 284
/*
 * Two-entry tw_instr table referenced by the codelet descriptor `desc`.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this file;
 * presumably this describes the layout of the 22 W values t1_12 consumes per
 * iteration -- confirm against the tw_instr definition.
 */
Chris@42 285 static const tw_instr twinstr[] = {
Chris@42 286 {TW_FULL, 0, 12},
Chris@42 287 {TW_NEXT, 1, 0}
Chris@42 288 };
Chris@42 289
/*
 * Descriptor passed to X(kdft_dit_register) together with t1_12.
 * Fields, in order: 12 (presumably the codelet size), the name string,
 * the twinstr table, &GENUS, and {72, 22, 46, 0}, whose first three values
 * match the generator's header comment for this branch (72 additions,
 * 22 multiplications, 46 fused multiply/add). The meaning of the three
 * trailing zeros is not visible in this file.
 */
Chris@42 290 static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, {72, 22, 46, 0}, 0, 0, 0 };
Chris@42 291
/*
 * Registration entry point for the HAVE_FMA build: hands the t1_12 kernel
 * and its descriptor to planner p via X(kdft_dit_register).
 * NOTE(review): X() is defined outside this file; presumably it applies the
 * library's symbol prefix -- confirm.
 */
Chris@42 292 void X(codelet_t1_12) (planner *p) {
Chris@42 293 X(kdft_dit_register) (p, t1_12, &desc);
Chris@42 294 }
Chris@42 295 #else /* HAVE_FMA */
Chris@42 296
Chris@42 297 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include t.h */
Chris@42 298
Chris@42 299 /*
Chris@42 300 * This function contains 118 FP additions, 60 FP multiplications,
Chris@42 301 * (or, 88 additions, 30 multiplications, 30 fused multiply/add),
Chris@42 302 * 47 stack variables, 2 constants, and 48 memory accesses
Chris@42 303 */
Chris@42 304 #include "t.h"
Chris@42 305
/*
 * t1_12 (build without HAVE_FMA): in-place 12-point twiddle kernel produced
 * by genfft (see the "Generated by" comment above); regenerate rather than
 * hand-edit the arithmetic.
 *
 * For every m in [mb, me) the loop body
 *   - reads ri[WS(rs, k)] and ii[WS(rs, k)] for k = 0..11 in four groups of
 *     three inputs: {0, 4, 8}, {9, 5, 1}, {6, 2, 10}, {3, 11, 7},
 *   - combines each input k >= 1 with the pair W[2k-2], W[2k-1]
 *     (input 0 is used as loaded),
 *   - writes the results back in three groups of four slots:
 *     {6, 0, 3, 9}, {10, 4, 7, 1}, {2, 8, 11, 5}.
 * All loads happen before the first store.
 *
 * Parameters:
 *   ri, ii - data arrays that are read and then overwritten; presumably the
 *            real and imaginary parts (TODO confirm against caller).
 *   W      - read-only table; 22 values are consumed per iteration, starting
 *            at W + mb * 22.
 *   rs     - stride handed to WS() to locate element k.
 *   mb, me - half-open range of the loop counter m.
 *   ms     - amount added to ri and ii after each iteration.
 *
 * NOTE(review): R, E, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this file. The comments below assume
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
Chris@42 306 static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 307 {
/* The two literal constants used below: 0.5 and 0.866025403... (numerically sqrt(3)/2). */
Chris@42 308 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 309 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 310 {
Chris@42 311 INT m;
/* W is pre-advanced by mb * 22 and then stepped by 22 per pass; ri and ii step by ms. */
Chris@42 312 for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
Chris@42 313 E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
Chris@42 314 E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g;
Chris@42 315 E T1A, T1B;
/* Group {0, 4, 8}: input 0 as loaded, input 4 with W[6], W[7], input 8 with W[14], W[15]. */
Chris@42 316 {
Chris@42 317 E T6, T16, Tb, T17;
Chris@42 318 T1 = ri[0];
Chris@42 319 T1W = ii[0];
Chris@42 320 {
Chris@42 321 E T3, T5, T2, T4;
Chris@42 322 T3 = ri[WS(rs, 4)];
Chris@42 323 T5 = ii[WS(rs, 4)];
Chris@42 324 T2 = W[6];
Chris@42 325 T4 = W[7];
Chris@42 326 T6 = FMA(T2, T3, T4 * T5);
Chris@42 327 T16 = FNMS(T4, T3, T2 * T5);
Chris@42 328 }
Chris@42 329 {
Chris@42 330 E T8, Ta, T7, T9;
Chris@42 331 T8 = ri[WS(rs, 8)];
Chris@42 332 Ta = ii[WS(rs, 8)];
Chris@42 333 T7 = W[14];
Chris@42 334 T9 = W[15];
Chris@42 335 Tb = FMA(T7, T8, T9 * Ta);
Chris@42 336 T17 = FNMS(T9, T8, T7 * Ta);
Chris@42 337 }
/* Per-group combinations: KP866025403-scaled differences, sums, and first-input-minus-half-sum terms. */
Chris@42 338 T18 = KP866025403 * (T16 - T17);
Chris@42 339 T21 = KP866025403 * (Tb - T6);
Chris@42 340 Tc = T6 + Tb;
Chris@42 341 T15 = FNMS(KP500000000, Tc, T1);
Chris@42 342 T1V = T16 + T17;
Chris@42 343 T22 = FNMS(KP500000000, T1V, T1W);
Chris@42 344 }
/* Group {9, 5, 1}: W[16], W[17] / W[8], W[9] / W[0], W[1]. */
Chris@42 345 {
Chris@42 346 E T11, T1n, TW, T1m;
Chris@42 347 {
Chris@42 348 E TO, TQ, TN, TP;
Chris@42 349 TO = ri[WS(rs, 9)];
Chris@42 350 TQ = ii[WS(rs, 9)];
Chris@42 351 TN = W[16];
Chris@42 352 TP = W[17];
Chris@42 353 TR = FMA(TN, TO, TP * TQ);
Chris@42 354 T1E = FNMS(TP, TO, TN * TQ);
Chris@42 355 }
Chris@42 356 {
Chris@42 357 E TY, T10, TX, TZ;
Chris@42 358 TY = ri[WS(rs, 5)];
Chris@42 359 T10 = ii[WS(rs, 5)];
Chris@42 360 TX = W[8];
Chris@42 361 TZ = W[9];
Chris@42 362 T11 = FMA(TX, TY, TZ * T10);
Chris@42 363 T1n = FNMS(TZ, TY, TX * T10);
Chris@42 364 }
Chris@42 365 {
Chris@42 366 E TT, TV, TS, TU;
Chris@42 367 TT = ri[WS(rs, 1)];
Chris@42 368 TV = ii[WS(rs, 1)];
Chris@42 369 TS = W[0];
Chris@42 370 TU = W[1];
Chris@42 371 TW = FMA(TS, TT, TU * TV);
Chris@42 372 T1m = FNMS(TU, TT, TS * TV);
Chris@42 373 }
Chris@42 374 T1o = KP866025403 * (T1m - T1n);
Chris@42 375 T1D = KP866025403 * (T11 - TW);
Chris@42 376 T12 = TW + T11;
Chris@42 377 T1l = FNMS(KP500000000, T12, TR);
Chris@42 378 T1F = T1m + T1n;
Chris@42 379 T1G = FNMS(KP500000000, T1F, T1E);
Chris@42 380 }
/* Group {6, 2, 10}: W[10], W[11] / W[2], W[3] / W[18], W[19]. */
Chris@42 381 {
Chris@42 382 E Ts, T1c, Tn, T1b;
Chris@42 383 {
Chris@42 384 E Tf, Th, Te, Tg;
Chris@42 385 Tf = ri[WS(rs, 6)];
Chris@42 386 Th = ii[WS(rs, 6)];
Chris@42 387 Te = W[10];
Chris@42 388 Tg = W[11];
Chris@42 389 Ti = FMA(Te, Tf, Tg * Th);
Chris@42 390 T1S = FNMS(Tg, Tf, Te * Th);
Chris@42 391 }
Chris@42 392 {
Chris@42 393 E Tp, Tr, To, Tq;
Chris@42 394 Tp = ri[WS(rs, 2)];
Chris@42 395 Tr = ii[WS(rs, 2)];
Chris@42 396 To = W[2];
Chris@42 397 Tq = W[3];
Chris@42 398 Ts = FMA(To, Tp, Tq * Tr);
Chris@42 399 T1c = FNMS(Tq, Tp, To * Tr);
Chris@42 400 }
Chris@42 401 {
Chris@42 402 E Tk, Tm, Tj, Tl;
Chris@42 403 Tk = ri[WS(rs, 10)];
Chris@42 404 Tm = ii[WS(rs, 10)];
Chris@42 405 Tj = W[18];
Chris@42 406 Tl = W[19];
Chris@42 407 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@42 408 T1b = FNMS(Tl, Tk, Tj * Tm);
Chris@42 409 }
Chris@42 410 T1d = KP866025403 * (T1b - T1c);
Chris@42 411 T24 = KP866025403 * (Ts - Tn);
Chris@42 412 Tt = Tn + Ts;
Chris@42 413 T1a = FNMS(KP500000000, Tt, Ti);
Chris@42 414 T1T = T1b + T1c;
Chris@42 415 T25 = FNMS(KP500000000, T1T, T1S);
Chris@42 416 }
/* Group {3, 11, 7}: W[4], W[5] / W[20], W[21] / W[12], W[13]. These are the last loads of the pass. */
Chris@42 417 {
Chris@42 418 E TK, T1i, TF, T1h;
Chris@42 419 {
Chris@42 420 E Tx, Tz, Tw, Ty;
Chris@42 421 Tx = ri[WS(rs, 3)];
Chris@42 422 Tz = ii[WS(rs, 3)];
Chris@42 423 Tw = W[4];
Chris@42 424 Ty = W[5];
Chris@42 425 TA = FMA(Tw, Tx, Ty * Tz);
Chris@42 426 T1z = FNMS(Ty, Tx, Tw * Tz);
Chris@42 427 }
Chris@42 428 {
Chris@42 429 E TH, TJ, TG, TI;
Chris@42 430 TH = ri[WS(rs, 11)];
Chris@42 431 TJ = ii[WS(rs, 11)];
Chris@42 432 TG = W[20];
Chris@42 433 TI = W[21];
Chris@42 434 TK = FMA(TG, TH, TI * TJ);
Chris@42 435 T1i = FNMS(TI, TH, TG * TJ);
Chris@42 436 }
Chris@42 437 {
Chris@42 438 E TC, TE, TB, TD;
Chris@42 439 TC = ri[WS(rs, 7)];
Chris@42 440 TE = ii[WS(rs, 7)];
Chris@42 441 TB = W[12];
Chris@42 442 TD = W[13];
Chris@42 443 TF = FMA(TB, TC, TD * TE);
Chris@42 444 T1h = FNMS(TD, TC, TB * TE);
Chris@42 445 }
Chris@42 446 T1j = KP866025403 * (T1h - T1i);
Chris@42 447 T1y = KP866025403 * (TK - TF);
Chris@42 448 TL = TF + TK;
Chris@42 449 T1g = FNMS(KP500000000, TL, TA);
Chris@42 450 T1A = T1h + T1i;
Chris@42 451 T1B = FNMS(KP500000000, T1A, T1z);
Chris@42 452 }
/* Output slots 6, 0, 3, 9: built from the plain sums of the four groups. */
Chris@42 453 {
Chris@42 454 E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
Chris@42 455 {
Chris@42 456 E Td, Tu, T1U, T1X;
Chris@42 457 Td = T1 + Tc;
Chris@42 458 Tu = Ti + Tt;
Chris@42 459 Tv = Td + Tu;
Chris@42 460 T1N = Td - Tu;
Chris@42 461 T1U = T1S + T1T;
Chris@42 462 T1X = T1V + T1W;
Chris@42 463 T1Y = T1U + T1X;
Chris@42 464 T20 = T1X - T1U;
Chris@42 465 }
Chris@42 466 {
Chris@42 467 E TM, T13, T1O, T1P;
Chris@42 468 TM = TA + TL;
Chris@42 469 T13 = TR + T12;
Chris@42 470 T14 = TM + T13;
Chris@42 471 T1Z = TM - T13;
Chris@42 472 T1O = T1z + T1A;
Chris@42 473 T1P = T1E + T1F;
Chris@42 474 T1Q = T1O - T1P;
Chris@42 475 T1R = T1O + T1P;
Chris@42 476 }
Chris@42 477 ri[WS(rs, 6)] = Tv - T14;
Chris@42 478 ii[WS(rs, 6)] = T1Y - T1R;
Chris@42 479 ri[0] = Tv + T14;
Chris@42 480 ii[0] = T1R + T1Y;
Chris@42 481 ri[WS(rs, 3)] = T1N - T1Q;
Chris@42 482 ii[WS(rs, 3)] = T1Z + T20;
Chris@42 483 ri[WS(rs, 9)] = T1N + T1Q;
Chris@42 484 ii[WS(rs, 9)] = T20 - T1Z;
Chris@42 485 }
/* Output slots 10, 4, 7, 1: half-sum terms plus the KP866025403-scaled differences. */
Chris@42 486 {
Chris@42 487 E T1t, T1x, T27, T2a, T1w, T28, T1I, T29;
Chris@42 488 {
Chris@42 489 E T1r, T1s, T23, T26;
Chris@42 490 T1r = T15 + T18;
Chris@42 491 T1s = T1a + T1d;
Chris@42 492 T1t = T1r + T1s;
Chris@42 493 T1x = T1r - T1s;
Chris@42 494 T23 = T21 + T22;
Chris@42 495 T26 = T24 + T25;
Chris@42 496 T27 = T23 - T26;
Chris@42 497 T2a = T26 + T23;
Chris@42 498 }
Chris@42 499 {
Chris@42 500 E T1u, T1v, T1C, T1H;
Chris@42 501 T1u = T1g + T1j;
Chris@42 502 T1v = T1l + T1o;
Chris@42 503 T1w = T1u + T1v;
Chris@42 504 T28 = T1u - T1v;
Chris@42 505 T1C = T1y + T1B;
Chris@42 506 T1H = T1D + T1G;
Chris@42 507 T1I = T1C - T1H;
Chris@42 508 T29 = T1C + T1H;
Chris@42 509 }
Chris@42 510 ri[WS(rs, 10)] = T1t - T1w;
Chris@42 511 ii[WS(rs, 10)] = T2a - T29;
Chris@42 512 ri[WS(rs, 4)] = T1t + T1w;
Chris@42 513 ii[WS(rs, 4)] = T29 + T2a;
Chris@42 514 ri[WS(rs, 7)] = T1x - T1I;
Chris@42 515 ii[WS(rs, 7)] = T28 + T27;
Chris@42 516 ri[WS(rs, 1)] = T1x + T1I;
Chris@42 517 ii[WS(rs, 1)] = T27 - T28;
Chris@42 518 }
/* Output slots 2, 8, 11, 5: half-sum terms minus the KP866025403-scaled differences. */
Chris@42 519 {
Chris@42 520 E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e;
Chris@42 521 {
Chris@42 522 E T19, T1e, T2b, T2c;
Chris@42 523 T19 = T15 - T18;
Chris@42 524 T1e = T1a - T1d;
Chris@42 525 T1f = T19 + T1e;
Chris@42 526 T1J = T19 - T1e;
Chris@42 527 T2b = T25 - T24;
Chris@42 528 T2c = T22 - T21;
Chris@42 529 T2d = T2b + T2c;
Chris@42 530 T2f = T2c - T2b;
Chris@42 531 }
Chris@42 532 {
Chris@42 533 E T1k, T1p, T1K, T1L;
Chris@42 534 T1k = T1g - T1j;
Chris@42 535 T1p = T1l - T1o;
Chris@42 536 T1q = T1k + T1p;
Chris@42 537 T2g = T1k - T1p;
Chris@42 538 T1K = T1B - T1y;
Chris@42 539 T1L = T1G - T1D;
Chris@42 540 T1M = T1K - T1L;
Chris@42 541 T2e = T1K + T1L;
Chris@42 542 }
Chris@42 543 ri[WS(rs, 2)] = T1f - T1q;
Chris@42 544 ii[WS(rs, 2)] = T2d - T2e;
Chris@42 545 ri[WS(rs, 8)] = T1f + T1q;
Chris@42 546 ii[WS(rs, 8)] = T2e + T2d;
Chris@42 547 ri[WS(rs, 11)] = T1J - T1M;
Chris@42 548 ii[WS(rs, 11)] = T2g + T2f;
Chris@42 549 ri[WS(rs, 5)] = T1J + T1M;
Chris@42 550 ii[WS(rs, 5)] = T2f - T2g;
Chris@42 551 }
Chris@42 552 }
Chris@42 553 }
Chris@42 554 }
Chris@42 555
/*
 * Two-entry tw_instr table referenced by the codelet descriptor `desc`
 * (same contents as in the HAVE_FMA branch).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are defined outside this file;
 * presumably this describes the layout of the 22 W values t1_12 consumes per
 * iteration -- confirm against the tw_instr definition.
 */
Chris@42 556 static const tw_instr twinstr[] = {
Chris@42 557 {TW_FULL, 0, 12},
Chris@42 558 {TW_NEXT, 1, 0}
Chris@42 559 };
Chris@42 560
/*
 * Descriptor passed to X(kdft_dit_register) together with t1_12.
 * Fields, in order: 12 (presumably the codelet size), the name string,
 * the twinstr table, &GENUS, and {88, 30, 30, 0}, whose first three values
 * match the generator's header comment for this branch (88 additions,
 * 30 multiplications, 30 fused multiply/add). The meaning of the three
 * trailing zeros is not visible in this file.
 */
Chris@42 561 static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, {88, 30, 30, 0}, 0, 0, 0 };
Chris@42 562
/*
 * Registration entry point for the build without HAVE_FMA: hands the t1_12
 * kernel and its descriptor to planner p via X(kdft_dit_register).
 * NOTE(review): X() is defined outside this file; presumably it applies the
 * library's symbol prefix -- confirm.
 */
Chris@42 563 void X(codelet_t1_12) (planner *p) {
Chris@42 564 X(kdft_dit_register) (p, t1_12, &desc);
Chris@42 565 }
Chris@42 566 #endif /* HAVE_FMA */