annotate src/fftw-3.3.3/dft/scalar/codelets/t2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:36:00 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@10 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@10 33 * 100 stack variables, 3 constants, and 64 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t2_16 (HAVE_FMA variant): in-place size-16 twiddle codelet.
 *
 * Each loop iteration reads the 16 element pairs ri[WS(rs, k)] /
 * ii[WS(rs, k)] for k = 0..15, combines every pair except k = 0 with a
 * pair of twiddle coefficients, runs the 16-point butterfly network and
 * stores the 16 result pairs back to the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  arrays that are both read and overwritten (presumably the real
 *           and imaginary parts of the data -- confirm against callers);
 *           both are advanced by ms after every iteration.
 *   W       read-only twiddle table; positioned at W + mb * 8 before the
 *           first iteration and advanced by 8 per iteration. Exactly eight
 *           values, W[0..7], are read per iteration.
 *   rs      stride handed to WS(rs, k) to address element k.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * Twiddle handling: (W[0], W[1]), (W[2], W[3]), (W[4], W[5]) and
 * (W[6], W[7]) are applied directly to inputs 1, 3, 9 and 15, matching
 * the four TW_CEXP entries of twinstr. The coefficient pairs for the
 * remaining inputs are formed inside the loop as products of those values.
 *
 * Constants: KP707106781 is numerically sqrt(1/2), KP414213562 is
 * sqrt(2) - 1 (= tan(pi/8)) and KP923879532 is cos(pi/8).
 *
 * E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included project headers, not from this file. FMA(a, b, c) and
 * FNMS(a, b, c) presumably expand to a * b + c and c - a * b -- TODO
 * confirm against the header definitions.
 */
Chris@10 37 static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
Chris@10 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 42 {
Chris@10 43 INT m;
Chris@10 44 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
/* T3S and T3R are declared at loop scope because they are assigned deep
 * inside the nested blocks but consumed by the final two stores below. */
Chris@10 45 E T3S, T3R;
Chris@10 46 {
Chris@10 47 E T2, Tf, TM, TO, T3, Tg, TN, TS, T4, Tp, T6, T5, Th;
/* Load the eight stored twiddle values W[0..7] and start forming their
 * pairwise products. */
Chris@10 48 T2 = W[0];
Chris@10 49 Tf = W[2];
Chris@10 50 TM = W[6];
Chris@10 51 TO = W[7];
Chris@10 52 T3 = W[4];
Chris@10 53 Tg = T2 * Tf;
Chris@10 54 TN = T2 * TM;
Chris@10 55 TS = T2 * TO;
Chris@10 56 T4 = T2 * T3;
Chris@10 57 Tp = Tf * T3;
Chris@10 58 T6 = W[5];
Chris@10 59 T5 = W[1];
Chris@10 60 Th = W[3];
Chris@10 61 {
Chris@10 62 E TZ, Te, T1U, T3A, T3L, T2D, T1G, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M;
Chris@10 63 E T1Z, T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, TX;
Chris@10 64 E T10, TV, T2a, TY, T2b;
Chris@10 65 {
Chris@10 66 E TF, TP, TT, Tq, TW, Tz, Tu, TI, TC, T1m, T1f, T1p, T1j, Tr, Ts;
Chris@10 67 E Tv, To, T1W;
Chris@10 68 {
Chris@10 69 E Ti, Tm, T1L, T1O, T1D, T1A, T1x, T2y, T1F, T2x;
Chris@10 70 {
Chris@10 71 E T1, T7, Tb, T3z, T8, T1z, T9, Tc;
Chris@10 72 {
Chris@10 73 E T1i, T1e, T1C, T1y, Tt, Ta, Tl;
/* Derive the coefficient pairs for the inputs that have no stored
 * twiddle, interleaved with the first loads of inputs 0 and 8. */
Chris@10 74 T1 = ri[0];
Chris@10 75 Tt = Tf * T6;
Chris@10 76 Ta = T2 * T6;
Chris@10 77 T7 = FMA(T5, T6, T4);
Chris@10 78 TF = FNMS(T5, T6, T4);
Chris@10 79 TP = FMA(T5, TO, TN);
Chris@10 80 TT = FNMS(T5, TM, TS);
Chris@10 81 Tq = FNMS(Th, T6, Tp);
Chris@10 82 TW = FMA(Th, T6, Tp);
Chris@10 83 Tz = FMA(T5, Th, Tg);
Chris@10 84 Ti = FNMS(T5, Th, Tg);
Chris@10 85 Tl = T2 * Th;
Chris@10 86 Tu = FMA(Th, T3, Tt);
Chris@10 87 TZ = FNMS(Th, T3, Tt);
Chris@10 88 TI = FMA(T5, T3, Ta);
Chris@10 89 Tb = FNMS(T5, T3, Ta);
Chris@10 90 T1i = Ti * T6;
Chris@10 91 T1e = Ti * T3;
Chris@10 92 T1C = Tz * T6;
Chris@10 93 T1y = Tz * T3;
Chris@10 94 Tm = FMA(T5, Tf, Tl);
Chris@10 95 TC = FNMS(T5, Tf, Tl);
Chris@10 96 T3z = ii[0];
Chris@10 97 T8 = ri[WS(rs, 8)];
Chris@10 98 T1m = FNMS(Tm, T6, T1e);
Chris@10 99 T1f = FMA(Tm, T6, T1e);
Chris@10 100 T1p = FMA(Tm, T3, T1i);
Chris@10 101 T1j = FNMS(Tm, T3, T1i);
Chris@10 102 T1L = FNMS(TC, T6, T1y);
Chris@10 103 T1z = FMA(TC, T6, T1y);
Chris@10 104 T1O = FMA(TC, T3, T1C);
Chris@10 105 T1D = FNMS(TC, T3, T1C);
Chris@10 106 T9 = T7 * T8;
Chris@10 107 Tc = ii[WS(rs, 8)];
Chris@10 108 }
Chris@10 109 {
Chris@10 110 E T1u, T1w, T1v, T2w, T3y, T1B, T1E, Td, T3x;
Chris@10 111 T1u = ri[WS(rs, 15)];
Chris@10 112 T1w = ii[WS(rs, 15)];
Chris@10 113 T1A = ri[WS(rs, 7)];
Chris@10 114 Td = FMA(Tb, Tc, T9);
Chris@10 115 T3x = T7 * Tc;
Chris@10 116 T1v = TM * T1u;
Chris@10 117 T2w = TM * T1w;
Chris@10 118 Te = T1 + Td;
Chris@10 119 T1U = T1 - Td;
Chris@10 120 T3y = FNMS(Tb, T8, T3x);
Chris@10 121 T1B = T1z * T1A;
Chris@10 122 T1E = ii[WS(rs, 7)];
Chris@10 123 T1x = FMA(TO, T1w, T1v);
Chris@10 124 T3A = T3y + T3z;
Chris@10 125 T3L = T3z - T3y;
Chris@10 126 T2y = T1z * T1E;
Chris@10 127 T1F = FMA(T1D, T1E, T1B);
Chris@10 128 T2x = FNMS(TO, T1u, T2w);
Chris@10 129 }
Chris@10 130 }
Chris@10 131 {
Chris@10 132 E T1H, T1I, T1J, T1M, T1P, T2z;
Chris@10 133 T1H = ri[WS(rs, 3)];
Chris@10 134 T2z = FNMS(T1D, T1A, T2y);
Chris@10 135 T2D = T1x - T1F;
Chris@10 136 T1G = T1x + T1F;
Chris@10 137 T1I = Tf * T1H;
Chris@10 138 T2A = T2x - T2z;
Chris@10 139 T3h = T2x + T2z;
Chris@10 140 T1J = ii[WS(rs, 3)];
Chris@10 141 T1M = ri[WS(rs, 11)];
Chris@10 142 T1P = ii[WS(rs, 11)];
Chris@10 143 {
Chris@10 144 E Tj, Tk, Tn, T1V;
Chris@10 145 {
Chris@10 146 E T1K, T2F, T1Q, T2H, T2E, T1N, T2G;
Chris@10 147 Tj = ri[WS(rs, 4)];
Chris@10 148 T1K = FMA(Th, T1J, T1I);
Chris@10 149 T2E = Tf * T1J;
Chris@10 150 T1N = T1L * T1M;
Chris@10 151 T2G = T1L * T1P;
Chris@10 152 Tk = Ti * Tj;
Chris@10 153 T2F = FNMS(Th, T1H, T2E);
Chris@10 154 T1Q = FMA(T1O, T1P, T1N);
Chris@10 155 T2H = FNMS(T1O, T1M, T2G);
Chris@10 156 Tn = ii[WS(rs, 4)];
Chris@10 157 Tr = ri[WS(rs, 12)];
Chris@10 158 T1R = T1K + T1Q;
Chris@10 159 T2B = T1K - T1Q;
Chris@10 160 T2I = T2F - T2H;
Chris@10 161 T3i = T2F + T2H;
Chris@10 162 T1V = Ti * Tn;
Chris@10 163 Ts = Tq * Tr;
Chris@10 164 Tv = ii[WS(rs, 12)];
Chris@10 165 }
Chris@10 166 To = FMA(Tm, Tn, Tk);
Chris@10 167 T1W = FNMS(Tm, Tj, T1V);
Chris@10 168 }
Chris@10 169 }
Chris@10 170 }
Chris@10 171 {
Chris@10 172 E T19, T1b, T18, T2i, T1a, T2j;
Chris@10 173 {
Chris@10 174 E TE, T22, TK, T24;
Chris@10 175 {
Chris@10 176 E TA, TD, TB, T21, TG, TJ, TH, T23, T1Y, Tw, T1X;
Chris@10 177 TA = ri[WS(rs, 2)];
Chris@10 178 Tw = FMA(Tu, Tv, Ts);
Chris@10 179 T1X = Tq * Tv;
Chris@10 180 TD = ii[WS(rs, 2)];
Chris@10 181 TB = Tz * TA;
Chris@10 182 Tx = To + Tw;
Chris@10 183 T3M = To - Tw;
Chris@10 184 T1Y = FNMS(Tu, Tr, T1X);
Chris@10 185 T21 = Tz * TD;
Chris@10 186 TG = ri[WS(rs, 10)];
Chris@10 187 TJ = ii[WS(rs, 10)];
Chris@10 188 T1Z = T1W - T1Y;
Chris@10 189 T3w = T1W + T1Y;
Chris@10 190 TH = TF * TG;
Chris@10 191 T23 = TF * TJ;
Chris@10 192 TE = FMA(TC, TD, TB);
Chris@10 193 T22 = FNMS(TC, TA, T21);
Chris@10 194 TK = FMA(TI, TJ, TH);
Chris@10 195 T24 = FNMS(TI, TG, T23);
Chris@10 196 }
Chris@10 197 {
Chris@10 198 E T15, T17, T16, T2h;
Chris@10 199 T15 = ri[WS(rs, 1)];
Chris@10 200 T17 = ii[WS(rs, 1)];
Chris@10 201 TL = TE + TK;
Chris@10 202 T26 = TE - TK;
Chris@10 203 T25 = T22 - T24;
Chris@10 204 T37 = T22 + T24;
Chris@10 205 T16 = T2 * T15;
Chris@10 206 T2h = T2 * T17;
Chris@10 207 T19 = ri[WS(rs, 9)];
Chris@10 208 T1b = ii[WS(rs, 9)];
Chris@10 209 T18 = FMA(T5, T17, T16);
Chris@10 210 T2i = FNMS(T5, T15, T2h);
Chris@10 211 T1a = T3 * T19;
Chris@10 212 T2j = T3 * T1b;
Chris@10 213 }
Chris@10 214 }
Chris@10 215 {
Chris@10 216 E T1n, T1q, T1l, T2q, T1o, T2r;
Chris@10 217 {
Chris@10 218 E T1g, T1k, T1h, T2p, T1c, T2k;
Chris@10 219 T1g = ri[WS(rs, 5)];
Chris@10 220 T1k = ii[WS(rs, 5)];
Chris@10 221 T1c = FMA(T6, T1b, T1a);
Chris@10 222 T2k = FNMS(T6, T19, T2j);
Chris@10 223 T1h = T1f * T1g;
Chris@10 224 T2p = T1f * T1k;
Chris@10 225 T1d = T18 + T1c;
Chris@10 226 T2o = T18 - T1c;
Chris@10 227 T2l = T2i - T2k;
Chris@10 228 T3c = T2i + T2k;
Chris@10 229 T1n = ri[WS(rs, 13)];
Chris@10 230 T1q = ii[WS(rs, 13)];
Chris@10 231 T1l = FMA(T1j, T1k, T1h);
Chris@10 232 T2q = FNMS(T1j, T1g, T2p);
Chris@10 233 T1o = T1m * T1n;
Chris@10 234 T2r = T1m * T1q;
Chris@10 235 }
Chris@10 236 {
Chris@10 237 E TQ, TU, TR, T29, T1r, T2s;
Chris@10 238 TQ = ri[WS(rs, 14)];
Chris@10 239 TU = ii[WS(rs, 14)];
Chris@10 240 T1r = FMA(T1p, T1q, T1o);
Chris@10 241 T2s = FNMS(T1p, T1n, T2r);
Chris@10 242 TR = TP * TQ;
Chris@10 243 T29 = TP * TU;
Chris@10 244 T1s = T1l + T1r;
Chris@10 245 T2m = T1l - T1r;
Chris@10 246 T2t = T2q - T2s;
Chris@10 247 T3d = T2q + T2s;
Chris@10 248 TX = ri[WS(rs, 6)];
Chris@10 249 T10 = ii[WS(rs, 6)];
Chris@10 250 TV = FMA(TT, TU, TR);
Chris@10 251 T2a = FNMS(TT, TQ, T29);
Chris@10 252 TY = TW * TX;
Chris@10 253 T2b = TW * T10;
Chris@10 254 }
Chris@10 255 }
Chris@10 256 }
Chris@10 257 }
/* Every input has been loaded by this point; the remaining code combines
 * the intermediate sums/differences and stores the 16 outputs. */
Chris@10 258 {
Chris@10 259 E T36, T3G, T3b, T3g, T28, T2d, T3F, T39, T3e, T3q, T3C, T3j, T3u, T3t;
Chris@10 260 {
Chris@10 261 E T3D, T1T, T3r, T14, T3E, T3s;
Chris@10 262 {
Chris@10 263 E Ty, T3B, T11, T2c, T13, T3v;
Chris@10 264 T36 = Te - Tx;
Chris@10 265 Ty = Te + Tx;
Chris@10 266 T3B = T3w + T3A;
Chris@10 267 T3G = T3A - T3w;
Chris@10 268 T11 = FMA(TZ, T10, TY);
Chris@10 269 T2c = FNMS(TZ, TX, T2b);
Chris@10 270 {
Chris@10 271 E T1t, T1S, T12, T38;
Chris@10 272 T3b = T1d - T1s;
Chris@10 273 T1t = T1d + T1s;
Chris@10 274 T1S = T1G + T1R;
Chris@10 275 T3g = T1G - T1R;
Chris@10 276 T12 = TV + T11;
Chris@10 277 T28 = TV - T11;
Chris@10 278 T2d = T2a - T2c;
Chris@10 279 T38 = T2a + T2c;
Chris@10 280 T3D = T1S - T1t;
Chris@10 281 T1T = T1t + T1S;
Chris@10 282 T13 = TL + T12;
Chris@10 283 T3F = T12 - TL;
Chris@10 284 T39 = T37 - T38;
Chris@10 285 T3v = T37 + T38;
Chris@10 286 }
Chris@10 287 T3e = T3c - T3d;
Chris@10 288 T3r = T3c + T3d;
Chris@10 289 T3q = Ty - T13;
Chris@10 290 T14 = Ty + T13;
Chris@10 291 T3E = T3B - T3v;
Chris@10 292 T3C = T3v + T3B;
Chris@10 293 T3s = T3h + T3i;
Chris@10 294 T3j = T3h - T3i;
Chris@10 295 }
Chris@10 296 ri[WS(rs, 8)] = T14 - T1T;
Chris@10 297 ri[0] = T14 + T1T;
Chris@10 298 ii[WS(rs, 12)] = T3E - T3D;
Chris@10 299 T3u = T3r + T3s;
Chris@10 300 T3t = T3r - T3s;
Chris@10 301 ii[WS(rs, 4)] = T3D + T3E;
Chris@10 302 }
Chris@10 303 {
Chris@10 304 E T3m, T3a, T3J, T3H;
Chris@10 305 ii[0] = T3u + T3C;
Chris@10 306 ii[WS(rs, 8)] = T3C - T3u;
Chris@10 307 ri[WS(rs, 4)] = T3q + T3t;
Chris@10 308 ri[WS(rs, 12)] = T3q - T3t;
Chris@10 309 T3m = T36 - T39;
Chris@10 310 T3a = T36 + T39;
Chris@10 311 T3J = T3G - T3F;
Chris@10 312 T3H = T3F + T3G;
Chris@10 313 {
Chris@10 314 E T2Q, T20, T3N, T3T, T2J, T2C, T3O, T2f, T34, T30, T2W, T2V, T3U, T2T, T2N;
Chris@10 315 E T2v;
Chris@10 316 {
Chris@10 317 E T2R, T27, T2e, T2S;
Chris@10 318 {
Chris@10 319 E T3n, T3f, T3o, T3k;
Chris@10 320 T2Q = T1U + T1Z;
Chris@10 321 T20 = T1U - T1Z;
Chris@10 322 T3n = T3e - T3b;
Chris@10 323 T3f = T3b + T3e;
Chris@10 324 T3o = T3g + T3j;
Chris@10 325 T3k = T3g - T3j;
Chris@10 326 T3N = T3L - T3M;
Chris@10 327 T3T = T3M + T3L;
Chris@10 328 {
Chris@10 329 E T3p, T3I, T3K, T3l;
Chris@10 330 T3p = T3n - T3o;
Chris@10 331 T3I = T3n + T3o;
Chris@10 332 T3K = T3k - T3f;
Chris@10 333 T3l = T3f + T3k;
Chris@10 334 ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
Chris@10 335 ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
Chris@10 336 ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
Chris@10 337 ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
Chris@10 338 ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
Chris@10 339 ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
Chris@10 340 ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
Chris@10 341 ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
Chris@10 342 T2R = T26 + T25;
Chris@10 343 T27 = T25 - T26;
Chris@10 344 T2e = T28 + T2d;
Chris@10 345 T2S = T28 - T2d;
Chris@10 346 }
Chris@10 347 }
Chris@10 348 {
Chris@10 349 E T2Y, T2Z, T2n, T2u;
Chris@10 350 T2J = T2D - T2I;
Chris@10 351 T2Y = T2D + T2I;
Chris@10 352 T2Z = T2A - T2B;
Chris@10 353 T2C = T2A + T2B;
Chris@10 354 T3O = T27 + T2e;
Chris@10 355 T2f = T27 - T2e;
Chris@10 356 T34 = FMA(KP414213562, T2Y, T2Z);
Chris@10 357 T30 = FNMS(KP414213562, T2Z, T2Y);
Chris@10 358 T2W = T2l - T2m;
Chris@10 359 T2n = T2l + T2m;
Chris@10 360 T2u = T2o - T2t;
Chris@10 361 T2V = T2o + T2t;
Chris@10 362 T3U = T2S - T2R;
Chris@10 363 T2T = T2R + T2S;
Chris@10 364 T2N = FNMS(KP414213562, T2n, T2u);
Chris@10 365 T2v = FMA(KP414213562, T2u, T2n);
Chris@10 366 }
Chris@10 367 }
Chris@10 368 {
Chris@10 369 E T33, T2X, T3X, T3Y;
Chris@10 370 {
Chris@10 371 E T2M, T2g, T2O, T2K, T3V, T3W, T2P, T2L;
Chris@10 372 T2M = FNMS(KP707106781, T2f, T20);
Chris@10 373 T2g = FMA(KP707106781, T2f, T20);
Chris@10 374 T33 = FNMS(KP414213562, T2V, T2W);
Chris@10 375 T2X = FMA(KP414213562, T2W, T2V);
Chris@10 376 T2O = FMA(KP414213562, T2C, T2J);
Chris@10 377 T2K = FNMS(KP414213562, T2J, T2C);
Chris@10 378 T3V = FMA(KP707106781, T3U, T3T);
Chris@10 379 T3X = FNMS(KP707106781, T3U, T3T);
Chris@10 380 T3W = T2O - T2N;
Chris@10 381 T2P = T2N + T2O;
Chris@10 382 T3Y = T2v + T2K;
Chris@10 383 T2L = T2v - T2K;
Chris@10 384 ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
Chris@10 385 ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
Chris@10 386 ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
Chris@10 387 ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
Chris@10 388 ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
Chris@10 389 ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
Chris@10 390 }
Chris@10 391 {
Chris@10 392 E T32, T3P, T3Q, T35, T2U, T31;
Chris@10 393 T32 = FNMS(KP707106781, T2T, T2Q);
Chris@10 394 T2U = FMA(KP707106781, T2T, T2Q);
Chris@10 395 T31 = T2X + T30;
Chris@10 396 T3S = T30 - T2X;
Chris@10 397 T3R = FNMS(KP707106781, T3O, T3N);
Chris@10 398 T3P = FMA(KP707106781, T3O, T3N);
Chris@10 399 ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
Chris@10 400 ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
Chris@10 401 ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
Chris@10 402 ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
Chris@10 403 T3Q = T33 + T34;
Chris@10 404 T35 = T33 - T34;
Chris@10 405 ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
Chris@10 406 ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
Chris@10 407 ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
Chris@10 408 ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
Chris@10 409 }
Chris@10 410 }
Chris@10 411 }
Chris@10 412 }
Chris@10 413 }
Chris@10 414 }
Chris@10 415 }
/* Final two stores, using the loop-scope temporaries T3S and T3R. */
Chris@10 416 ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
Chris@10 417 ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
Chris@10 418 }
Chris@10 419 }
Chris@10 420 }
Chris@10 421
/*
 * Twiddle instruction table referenced by desc (HAVE_FMA branch).
 * Four TW_CEXP entries whose last field is 1, 3, 9 and 15, followed by a
 * TW_NEXT terminator entry. t2_16 consumes eight W values per iteration,
 * which is consistent with one (re, im) pair per TW_CEXP entry --
 * NOTE(review): the exact field semantics of tw_instr are declared in the
 * project headers; confirm there.
 */
Chris@10 422 static const tw_instr twinstr[] = {
Chris@10 423 {TW_CEXP, 0, 1},
Chris@10 424 {TW_CEXP, 0, 3},
Chris@10 425 {TW_CEXP, 0, 9},
Chris@10 426 {TW_CEXP, 0, 15},
Chris@10 427 {TW_NEXT, 1, 0}
Chris@10 428 };
Chris@10 429
/*
 * Codelet descriptor for the HAVE_FMA build: size 16, name "t2_16", the
 * twinstr table, the GENUS object supplied by the included headers, and
 * the operation counts {104, 42, 92, 0}, which match the "104 additions,
 * 42 multiplications, 92 fused multiply/add" figures in the generated
 * comment for this variant. The three trailing zeros are further ct_desc
 * fields whose meaning is not visible in this file.
 */
Chris@10 430 static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {104, 42, 92, 0}, 0, 0, 0 };
Chris@10 431
/*
 * Registration entry point (HAVE_FMA branch): hands the t2_16 function and
 * its descriptor to planner p through X(kdft_dit_register). X() is a
 * project macro applied to external names (presumably a precision-specific
 * name prefix -- confirm in the project headers).
 */
Chris@10 432 void X(codelet_t2_16) (planner *p) {
Chris@10 433 X(kdft_dit_register) (p, t2_16, &desc);
Chris@10 434 }
Chris@10 435 #else /* HAVE_FMA */
Chris@10 436
Chris@10 437 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include t.h */
Chris@10 438
Chris@10 439 /*
Chris@10 440 * This function contains 196 FP additions, 108 FP multiplications,
Chris@10 441 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@10 442 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@10 443 */
Chris@10 444 #include "t.h"
Chris@10 445
/*
 * t2_16 (variant built when HAVE_FMA is not defined): in-place size-16
 * twiddle codelet.
 *
 * Each loop iteration reads the 16 element pairs ri[WS(rs, k)] /
 * ii[WS(rs, k)] for k = 0..15, combines every pair except k = 0 with a
 * pair of twiddle coefficients, runs the 16-point butterfly network and
 * stores the 16 result pairs back to the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  arrays that are both read and overwritten (presumably the real
 *           and imaginary parts of the data -- confirm against callers);
 *           both are advanced by ms after every iteration.
 *   W       read-only twiddle table; positioned at W + mb * 8 before the
 *           first iteration and advanced by 8 per iteration. Exactly eight
 *           values, W[0..7], are read per iteration.
 *   rs      stride handed to WS(rs, k) to address element k.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * Twiddle handling: (W[0], W[1]), (W[2], W[3]), (W[4], W[5]) and
 * (W[6], W[7]) are applied directly to inputs 1, 3, 9 and 15, matching
 * the four TW_CEXP entries of twinstr. The coefficient pairs for the
 * remaining inputs are formed at the top of the loop body as sums and
 * differences of products of those values.
 *
 * Constants: KP707106781 is numerically sqrt(1/2), KP923879532 is
 * cos(pi/8) and KP382683432 is sin(pi/8).
 *
 * E, R, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included project headers, not from this file. FMA(a, b, c) and
 * FNMS(a, b, c) presumably expand to a * b + c and c - a * b -- TODO
 * confirm against the header definitions.
 */
Chris@10 446 static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 447 {
Chris@10 448 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 449 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 450 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 451 {
Chris@10 452 INT m;
Chris@10 453 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@10 454 E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
Chris@10 455 E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
/* Twiddle setup: load W[0..7] and build every derived coefficient pair
 * before any data element is touched. */
Chris@10 456 {
Chris@10 457 E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
Chris@10 458 {
Chris@10 459 E Th, Tn, Tj, Tm;
Chris@10 460 T2 = W[0];
Chris@10 461 T5 = W[1];
Chris@10 462 Tg = W[2];
Chris@10 463 Ti = W[3];
Chris@10 464 Th = T2 * Tg;
Chris@10 465 Tn = T5 * Tg;
Chris@10 466 Tj = T5 * Ti;
Chris@10 467 Tm = T2 * Ti;
Chris@10 468 Tk = Th - Tj;
Chris@10 469 To = Tm + Tn;
Chris@10 470 TE = Tm - Tn;
Chris@10 471 TC = Th + Tj;
Chris@10 472 T6 = W[5];
Chris@10 473 T7 = T5 * T6;
Chris@10 474 Tv = Tg * T6;
Chris@10 475 Ta = T2 * T6;
Chris@10 476 Ts = Ti * T6;
Chris@10 477 T3 = W[4];
Chris@10 478 T4 = T2 * T3;
Chris@10 479 Tw = Ti * T3;
Chris@10 480 Tb = T5 * T3;
Chris@10 481 Tr = Tg * T3;
Chris@10 482 }
Chris@10 483 T8 = T4 + T7;
Chris@10 484 TW = Tv - Tw;
Chris@10 485 TJ = Ta + Tb;
Chris@10 486 Tt = Tr - Ts;
Chris@10 487 TU = Tr + Ts;
Chris@10 488 Tc = Ta - Tb;
Chris@10 489 Tx = Tv + Tw;
Chris@10 490 TH = T4 - T7;
Chris@10 491 TN = W[6];
Chris@10 492 TO = W[7];
Chris@10 493 TP = FMA(T2, TN, T5 * TO);
Chris@10 494 TR = FNMS(T5, TN, T2 * TO);
Chris@10 495 {
Chris@10 496 E T1d, T1e, T19, T1a;
Chris@10 497 T1d = Tk * T6;
Chris@10 498 T1e = To * T3;
Chris@10 499 T1f = T1d - T1e;
Chris@10 500 T1k = T1d + T1e;
Chris@10 501 T19 = Tk * T3;
Chris@10 502 T1a = To * T6;
Chris@10 503 T1b = T19 + T1a;
Chris@10 504 T1i = T19 - T1a;
Chris@10 505 }
Chris@10 506 {
Chris@10 507 E T1w, T1x, T1s, T1t;
Chris@10 508 T1w = TC * T6;
Chris@10 509 T1x = TE * T3;
Chris@10 510 T1y = T1w - T1x;
Chris@10 511 T1H = T1w + T1x;
Chris@10 512 T1s = TC * T3;
Chris@10 513 T1t = TE * T6;
Chris@10 514 T1u = T1s + T1t;
Chris@10 515 T1F = T1s - T1t;
Chris@10 516 }
Chris@10 517 }
Chris@10 518 {
Chris@10 519 E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
Chris@10 520 E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
Chris@10 521 E T2S, T2T, T28, T2A, T2d, T2B;
/* Inputs 0 and 8. */
Chris@10 522 {
Chris@10 523 E T1, T3d, Te, T3c, T9, Td;
Chris@10 524 T1 = ri[0];
Chris@10 525 T3d = ii[0];
Chris@10 526 T9 = ri[WS(rs, 8)];
Chris@10 527 Td = ii[WS(rs, 8)];
Chris@10 528 Te = FMA(T8, T9, Tc * Td);
Chris@10 529 T3c = FNMS(Tc, T9, T8 * Td);
Chris@10 530 Tf = T1 + Te;
Chris@10 531 T3r = T3d - T3c;
Chris@10 532 T1N = T1 - Te;
Chris@10 533 T3e = T3c + T3d;
Chris@10 534 }
/* Inputs 4 and 12. */
Chris@10 535 {
Chris@10 536 E Tq, T1O, Tz, T1P;
Chris@10 537 {
Chris@10 538 E Tl, Tp, Tu, Ty;
Chris@10 539 Tl = ri[WS(rs, 4)];
Chris@10 540 Tp = ii[WS(rs, 4)];
Chris@10 541 Tq = FMA(Tk, Tl, To * Tp);
Chris@10 542 T1O = FNMS(To, Tl, Tk * Tp);
Chris@10 543 Tu = ri[WS(rs, 12)];
Chris@10 544 Ty = ii[WS(rs, 12)];
Chris@10 545 Tz = FMA(Tt, Tu, Tx * Ty);
Chris@10 546 T1P = FNMS(Tx, Tu, Tt * Ty);
Chris@10 547 }
Chris@10 548 TA = Tq + Tz;
Chris@10 549 T3s = Tq - Tz;
Chris@10 550 T1Q = T1O - T1P;
Chris@10 551 T3b = T1O + T1P;
Chris@10 552 }
/* Inputs 2 and 10. */
Chris@10 553 {
Chris@10 554 E TG, T1S, TL, T1T, T1U, T1V;
Chris@10 555 {
Chris@10 556 E TD, TF, TI, TK;
Chris@10 557 TD = ri[WS(rs, 2)];
Chris@10 558 TF = ii[WS(rs, 2)];
Chris@10 559 TG = FMA(TC, TD, TE * TF);
Chris@10 560 T1S = FNMS(TE, TD, TC * TF);
Chris@10 561 TI = ri[WS(rs, 10)];
Chris@10 562 TK = ii[WS(rs, 10)];
Chris@10 563 TL = FMA(TH, TI, TJ * TK);
Chris@10 564 T1T = FNMS(TJ, TI, TH * TK);
Chris@10 565 }
Chris@10 566 TM = TG + TL;
Chris@10 567 T2M = T1S + T1T;
Chris@10 568 T1U = T1S - T1T;
Chris@10 569 T1V = TG - TL;
Chris@10 570 T1W = T1U - T1V;
Chris@10 571 T2w = T1V + T1U;
Chris@10 572 }
/* Inputs 14 and 6. */
Chris@10 573 {
Chris@10 574 E TT, T1Y, TY, T1Z, T1X, T20;
Chris@10 575 {
Chris@10 576 E TQ, TS, TV, TX;
Chris@10 577 TQ = ri[WS(rs, 14)];
Chris@10 578 TS = ii[WS(rs, 14)];
Chris@10 579 TT = FMA(TP, TQ, TR * TS);
Chris@10 580 T1Y = FNMS(TR, TQ, TP * TS);
Chris@10 581 TV = ri[WS(rs, 6)];
Chris@10 582 TX = ii[WS(rs, 6)];
Chris@10 583 TY = FMA(TU, TV, TW * TX);
Chris@10 584 T1Z = FNMS(TW, TV, TU * TX);
Chris@10 585 }
Chris@10 586 TZ = TT + TY;
Chris@10 587 T2N = T1Y + T1Z;
Chris@10 588 T1X = TT - TY;
Chris@10 589 T20 = T1Y - T1Z;
Chris@10 590 T21 = T1X + T20;
Chris@10 591 T2x = T1X - T20;
Chris@10 592 }
/* Inputs 15, 11, 7 and 3. */
Chris@10 593 {
Chris@10 594 E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
Chris@10 595 {
Chris@10 596 E T1p, T1q, T1G, T1I;
Chris@10 597 T1p = ri[WS(rs, 15)];
Chris@10 598 T1q = ii[WS(rs, 15)];
Chris@10 599 T1r = FMA(TN, T1p, TO * T1q);
Chris@10 600 T2k = FNMS(TO, T1p, TN * T1q);
Chris@10 601 T1G = ri[WS(rs, 11)];
Chris@10 602 T1I = ii[WS(rs, 11)];
Chris@10 603 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@10 604 T2h = FNMS(T1H, T1G, T1F * T1I);
Chris@10 605 }
Chris@10 606 {
Chris@10 607 E T1v, T1z, T1C, T1D;
Chris@10 608 T1v = ri[WS(rs, 7)];
Chris@10 609 T1z = ii[WS(rs, 7)];
Chris@10 610 T1A = FMA(T1u, T1v, T1y * T1z);
Chris@10 611 T2l = FNMS(T1y, T1v, T1u * T1z);
Chris@10 612 T1C = ri[WS(rs, 3)];
Chris@10 613 T1D = ii[WS(rs, 3)];
Chris@10 614 T1E = FMA(Tg, T1C, Ti * T1D);
Chris@10 615 T2g = FNMS(Ti, T1C, Tg * T1D);
Chris@10 616 }
Chris@10 617 T1B = T1r + T1A;
Chris@10 618 T1K = T1E + T1J;
Chris@10 619 T2V = T1B - T1K;
Chris@10 620 T2W = T2k + T2l;
Chris@10 621 T2X = T2g + T2h;
Chris@10 622 T2Y = T2W - T2X;
Chris@10 623 {
Chris@10 624 E T2f, T2i, T2m, T2n;
Chris@10 625 T2f = T1r - T1A;
Chris@10 626 T2i = T2g - T2h;
Chris@10 627 T2j = T2f - T2i;
Chris@10 628 T2D = T2f + T2i;
Chris@10 629 T2m = T2k - T2l;
Chris@10 630 T2n = T1E - T1J;
Chris@10 631 T2o = T2m + T2n;
Chris@10 632 T2E = T2m - T2n;
Chris@10 633 }
Chris@10 634 }
/* Inputs 1, 13, 9 and 5. */
Chris@10 635 {
Chris@10 636 E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
Chris@10 637 {
Chris@10 638 E T12, T13, T1j, T1l;
Chris@10 639 T12 = ri[WS(rs, 1)];
Chris@10 640 T13 = ii[WS(rs, 1)];
Chris@10 641 T14 = FMA(T2, T12, T5 * T13);
Chris@10 642 T24 = FNMS(T5, T12, T2 * T13);
Chris@10 643 T1j = ri[WS(rs, 13)];
Chris@10 644 T1l = ii[WS(rs, 13)];
Chris@10 645 T1m = FMA(T1i, T1j, T1k * T1l);
Chris@10 646 T2b = FNMS(T1k, T1j, T1i * T1l);
Chris@10 647 }
Chris@10 648 {
Chris@10 649 E T15, T16, T1c, T1g;
Chris@10 650 T15 = ri[WS(rs, 9)];
Chris@10 651 T16 = ii[WS(rs, 9)];
Chris@10 652 T17 = FMA(T3, T15, T6 * T16);
Chris@10 653 T25 = FNMS(T6, T15, T3 * T16);
Chris@10 654 T1c = ri[WS(rs, 5)];
Chris@10 655 T1g = ii[WS(rs, 5)];
Chris@10 656 T1h = FMA(T1b, T1c, T1f * T1g);
Chris@10 657 T2a = FNMS(T1f, T1c, T1b * T1g);
Chris@10 658 }
Chris@10 659 T18 = T14 + T17;
Chris@10 660 T1n = T1h + T1m;
Chris@10 661 T2Q = T18 - T1n;
Chris@10 662 T2R = T24 + T25;
Chris@10 663 T2S = T2a + T2b;
Chris@10 664 T2T = T2R - T2S;
Chris@10 665 {
Chris@10 666 E T26, T27, T29, T2c;
Chris@10 667 T26 = T24 - T25;
Chris@10 668 T27 = T1h - T1m;
Chris@10 669 T28 = T26 + T27;
Chris@10 670 T2A = T26 - T27;
Chris@10 671 T29 = T14 - T17;
Chris@10 672 T2c = T2a - T2b;
Chris@10 673 T2d = T29 - T2c;
Chris@10 674 T2B = T29 + T2c;
Chris@10 675 }
Chris@10 676 }
/* Outputs 11, 3, 15 and 7. */
Chris@10 677 {
Chris@10 678 E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
Chris@10 679 {
Chris@10 680 E T1R, T22, T3y, T3z;
Chris@10 681 T1R = T1N - T1Q;
Chris@10 682 T22 = KP707106781 * (T1W - T21);
Chris@10 683 T23 = T1R + T22;
Chris@10 684 T2r = T1R - T22;
Chris@10 685 T3y = KP707106781 * (T2x - T2w);
Chris@10 686 T3z = T3s + T3r;
Chris@10 687 T3A = T3y + T3z;
Chris@10 688 T3C = T3z - T3y;
Chris@10 689 }
Chris@10 690 {
Chris@10 691 E T2e, T2p, T2s, T2t;
Chris@10 692 T2e = FMA(KP923879532, T28, KP382683432 * T2d);
Chris@10 693 T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
Chris@10 694 T2q = T2e + T2p;
Chris@10 695 T3B = T2p - T2e;
Chris@10 696 T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
Chris@10 697 T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
Chris@10 698 T2u = T2s - T2t;
Chris@10 699 T3x = T2s + T2t;
Chris@10 700 }
Chris@10 701 ri[WS(rs, 11)] = T23 - T2q;
Chris@10 702 ii[WS(rs, 11)] = T3A - T3x;
Chris@10 703 ri[WS(rs, 3)] = T23 + T2q;
Chris@10 704 ii[WS(rs, 3)] = T3x + T3A;
Chris@10 705 ri[WS(rs, 15)] = T2r - T2u;
Chris@10 706 ii[WS(rs, 15)] = T3C - T3B;
Chris@10 707 ri[WS(rs, 7)] = T2r + T2u;
Chris@10 708 ii[WS(rs, 7)] = T3B + T3C;
Chris@10 709 }
/* Outputs 10, 2, 14 and 6. */
Chris@10 710 {
Chris@10 711 E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
Chris@10 712 {
Chris@10 713 E T2L, T2O, T3k, T3l;
Chris@10 714 T2L = Tf - TA;
Chris@10 715 T2O = T2M - T2N;
Chris@10 716 T2P = T2L + T2O;
Chris@10 717 T31 = T2L - T2O;
Chris@10 718 T3k = TZ - TM;
Chris@10 719 T3l = T3e - T3b;
Chris@10 720 T3m = T3k + T3l;
Chris@10 721 T3o = T3l - T3k;
Chris@10 722 }
Chris@10 723 {
Chris@10 724 E T2U, T2Z, T32, T33;
Chris@10 725 T2U = T2Q + T2T;
Chris@10 726 T2Z = T2V - T2Y;
Chris@10 727 T30 = KP707106781 * (T2U + T2Z);
Chris@10 728 T3n = KP707106781 * (T2Z - T2U);
Chris@10 729 T32 = T2T - T2Q;
Chris@10 730 T33 = T2V + T2Y;
Chris@10 731 T34 = KP707106781 * (T32 - T33);
Chris@10 732 T3j = KP707106781 * (T32 + T33);
Chris@10 733 }
Chris@10 734 ri[WS(rs, 10)] = T2P - T30;
Chris@10 735 ii[WS(rs, 10)] = T3m - T3j;
Chris@10 736 ri[WS(rs, 2)] = T2P + T30;
Chris@10 737 ii[WS(rs, 2)] = T3j + T3m;
Chris@10 738 ri[WS(rs, 14)] = T31 - T34;
Chris@10 739 ii[WS(rs, 14)] = T3o - T3n;
Chris@10 740 ri[WS(rs, 6)] = T31 + T34;
Chris@10 741 ii[WS(rs, 6)] = T3n + T3o;
Chris@10 742 }
/* Outputs 9, 1, 13 and 5. */
Chris@10 743 {
Chris@10 744 E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
Chris@10 745 {
Chris@10 746 E T2v, T2y, T3q, T3t;
Chris@10 747 T2v = T1N + T1Q;
Chris@10 748 T2y = KP707106781 * (T2w + T2x);
Chris@10 749 T2z = T2v + T2y;
Chris@10 750 T2H = T2v - T2y;
Chris@10 751 T3q = KP707106781 * (T1W + T21);
Chris@10 752 T3t = T3r - T3s;
Chris@10 753 T3u = T3q + T3t;
Chris@10 754 T3w = T3t - T3q;
Chris@10 755 }
Chris@10 756 {
Chris@10 757 E T2C, T2F, T2I, T2J;
Chris@10 758 T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
Chris@10 759 T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
Chris@10 760 T2G = T2C + T2F;
Chris@10 761 T3v = T2F - T2C;
Chris@10 762 T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
Chris@10 763 T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
Chris@10 764 T2K = T2I - T2J;
Chris@10 765 T3p = T2I + T2J;
Chris@10 766 }
Chris@10 767 ri[WS(rs, 9)] = T2z - T2G;
Chris@10 768 ii[WS(rs, 9)] = T3u - T3p;
Chris@10 769 ri[WS(rs, 1)] = T2z + T2G;
Chris@10 770 ii[WS(rs, 1)] = T3p + T3u;
Chris@10 771 ri[WS(rs, 13)] = T2H - T2K;
Chris@10 772 ii[WS(rs, 13)] = T3w - T3v;
Chris@10 773 ri[WS(rs, 5)] = T2H + T2K;
Chris@10 774 ii[WS(rs, 5)] = T3v + T3w;
Chris@10 775 }
/* Outputs 8, 0, 12 and 4. */
Chris@10 776 {
Chris@10 777 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
Chris@10 778 {
Chris@10 779 E TB, T10, T3a, T3f;
Chris@10 780 TB = Tf + TA;
Chris@10 781 T10 = TM + TZ;
Chris@10 782 T11 = TB + T10;
Chris@10 783 T35 = TB - T10;
Chris@10 784 T3a = T2M + T2N;
Chris@10 785 T3f = T3b + T3e;
Chris@10 786 T3g = T3a + T3f;
Chris@10 787 T3i = T3f - T3a;
Chris@10 788 }
Chris@10 789 {
Chris@10 790 E T1o, T1L, T36, T37;
Chris@10 791 T1o = T18 + T1n;
Chris@10 792 T1L = T1B + T1K;
Chris@10 793 T1M = T1o + T1L;
Chris@10 794 T3h = T1L - T1o;
Chris@10 795 T36 = T2R + T2S;
Chris@10 796 T37 = T2W + T2X;
Chris@10 797 T38 = T36 - T37;
Chris@10 798 T39 = T36 + T37;
Chris@10 799 }
Chris@10 800 ri[WS(rs, 8)] = T11 - T1M;
Chris@10 801 ii[WS(rs, 8)] = T3g - T39;
Chris@10 802 ri[0] = T11 + T1M;
Chris@10 803 ii[0] = T39 + T3g;
Chris@10 804 ri[WS(rs, 12)] = T35 - T38;
Chris@10 805 ii[WS(rs, 12)] = T3i - T3h;
Chris@10 806 ri[WS(rs, 4)] = T35 + T38;
Chris@10 807 ii[WS(rs, 4)] = T3h + T3i;
Chris@10 808 }
Chris@10 809 }
Chris@10 810 }
Chris@10 811 }
Chris@10 812 }
Chris@10 813
/*
 * Twiddle instruction table referenced by desc (branch compiled when
 * HAVE_FMA is not defined). Four TW_CEXP entries whose last field is 1, 3,
 * 9 and 15, followed by a TW_NEXT terminator entry. t2_16 consumes eight W
 * values per iteration, which is consistent with one (re, im) pair per
 * TW_CEXP entry -- NOTE(review): the exact field semantics of tw_instr are
 * declared in the project headers; confirm there.
 */
Chris@10 814 static const tw_instr twinstr[] = {
Chris@10 815 {TW_CEXP, 0, 1},
Chris@10 816 {TW_CEXP, 0, 3},
Chris@10 817 {TW_CEXP, 0, 9},
Chris@10 818 {TW_CEXP, 0, 15},
Chris@10 819 {TW_NEXT, 1, 0}
Chris@10 820 };
Chris@10 821
/*
 * Codelet descriptor for the build without HAVE_FMA: size 16, name
 * "t2_16", the twinstr table, the GENUS object supplied by the included
 * headers, and the operation counts {156, 68, 40, 0}, which match the
 * "156 additions, 68 multiplications, 40 fused multiply/add" figures in
 * the generated comment for this variant. The three trailing zeros are
 * further ct_desc fields whose meaning is not visible in this file.
 */
Chris@10 822 static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };
Chris@10 823
/*
 * Registration entry point (branch compiled when HAVE_FMA is not defined):
 * hands the t2_16 function and its descriptor to planner p through
 * X(kdft_dit_register). X() is a project macro applied to external names
 * (presumably a precision-specific name prefix -- confirm in the project
 * headers).
 */
Chris@10 824 void X(codelet_t2_16) (planner *p) {
Chris@10 825 X(kdft_dit_register) (p, t2_16, &desc);
Chris@10 826 }
Chris@10 827 #endif /* HAVE_FMA */