annotate src/fftw-3.3.8/dft/scalar/codelets/t2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:19 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 196 FP additions, 134 FP multiplications,
Chris@82 32 * (or, 104 additions, 42 multiplications, 92 fused multiply/add),
Chris@82 33 * 90 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t2_16, variant selected when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined (see the enclosing #if).
 *
 * For every m in [mb, me) the loop body works in place on the 16 values
 * ri[WS(rs, k)] and the 16 values ii[WS(rs, k)], k = 0..15:
 *   - eight values W[0..7] are read and further multipliers are formed
 *     from products of them;
 *   - element 0 is used as loaded, every other element is combined with
 *     one multiplier pair;
 *   - the results are merged through additions/subtractions and the three
 *     constants declared below, then stored back to all 16 positions of
 *     ri and ii.
 * After each iteration ri and ii advance by ms and W advances by 8; W is
 * offset by mb * 8 before the first iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as provided by the included headers -- confirm.
 * NOTE(review): presumably ri/ii are the real/imaginary parts and W holds
 * the twiddle factors of a size-16 decimation-in-time step (the function
 * is registered through kdft_dit_register) -- confirm against the callers.
 */
Chris@82 37 static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and 1/sqrt(2). */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 42 {
Chris@82 43 INT m;
Chris@82 44 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 45 E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
Chris@82 46 E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
/* Read W[0..7] and derive the remaining multiplier pairs from their products. */
Chris@82 47 {
Chris@82 48 E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
Chris@82 49 T2 = W[0];
Chris@82 50 Tf = W[2];
Chris@82 51 Tg = T2 * Tf;
Chris@82 52 TM = W[6];
Chris@82 53 TN = T2 * TM;
Chris@82 54 TO = W[7];
Chris@82 55 TS = T2 * TO;
Chris@82 56 T3 = W[4];
Chris@82 57 T4 = T2 * T3;
Chris@82 58 Tp = Tf * T3;
Chris@82 59 T6 = W[5];
Chris@82 60 Ta = T2 * T6;
Chris@82 61 Tt = Tf * T6;
Chris@82 62 T5 = W[1];
Chris@82 63 Th = W[3];
Chris@82 64 Tl = T2 * Th;
Chris@82 65 Tz = FMA(T5, Th, Tg);
Chris@82 66 Ti = FNMS(T5, Th, Tg);
Chris@82 67 T7 = FMA(T5, T6, T4);
Chris@82 68 TZ = FNMS(Th, T3, Tt);
Chris@82 69 TT = FNMS(T5, TM, TS);
Chris@82 70 Tq = FNMS(Th, T6, Tp);
Chris@82 71 TW = FMA(Th, T6, Tp);
Chris@82 72 Tb = FNMS(T5, T3, Ta);
Chris@82 73 Tu = FMA(Th, T3, Tt);
Chris@82 74 TP = FMA(T5, TO, TN);
Chris@82 75 TI = FMA(T5, T3, Ta);
Chris@82 76 TF = FNMS(T5, T6, T4);
Chris@82 77 {
Chris@82 78 E T1y, T1C, T1e, T1i;
Chris@82 79 T1y = Tz * T3;
Chris@82 80 T1C = Tz * T6;
Chris@82 81 TC = FNMS(T5, Tf, Tl);
Chris@82 82 T1z = FMA(TC, T6, T1y);
Chris@82 83 T1O = FMA(TC, T3, T1C);
Chris@82 84 T1D = FNMS(TC, T3, T1C);
Chris@82 85 T1L = FNMS(TC, T6, T1y);
Chris@82 86 T1e = Ti * T3;
Chris@82 87 T1i = Ti * T6;
Chris@82 88 Tm = FMA(T5, Tf, Tl);
Chris@82 89 T1f = FMA(Tm, T6, T1e);
Chris@82 90 T1p = FMA(Tm, T3, T1i);
Chris@82 91 T1j = FNMS(Tm, T3, T1i);
Chris@82 92 T1m = FNMS(Tm, T6, T1e);
Chris@82 93 }
Chris@82 94 }
Chris@82 95 {
Chris@82 96 E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
Chris@82 97 E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
Chris@82 98 E T2d, T38;
/* Elements 0 and 8: element 0 is used as loaded, element 8 is scaled by (T7, Tb). */
Chris@82 99 {
Chris@82 100 E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
Chris@82 101 T1 = ri[0];
Chris@82 102 T3z = ii[0];
Chris@82 103 T8 = ri[WS(rs, 8)];
Chris@82 104 T9 = T7 * T8;
Chris@82 105 Tc = ii[WS(rs, 8)];
Chris@82 106 T3x = T7 * Tc;
Chris@82 107 Td = FMA(Tb, Tc, T9);
Chris@82 108 Te = T1 + Td;
Chris@82 109 T1U = T1 - Td;
Chris@82 110 T3y = FNMS(Tb, T8, T3x);
Chris@82 111 T3A = T3y + T3z;
Chris@82 112 T3L = T3z - T3y;
Chris@82 113 }
/* Elements 15 and 7. */
Chris@82 114 {
Chris@82 115 E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
Chris@82 116 T1u = ri[WS(rs, 15)];
Chris@82 117 T1v = TM * T1u;
Chris@82 118 T1w = ii[WS(rs, 15)];
Chris@82 119 T2w = TM * T1w;
Chris@82 120 T1A = ri[WS(rs, 7)];
Chris@82 121 T1B = T1z * T1A;
Chris@82 122 T1E = ii[WS(rs, 7)];
Chris@82 123 T2y = T1z * T1E;
Chris@82 124 {
Chris@82 125 E T1x, T1F, T2x, T2z;
Chris@82 126 T1x = FMA(TO, T1w, T1v);
Chris@82 127 T1F = FMA(T1D, T1E, T1B);
Chris@82 128 T1G = T1x + T1F;
Chris@82 129 T2D = T1x - T1F;
Chris@82 130 T2x = FNMS(TO, T1u, T2w);
Chris@82 131 T2z = FNMS(T1D, T1A, T2y);
Chris@82 132 T2A = T2x - T2z;
Chris@82 133 T3h = T2x + T2z;
Chris@82 134 }
Chris@82 135 }
/* Elements 3 and 11. */
Chris@82 136 {
Chris@82 137 E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
Chris@82 138 T1H = ri[WS(rs, 3)];
Chris@82 139 T1I = Tf * T1H;
Chris@82 140 T1J = ii[WS(rs, 3)];
Chris@82 141 T2E = Tf * T1J;
Chris@82 142 T1M = ri[WS(rs, 11)];
Chris@82 143 T1N = T1L * T1M;
Chris@82 144 T1P = ii[WS(rs, 11)];
Chris@82 145 T2G = T1L * T1P;
Chris@82 146 {
Chris@82 147 E T1K, T1Q, T2F, T2H;
Chris@82 148 T1K = FMA(Th, T1J, T1I);
Chris@82 149 T1Q = FMA(T1O, T1P, T1N);
Chris@82 150 T1R = T1K + T1Q;
Chris@82 151 T2B = T1K - T1Q;
Chris@82 152 T2F = FNMS(Th, T1H, T2E);
Chris@82 153 T2H = FNMS(T1O, T1M, T2G);
Chris@82 154 T2I = T2F - T2H;
Chris@82 155 T3i = T2F + T2H;
Chris@82 156 }
Chris@82 157 }
/* Elements 4 and 12. */
Chris@82 158 {
Chris@82 159 E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Chris@82 160 Tj = ri[WS(rs, 4)];
Chris@82 161 Tk = Ti * Tj;
Chris@82 162 Tn = ii[WS(rs, 4)];
Chris@82 163 T1V = Ti * Tn;
Chris@82 164 Tr = ri[WS(rs, 12)];
Chris@82 165 Ts = Tq * Tr;
Chris@82 166 Tv = ii[WS(rs, 12)];
Chris@82 167 T1X = Tq * Tv;
Chris@82 168 {
Chris@82 169 E To, Tw, T1W, T1Y;
Chris@82 170 To = FMA(Tm, Tn, Tk);
Chris@82 171 Tw = FMA(Tu, Tv, Ts);
Chris@82 172 Tx = To + Tw;
Chris@82 173 T3M = To - Tw;
Chris@82 174 T1W = FNMS(Tm, Tj, T1V);
Chris@82 175 T1Y = FNMS(Tu, Tr, T1X);
Chris@82 176 T1Z = T1W - T1Y;
Chris@82 177 T3w = T1W + T1Y;
Chris@82 178 }
Chris@82 179 }
/* Elements 2 and 10. */
Chris@82 180 {
Chris@82 181 E TA, TB, TD, T21, TG, TH, TJ, T23;
Chris@82 182 TA = ri[WS(rs, 2)];
Chris@82 183 TB = Tz * TA;
Chris@82 184 TD = ii[WS(rs, 2)];
Chris@82 185 T21 = Tz * TD;
Chris@82 186 TG = ri[WS(rs, 10)];
Chris@82 187 TH = TF * TG;
Chris@82 188 TJ = ii[WS(rs, 10)];
Chris@82 189 T23 = TF * TJ;
Chris@82 190 {
Chris@82 191 E TE, TK, T22, T24;
Chris@82 192 TE = FMA(TC, TD, TB);
Chris@82 193 TK = FMA(TI, TJ, TH);
Chris@82 194 TL = TE + TK;
Chris@82 195 T26 = TE - TK;
Chris@82 196 T22 = FNMS(TC, TA, T21);
Chris@82 197 T24 = FNMS(TI, TG, T23);
Chris@82 198 T25 = T22 - T24;
Chris@82 199 T37 = T22 + T24;
Chris@82 200 }
Chris@82 201 }
/* Elements 1 and 9. */
Chris@82 202 {
Chris@82 203 E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
Chris@82 204 T15 = ri[WS(rs, 1)];
Chris@82 205 T16 = T2 * T15;
Chris@82 206 T17 = ii[WS(rs, 1)];
Chris@82 207 T2h = T2 * T17;
Chris@82 208 T19 = ri[WS(rs, 9)];
Chris@82 209 T1a = T3 * T19;
Chris@82 210 T1b = ii[WS(rs, 9)];
Chris@82 211 T2j = T3 * T1b;
Chris@82 212 {
Chris@82 213 E T18, T1c, T2i, T2k;
Chris@82 214 T18 = FMA(T5, T17, T16);
Chris@82 215 T1c = FMA(T6, T1b, T1a);
Chris@82 216 T1d = T18 + T1c;
Chris@82 217 T2o = T18 - T1c;
Chris@82 218 T2i = FNMS(T5, T15, T2h);
Chris@82 219 T2k = FNMS(T6, T19, T2j);
Chris@82 220 T2l = T2i - T2k;
Chris@82 221 T3c = T2i + T2k;
Chris@82 222 }
Chris@82 223 }
/* Elements 5 and 13. */
Chris@82 224 {
Chris@82 225 E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
Chris@82 226 T1g = ri[WS(rs, 5)];
Chris@82 227 T1h = T1f * T1g;
Chris@82 228 T1k = ii[WS(rs, 5)];
Chris@82 229 T2p = T1f * T1k;
Chris@82 230 T1n = ri[WS(rs, 13)];
Chris@82 231 T1o = T1m * T1n;
Chris@82 232 T1q = ii[WS(rs, 13)];
Chris@82 233 T2r = T1m * T1q;
Chris@82 234 {
Chris@82 235 E T1l, T1r, T2q, T2s;
Chris@82 236 T1l = FMA(T1j, T1k, T1h);
Chris@82 237 T1r = FMA(T1p, T1q, T1o);
Chris@82 238 T1s = T1l + T1r;
Chris@82 239 T2m = T1l - T1r;
Chris@82 240 T2q = FNMS(T1j, T1g, T2p);
Chris@82 241 T2s = FNMS(T1p, T1n, T2r);
Chris@82 242 T2t = T2q - T2s;
Chris@82 243 T3d = T2q + T2s;
Chris@82 244 }
Chris@82 245 }
/* Elements 14 and 6. */
Chris@82 246 {
Chris@82 247 E TQ, TR, TU, T29, TX, TY, T10, T2b;
Chris@82 248 TQ = ri[WS(rs, 14)];
Chris@82 249 TR = TP * TQ;
Chris@82 250 TU = ii[WS(rs, 14)];
Chris@82 251 T29 = TP * TU;
Chris@82 252 TX = ri[WS(rs, 6)];
Chris@82 253 TY = TW * TX;
Chris@82 254 T10 = ii[WS(rs, 6)];
Chris@82 255 T2b = TW * T10;
Chris@82 256 {
Chris@82 257 E TV, T11, T2a, T2c;
Chris@82 258 TV = FMA(TT, TU, TR);
Chris@82 259 T11 = FMA(TZ, T10, TY);
Chris@82 260 T12 = TV + T11;
Chris@82 261 T28 = TV - T11;
Chris@82 262 T2a = FNMS(TT, TQ, T29);
Chris@82 263 T2c = FNMS(TZ, TX, T2b);
Chris@82 264 T2d = T2a - T2c;
Chris@82 265 T38 = T2a + T2c;
Chris@82 266 }
Chris@82 267 }
/* Outputs 0, 4, 8 and 12: formed from sums and differences only. */
Chris@82 268 {
Chris@82 269 E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
Chris@82 270 {
Chris@82 271 E Ty, T13, T3v, T3B;
Chris@82 272 Ty = Te + Tx;
Chris@82 273 T13 = TL + T12;
Chris@82 274 T14 = Ty + T13;
Chris@82 275 T3q = Ty - T13;
Chris@82 276 T3v = T37 + T38;
Chris@82 277 T3B = T3w + T3A;
Chris@82 278 T3C = T3v + T3B;
Chris@82 279 T3E = T3B - T3v;
Chris@82 280 }
Chris@82 281 {
Chris@82 282 E T1t, T1S, T3r, T3s;
Chris@82 283 T1t = T1d + T1s;
Chris@82 284 T1S = T1G + T1R;
Chris@82 285 T1T = T1t + T1S;
Chris@82 286 T3D = T1S - T1t;
Chris@82 287 T3r = T3c + T3d;
Chris@82 288 T3s = T3h + T3i;
Chris@82 289 T3t = T3r - T3s;
Chris@82 290 T3u = T3r + T3s;
Chris@82 291 }
Chris@82 292 ri[WS(rs, 8)] = T14 - T1T;
Chris@82 293 ii[WS(rs, 8)] = T3C - T3u;
Chris@82 294 ri[0] = T14 + T1T;
Chris@82 295 ii[0] = T3u + T3C;
Chris@82 296 ri[WS(rs, 12)] = T3q - T3t;
Chris@82 297 ii[WS(rs, 12)] = T3E - T3D;
Chris@82 298 ri[WS(rs, 4)] = T3q + T3t;
Chris@82 299 ii[WS(rs, 4)] = T3D + T3E;
Chris@82 300 }
/* Outputs 2, 6, 10 and 14: additionally scaled by KP707106781. */
Chris@82 301 {
Chris@82 302 E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
Chris@82 303 {
Chris@82 304 E T36, T39, T3F, T3G;
Chris@82 305 T36 = Te - Tx;
Chris@82 306 T39 = T37 - T38;
Chris@82 307 T3a = T36 + T39;
Chris@82 308 T3m = T36 - T39;
Chris@82 309 T3F = T12 - TL;
Chris@82 310 T3G = T3A - T3w;
Chris@82 311 T3H = T3F + T3G;
Chris@82 312 T3J = T3G - T3F;
Chris@82 313 }
Chris@82 314 {
Chris@82 315 E T3b, T3e, T3g, T3j;
Chris@82 316 T3b = T1d - T1s;
Chris@82 317 T3e = T3c - T3d;
Chris@82 318 T3f = T3b + T3e;
Chris@82 319 T3n = T3e - T3b;
Chris@82 320 T3g = T1G - T1R;
Chris@82 321 T3j = T3h - T3i;
Chris@82 322 T3k = T3g - T3j;
Chris@82 323 T3o = T3g + T3j;
Chris@82 324 }
Chris@82 325 {
Chris@82 326 E T3l, T3I, T3p, T3K;
Chris@82 327 T3l = T3f + T3k;
Chris@82 328 ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
Chris@82 329 ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
Chris@82 330 T3I = T3n + T3o;
Chris@82 331 ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
Chris@82 332 ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
Chris@82 333 T3p = T3n - T3o;
Chris@82 334 ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
Chris@82 335 ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
Chris@82 336 T3K = T3k - T3f;
Chris@82 337 ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
Chris@82 338 ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
Chris@82 339 }
Chris@82 340 }
/* Odd outputs 1, 3, ..., 15: use all three constants. */
Chris@82 341 {
Chris@82 342 E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
Chris@82 343 E T2O;
Chris@82 344 {
Chris@82 345 E T27, T2e, T2n, T2u;
Chris@82 346 T20 = T1U - T1Z;
Chris@82 347 T3N = T3L - T3M;
Chris@82 348 T3T = T3M + T3L;
Chris@82 349 T2Q = T1U + T1Z;
Chris@82 350 T27 = T25 - T26;
Chris@82 351 T2e = T28 + T2d;
Chris@82 352 T2f = T27 - T2e;
Chris@82 353 T3O = T27 + T2e;
Chris@82 354 {
Chris@82 355 E T2Y, T2Z, T2R, T2S;
Chris@82 356 T2Y = T2D + T2I;
Chris@82 357 T2Z = T2A - T2B;
Chris@82 358 T30 = FNMS(KP414213562, T2Z, T2Y);
Chris@82 359 T34 = FMA(KP414213562, T2Y, T2Z);
Chris@82 360 T2R = T26 + T25;
Chris@82 361 T2S = T28 - T2d;
Chris@82 362 T2T = T2R + T2S;
Chris@82 363 T3U = T2S - T2R;
Chris@82 364 }
Chris@82 365 T2n = T2l + T2m;
Chris@82 366 T2u = T2o - T2t;
Chris@82 367 T2v = FMA(KP414213562, T2u, T2n);
Chris@82 368 T2N = FNMS(KP414213562, T2n, T2u);
Chris@82 369 {
Chris@82 370 E T2V, T2W, T2C, T2J;
Chris@82 371 T2V = T2o + T2t;
Chris@82 372 T2W = T2l - T2m;
Chris@82 373 T2X = FMA(KP414213562, T2W, T2V);
Chris@82 374 T33 = FNMS(KP414213562, T2V, T2W);
Chris@82 375 T2C = T2A + T2B;
Chris@82 376 T2J = T2D - T2I;
Chris@82 377 T2K = FNMS(KP414213562, T2J, T2C);
Chris@82 378 T2O = FMA(KP414213562, T2C, T2J);
Chris@82 379 }
Chris@82 380 }
Chris@82 381 {
Chris@82 382 E T2g, T2L, T3V, T3W;
Chris@82 383 T2g = FMA(KP707106781, T2f, T20);
Chris@82 384 T2L = T2v - T2K;
Chris@82 385 ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
Chris@82 386 ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
Chris@82 387 T3V = FMA(KP707106781, T3U, T3T);
Chris@82 388 T3W = T2O - T2N;
Chris@82 389 ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
Chris@82 390 ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
Chris@82 391 }
Chris@82 392 {
Chris@82 393 E T2M, T2P, T3X, T3Y;
Chris@82 394 T2M = FNMS(KP707106781, T2f, T20);
Chris@82 395 T2P = T2N + T2O;
Chris@82 396 ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
Chris@82 397 ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
Chris@82 398 T3X = FNMS(KP707106781, T3U, T3T);
Chris@82 399 T3Y = T2v + T2K;
Chris@82 400 ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
Chris@82 401 ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
Chris@82 402 }
Chris@82 403 {
Chris@82 404 E T2U, T31, T3P, T3Q;
Chris@82 405 T2U = FMA(KP707106781, T2T, T2Q);
Chris@82 406 T31 = T2X + T30;
Chris@82 407 ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
Chris@82 408 ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
Chris@82 409 T3P = FMA(KP707106781, T3O, T3N);
Chris@82 410 T3Q = T33 + T34;
Chris@82 411 ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
Chris@82 412 ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
Chris@82 413 }
Chris@82 414 {
Chris@82 415 E T32, T35, T3R, T3S;
Chris@82 416 T32 = FNMS(KP707106781, T2T, T2Q);
Chris@82 417 T35 = T33 - T34;
Chris@82 418 ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
Chris@82 419 ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
Chris@82 420 T3R = FNMS(KP707106781, T3O, T3N);
Chris@82 421 T3S = T30 - T2X;
Chris@82 422 ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
Chris@82 423 ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
Chris@82 424 }
Chris@82 425 }
Chris@82 426 }
Chris@82 427 }
Chris@82 428 }
Chris@82 429 }
Chris@82 430
/*
 * Table of tw_instr entries referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 15, closed by a TW_NEXT entry.
 * NOTE(review): presumably these four entries describe the four complex
 * factors (eight reals) that t2_16 reads as W[0..7] on each iteration --
 * confirm against the tw_instr definition.
 */
Chris@82 431 static const tw_instr twinstr[] = {
Chris@82 432 {TW_CEXP, 0, 1},
Chris@82 433 {TW_CEXP, 0, 3},
Chris@82 434 {TW_CEXP, 0, 9},
Chris@82 435 {TW_CEXP, 0, 15},
Chris@82 436 {TW_NEXT, 1, 0}
Chris@82 437 };
Chris@82 438
/*
 * Descriptor passed to the registration call below: size 16, the name
 * "t2_16", the twinstr table and &GENUS. The triple {104, 42, 92} repeats
 * the addition / multiplication / fused multiply-add counts quoted in the
 * generator comment for this variant.
 * NOTE(review): meaning of the remaining zero fields is not visible here.
 */
Chris@82 439 static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {104, 42, 92, 0}, 0, 0, 0 };
Chris@82 440
/* Hands t2_16 and its descriptor desc to planner p via X(kdft_dit_register). */
Chris@82 441 void X(codelet_t2_16) (planner *p) {
Chris@82 442 X(kdft_dit_register) (p, t2_16, &desc);
Chris@82 443 }
Chris@82 444 #else
Chris@82 445
Chris@82 446 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
Chris@82 447
Chris@82 448 /*
Chris@82 449 * This function contains 196 FP additions, 108 FP multiplications,
Chris@82 450 * (or, 156 additions, 68 multiplications, 40 fused multiply/add),
Chris@82 451 * 82 stack variables, 3 constants, and 64 memory accesses
Chris@82 452 */
Chris@82 453 #include "dft/scalar/t.h"
Chris@82 454
/*
 * t2_16, variant selected when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For every m in [mb, me) the loop body works in place on the 16 values
 * ri[WS(rs, k)] and the 16 values ii[WS(rs, k)], k = 0..15:
 *   - eight values W[0..7] are read and further multipliers are formed
 *     from sums and differences of their products;
 *   - element 0 is used as loaded, every other element is combined with
 *     one multiplier pair;
 *   - the results are merged through additions/subtractions and the three
 *     constants declared below, then stored back to all 16 positions of
 *     ri and ii.
 * After each iteration ri and ii advance by ms and W advances by 8; W is
 * offset by mb * 8 before the first iteration.
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b as provided by the included headers -- confirm.
 * NOTE(review): presumably ri/ii are the real/imaginary parts and W holds
 * the twiddle factors of a size-16 decimation-in-time step (the function
 * is registered through kdft_dit_register) -- confirm against the callers.
 */
Chris@82 455 static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 456 {
/* Numerically: sin(pi/8), cos(pi/8), and 1/sqrt(2). */
Chris@82 457 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 458 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 459 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 460 {
Chris@82 461 INT m;
Chris@82 462 for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@82 463 E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
Chris@82 464 E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
/* Read W[0..7] and derive the remaining multiplier pairs from their products. */
Chris@82 465 {
Chris@82 466 E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
Chris@82 467 {
Chris@82 468 E Th, Tn, Tj, Tm;
Chris@82 469 T2 = W[0];
Chris@82 470 T5 = W[1];
Chris@82 471 Tg = W[2];
Chris@82 472 Ti = W[3];
Chris@82 473 Th = T2 * Tg;
Chris@82 474 Tn = T5 * Tg;
Chris@82 475 Tj = T5 * Ti;
Chris@82 476 Tm = T2 * Ti;
Chris@82 477 Tk = Th - Tj;
Chris@82 478 To = Tm + Tn;
Chris@82 479 TE = Tm - Tn;
Chris@82 480 TC = Th + Tj;
Chris@82 481 T6 = W[5];
Chris@82 482 T7 = T5 * T6;
Chris@82 483 Tv = Tg * T6;
Chris@82 484 Ta = T2 * T6;
Chris@82 485 Ts = Ti * T6;
Chris@82 486 T3 = W[4];
Chris@82 487 T4 = T2 * T3;
Chris@82 488 Tw = Ti * T3;
Chris@82 489 Tb = T5 * T3;
Chris@82 490 Tr = Tg * T3;
Chris@82 491 }
Chris@82 492 T8 = T4 + T7;
Chris@82 493 TW = Tv - Tw;
Chris@82 494 TJ = Ta + Tb;
Chris@82 495 Tt = Tr - Ts;
Chris@82 496 TU = Tr + Ts;
Chris@82 497 Tc = Ta - Tb;
Chris@82 498 Tx = Tv + Tw;
Chris@82 499 TH = T4 - T7;
Chris@82 500 TN = W[6];
Chris@82 501 TO = W[7];
Chris@82 502 TP = FMA(T2, TN, T5 * TO);
Chris@82 503 TR = FNMS(T5, TN, T2 * TO);
Chris@82 504 {
Chris@82 505 E T1d, T1e, T19, T1a;
Chris@82 506 T1d = Tk * T6;
Chris@82 507 T1e = To * T3;
Chris@82 508 T1f = T1d - T1e;
Chris@82 509 T1k = T1d + T1e;
Chris@82 510 T19 = Tk * T3;
Chris@82 511 T1a = To * T6;
Chris@82 512 T1b = T19 + T1a;
Chris@82 513 T1i = T19 - T1a;
Chris@82 514 }
Chris@82 515 {
Chris@82 516 E T1w, T1x, T1s, T1t;
Chris@82 517 T1w = TC * T6;
Chris@82 518 T1x = TE * T3;
Chris@82 519 T1y = T1w - T1x;
Chris@82 520 T1H = T1w + T1x;
Chris@82 521 T1s = TC * T3;
Chris@82 522 T1t = TE * T6;
Chris@82 523 T1u = T1s + T1t;
Chris@82 524 T1F = T1s - T1t;
Chris@82 525 }
Chris@82 526 }
Chris@82 527 {
Chris@82 528 E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
Chris@82 529 E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
Chris@82 530 E T2S, T2T, T28, T2A, T2d, T2B;
/* Elements 0 and 8: element 0 is used as loaded, element 8 is scaled by (T8, Tc). */
Chris@82 531 {
Chris@82 532 E T1, T3d, Te, T3c, T9, Td;
Chris@82 533 T1 = ri[0];
Chris@82 534 T3d = ii[0];
Chris@82 535 T9 = ri[WS(rs, 8)];
Chris@82 536 Td = ii[WS(rs, 8)];
Chris@82 537 Te = FMA(T8, T9, Tc * Td);
Chris@82 538 T3c = FNMS(Tc, T9, T8 * Td);
Chris@82 539 Tf = T1 + Te;
Chris@82 540 T3r = T3d - T3c;
Chris@82 541 T1N = T1 - Te;
Chris@82 542 T3e = T3c + T3d;
Chris@82 543 }
/* Elements 4 and 12. */
Chris@82 544 {
Chris@82 545 E Tq, T1O, Tz, T1P;
Chris@82 546 {
Chris@82 547 E Tl, Tp, Tu, Ty;
Chris@82 548 Tl = ri[WS(rs, 4)];
Chris@82 549 Tp = ii[WS(rs, 4)];
Chris@82 550 Tq = FMA(Tk, Tl, To * Tp);
Chris@82 551 T1O = FNMS(To, Tl, Tk * Tp);
Chris@82 552 Tu = ri[WS(rs, 12)];
Chris@82 553 Ty = ii[WS(rs, 12)];
Chris@82 554 Tz = FMA(Tt, Tu, Tx * Ty);
Chris@82 555 T1P = FNMS(Tx, Tu, Tt * Ty);
Chris@82 556 }
Chris@82 557 TA = Tq + Tz;
Chris@82 558 T3s = Tq - Tz;
Chris@82 559 T1Q = T1O - T1P;
Chris@82 560 T3b = T1O + T1P;
Chris@82 561 }
/* Elements 2 and 10. */
Chris@82 562 {
Chris@82 563 E TG, T1S, TL, T1T, T1U, T1V;
Chris@82 564 {
Chris@82 565 E TD, TF, TI, TK;
Chris@82 566 TD = ri[WS(rs, 2)];
Chris@82 567 TF = ii[WS(rs, 2)];
Chris@82 568 TG = FMA(TC, TD, TE * TF);
Chris@82 569 T1S = FNMS(TE, TD, TC * TF);
Chris@82 570 TI = ri[WS(rs, 10)];
Chris@82 571 TK = ii[WS(rs, 10)];
Chris@82 572 TL = FMA(TH, TI, TJ * TK);
Chris@82 573 T1T = FNMS(TJ, TI, TH * TK);
Chris@82 574 }
Chris@82 575 TM = TG + TL;
Chris@82 576 T2M = T1S + T1T;
Chris@82 577 T1U = T1S - T1T;
Chris@82 578 T1V = TG - TL;
Chris@82 579 T1W = T1U - T1V;
Chris@82 580 T2w = T1V + T1U;
Chris@82 581 }
/* Elements 14 and 6. */
Chris@82 582 {
Chris@82 583 E TT, T1Y, TY, T1Z, T1X, T20;
Chris@82 584 {
Chris@82 585 E TQ, TS, TV, TX;
Chris@82 586 TQ = ri[WS(rs, 14)];
Chris@82 587 TS = ii[WS(rs, 14)];
Chris@82 588 TT = FMA(TP, TQ, TR * TS);
Chris@82 589 T1Y = FNMS(TR, TQ, TP * TS);
Chris@82 590 TV = ri[WS(rs, 6)];
Chris@82 591 TX = ii[WS(rs, 6)];
Chris@82 592 TY = FMA(TU, TV, TW * TX);
Chris@82 593 T1Z = FNMS(TW, TV, TU * TX);
Chris@82 594 }
Chris@82 595 TZ = TT + TY;
Chris@82 596 T2N = T1Y + T1Z;
Chris@82 597 T1X = TT - TY;
Chris@82 598 T20 = T1Y - T1Z;
Chris@82 599 T21 = T1X + T20;
Chris@82 600 T2x = T1X - T20;
Chris@82 601 }
/* Elements 15, 11, 7 and 3. */
Chris@82 602 {
Chris@82 603 E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
Chris@82 604 {
Chris@82 605 E T1p, T1q, T1G, T1I;
Chris@82 606 T1p = ri[WS(rs, 15)];
Chris@82 607 T1q = ii[WS(rs, 15)];
Chris@82 608 T1r = FMA(TN, T1p, TO * T1q);
Chris@82 609 T2k = FNMS(TO, T1p, TN * T1q);
Chris@82 610 T1G = ri[WS(rs, 11)];
Chris@82 611 T1I = ii[WS(rs, 11)];
Chris@82 612 T1J = FMA(T1F, T1G, T1H * T1I);
Chris@82 613 T2h = FNMS(T1H, T1G, T1F * T1I);
Chris@82 614 }
Chris@82 615 {
Chris@82 616 E T1v, T1z, T1C, T1D;
Chris@82 617 T1v = ri[WS(rs, 7)];
Chris@82 618 T1z = ii[WS(rs, 7)];
Chris@82 619 T1A = FMA(T1u, T1v, T1y * T1z);
Chris@82 620 T2l = FNMS(T1y, T1v, T1u * T1z);
Chris@82 621 T1C = ri[WS(rs, 3)];
Chris@82 622 T1D = ii[WS(rs, 3)];
Chris@82 623 T1E = FMA(Tg, T1C, Ti * T1D);
Chris@82 624 T2g = FNMS(Ti, T1C, Tg * T1D);
Chris@82 625 }
Chris@82 626 T1B = T1r + T1A;
Chris@82 627 T1K = T1E + T1J;
Chris@82 628 T2V = T1B - T1K;
Chris@82 629 T2W = T2k + T2l;
Chris@82 630 T2X = T2g + T2h;
Chris@82 631 T2Y = T2W - T2X;
Chris@82 632 {
Chris@82 633 E T2f, T2i, T2m, T2n;
Chris@82 634 T2f = T1r - T1A;
Chris@82 635 T2i = T2g - T2h;
Chris@82 636 T2j = T2f - T2i;
Chris@82 637 T2D = T2f + T2i;
Chris@82 638 T2m = T2k - T2l;
Chris@82 639 T2n = T1E - T1J;
Chris@82 640 T2o = T2m + T2n;
Chris@82 641 T2E = T2m - T2n;
Chris@82 642 }
Chris@82 643 }
/* Elements 1, 13, 9 and 5. */
Chris@82 644 {
Chris@82 645 E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
Chris@82 646 {
Chris@82 647 E T12, T13, T1j, T1l;
Chris@82 648 T12 = ri[WS(rs, 1)];
Chris@82 649 T13 = ii[WS(rs, 1)];
Chris@82 650 T14 = FMA(T2, T12, T5 * T13);
Chris@82 651 T24 = FNMS(T5, T12, T2 * T13);
Chris@82 652 T1j = ri[WS(rs, 13)];
Chris@82 653 T1l = ii[WS(rs, 13)];
Chris@82 654 T1m = FMA(T1i, T1j, T1k * T1l);
Chris@82 655 T2b = FNMS(T1k, T1j, T1i * T1l);
Chris@82 656 }
Chris@82 657 {
Chris@82 658 E T15, T16, T1c, T1g;
Chris@82 659 T15 = ri[WS(rs, 9)];
Chris@82 660 T16 = ii[WS(rs, 9)];
Chris@82 661 T17 = FMA(T3, T15, T6 * T16);
Chris@82 662 T25 = FNMS(T6, T15, T3 * T16);
Chris@82 663 T1c = ri[WS(rs, 5)];
Chris@82 664 T1g = ii[WS(rs, 5)];
Chris@82 665 T1h = FMA(T1b, T1c, T1f * T1g);
Chris@82 666 T2a = FNMS(T1f, T1c, T1b * T1g);
Chris@82 667 }
Chris@82 668 T18 = T14 + T17;
Chris@82 669 T1n = T1h + T1m;
Chris@82 670 T2Q = T18 - T1n;
Chris@82 671 T2R = T24 + T25;
Chris@82 672 T2S = T2a + T2b;
Chris@82 673 T2T = T2R - T2S;
Chris@82 674 {
Chris@82 675 E T26, T27, T29, T2c;
Chris@82 676 T26 = T24 - T25;
Chris@82 677 T27 = T1h - T1m;
Chris@82 678 T28 = T26 + T27;
Chris@82 679 T2A = T26 - T27;
Chris@82 680 T29 = T14 - T17;
Chris@82 681 T2c = T2a - T2b;
Chris@82 682 T2d = T29 - T2c;
Chris@82 683 T2B = T29 + T2c;
Chris@82 684 }
Chris@82 685 }
/* Outputs 3, 7, 11 and 15. */
Chris@82 686 {
Chris@82 687 E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
Chris@82 688 {
Chris@82 689 E T1R, T22, T3y, T3z;
Chris@82 690 T1R = T1N - T1Q;
Chris@82 691 T22 = KP707106781 * (T1W - T21);
Chris@82 692 T23 = T1R + T22;
Chris@82 693 T2r = T1R - T22;
Chris@82 694 T3y = KP707106781 * (T2x - T2w);
Chris@82 695 T3z = T3s + T3r;
Chris@82 696 T3A = T3y + T3z;
Chris@82 697 T3C = T3z - T3y;
Chris@82 698 }
Chris@82 699 {
Chris@82 700 E T2e, T2p, T2s, T2t;
Chris@82 701 T2e = FMA(KP923879532, T28, KP382683432 * T2d);
Chris@82 702 T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
Chris@82 703 T2q = T2e + T2p;
Chris@82 704 T3B = T2p - T2e;
Chris@82 705 T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
Chris@82 706 T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
Chris@82 707 T2u = T2s - T2t;
Chris@82 708 T3x = T2s + T2t;
Chris@82 709 }
Chris@82 710 ri[WS(rs, 11)] = T23 - T2q;
Chris@82 711 ii[WS(rs, 11)] = T3A - T3x;
Chris@82 712 ri[WS(rs, 3)] = T23 + T2q;
Chris@82 713 ii[WS(rs, 3)] = T3x + T3A;
Chris@82 714 ri[WS(rs, 15)] = T2r - T2u;
Chris@82 715 ii[WS(rs, 15)] = T3C - T3B;
Chris@82 716 ri[WS(rs, 7)] = T2r + T2u;
Chris@82 717 ii[WS(rs, 7)] = T3B + T3C;
Chris@82 718 }
/* Outputs 2, 6, 10 and 14. */
Chris@82 719 {
Chris@82 720 E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
Chris@82 721 {
Chris@82 722 E T2L, T2O, T3k, T3l;
Chris@82 723 T2L = Tf - TA;
Chris@82 724 T2O = T2M - T2N;
Chris@82 725 T2P = T2L + T2O;
Chris@82 726 T31 = T2L - T2O;
Chris@82 727 T3k = TZ - TM;
Chris@82 728 T3l = T3e - T3b;
Chris@82 729 T3m = T3k + T3l;
Chris@82 730 T3o = T3l - T3k;
Chris@82 731 }
Chris@82 732 {
Chris@82 733 E T2U, T2Z, T32, T33;
Chris@82 734 T2U = T2Q + T2T;
Chris@82 735 T2Z = T2V - T2Y;
Chris@82 736 T30 = KP707106781 * (T2U + T2Z);
Chris@82 737 T3n = KP707106781 * (T2Z - T2U);
Chris@82 738 T32 = T2T - T2Q;
Chris@82 739 T33 = T2V + T2Y;
Chris@82 740 T34 = KP707106781 * (T32 - T33);
Chris@82 741 T3j = KP707106781 * (T32 + T33);
Chris@82 742 }
Chris@82 743 ri[WS(rs, 10)] = T2P - T30;
Chris@82 744 ii[WS(rs, 10)] = T3m - T3j;
Chris@82 745 ri[WS(rs, 2)] = T2P + T30;
Chris@82 746 ii[WS(rs, 2)] = T3j + T3m;
Chris@82 747 ri[WS(rs, 14)] = T31 - T34;
Chris@82 748 ii[WS(rs, 14)] = T3o - T3n;
Chris@82 749 ri[WS(rs, 6)] = T31 + T34;
Chris@82 750 ii[WS(rs, 6)] = T3n + T3o;
Chris@82 751 }
/* Outputs 1, 5, 9 and 13. */
Chris@82 752 {
Chris@82 753 E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
Chris@82 754 {
Chris@82 755 E T2v, T2y, T3q, T3t;
Chris@82 756 T2v = T1N + T1Q;
Chris@82 757 T2y = KP707106781 * (T2w + T2x);
Chris@82 758 T2z = T2v + T2y;
Chris@82 759 T2H = T2v - T2y;
Chris@82 760 T3q = KP707106781 * (T1W + T21);
Chris@82 761 T3t = T3r - T3s;
Chris@82 762 T3u = T3q + T3t;
Chris@82 763 T3w = T3t - T3q;
Chris@82 764 }
Chris@82 765 {
Chris@82 766 E T2C, T2F, T2I, T2J;
Chris@82 767 T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
Chris@82 768 T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
Chris@82 769 T2G = T2C + T2F;
Chris@82 770 T3v = T2F - T2C;
Chris@82 771 T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
Chris@82 772 T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
Chris@82 773 T2K = T2I - T2J;
Chris@82 774 T3p = T2I + T2J;
Chris@82 775 }
Chris@82 776 ri[WS(rs, 9)] = T2z - T2G;
Chris@82 777 ii[WS(rs, 9)] = T3u - T3p;
Chris@82 778 ri[WS(rs, 1)] = T2z + T2G;
Chris@82 779 ii[WS(rs, 1)] = T3p + T3u;
Chris@82 780 ri[WS(rs, 13)] = T2H - T2K;
Chris@82 781 ii[WS(rs, 13)] = T3w - T3v;
Chris@82 782 ri[WS(rs, 5)] = T2H + T2K;
Chris@82 783 ii[WS(rs, 5)] = T3v + T3w;
Chris@82 784 }
/* Outputs 0, 4, 8 and 12: formed from sums and differences only. */
Chris@82 785 {
Chris@82 786 E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
Chris@82 787 {
Chris@82 788 E TB, T10, T3a, T3f;
Chris@82 789 TB = Tf + TA;
Chris@82 790 T10 = TM + TZ;
Chris@82 791 T11 = TB + T10;
Chris@82 792 T35 = TB - T10;
Chris@82 793 T3a = T2M + T2N;
Chris@82 794 T3f = T3b + T3e;
Chris@82 795 T3g = T3a + T3f;
Chris@82 796 T3i = T3f - T3a;
Chris@82 797 }
Chris@82 798 {
Chris@82 799 E T1o, T1L, T36, T37;
Chris@82 800 T1o = T18 + T1n;
Chris@82 801 T1L = T1B + T1K;
Chris@82 802 T1M = T1o + T1L;
Chris@82 803 T3h = T1L - T1o;
Chris@82 804 T36 = T2R + T2S;
Chris@82 805 T37 = T2W + T2X;
Chris@82 806 T38 = T36 - T37;
Chris@82 807 T39 = T36 + T37;
Chris@82 808 }
Chris@82 809 ri[WS(rs, 8)] = T11 - T1M;
Chris@82 810 ii[WS(rs, 8)] = T3g - T39;
Chris@82 811 ri[0] = T11 + T1M;
Chris@82 812 ii[0] = T39 + T3g;
Chris@82 813 ri[WS(rs, 12)] = T35 - T38;
Chris@82 814 ii[WS(rs, 12)] = T3i - T3h;
Chris@82 815 ri[WS(rs, 4)] = T35 + T38;
Chris@82 816 ii[WS(rs, 4)] = T3h + T3i;
Chris@82 817 }
Chris@82 818 }
Chris@82 819 }
Chris@82 820 }
Chris@82 821 }
Chris@82 822
/*
 * Table of tw_instr entries referenced by desc: four TW_CEXP entries whose
 * last field is 1, 3, 9 and 15, closed by a TW_NEXT entry.
 * NOTE(review): presumably these four entries describe the four complex
 * factors (eight reals) that t2_16 reads as W[0..7] on each iteration --
 * confirm against the tw_instr definition.
 */
Chris@82 823 static const tw_instr twinstr[] = {
Chris@82 824 {TW_CEXP, 0, 1},
Chris@82 825 {TW_CEXP, 0, 3},
Chris@82 826 {TW_CEXP, 0, 9},
Chris@82 827 {TW_CEXP, 0, 15},
Chris@82 828 {TW_NEXT, 1, 0}
Chris@82 829 };
Chris@82 830
/*
 * Descriptor passed to the registration call below: size 16, the name
 * "t2_16", the twinstr table and &GENUS. The triple {156, 68, 40} repeats
 * the addition / multiplication / fused multiply-add counts quoted in the
 * generator comment for this variant.
 * NOTE(review): meaning of the remaining zero fields is not visible here.
 */
Chris@82 831 static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, {156, 68, 40, 0}, 0, 0, 0 };
Chris@82 832
/* Hands t2_16 and its descriptor desc to planner p via X(kdft_dit_register). */
Chris@82 833 void X(codelet_t2_16) (planner *p) {
Chris@82 834 X(kdft_dit_register) (p, t2_16, &desc);
Chris@82 835 }
Chris@82 836 #endif