annotate src/fftw-3.3.8/dft/scalar/codelets/t1_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:14 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 184 FP additions, 140 FP multiplications,
Chris@82 32 * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
Chris@82 33 * 51 stack variables, 6 constants, and 60 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t1_15 (FMA variant): size-15 twiddle codelet emitted by genfft's
 * gen_twiddle with -fma (see the "Generated by" comment above).
 *
 * For every m in [mb, me) the loop body:
 *   - loads 15 value pairs from ri[]/ii[] at offset 0 and WS(rs, 1..14);
 *   - combines input k (k = 1..14) with the pair W[2k-2], W[2k-1];
 *     input 0 is used exactly as loaded;
 *   - combines the results in groups of three (constants 1/2, sqrt(3)/2)
 *     and then in groups of five (constants 1/4, sqrt(5)/4, ...);
 *   - stores 15 value pairs back to the same ri[]/ii[] offsets.
 *
 * Parameters:
 *   ri, ii - arrays that are read and then overwritten in place
 *            (presumably real and imaginary parts -- confirm with caller)
 *   W      - table of multipliers; W[0..27] are read on each iteration
 *   rs     - stride passed to WS() to address the 15 elements
 *   mb, me - half-open range of loop iterations
 *   ms     - step added to ri and ii after each iteration
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file. The remarks here assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm against the project headers.
 */
Chris@82 37 static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numerically: sin(2*pi/5), sqrt(5)/4, 1/4, (sqrt(5)-1)/2, sqrt(3)/2, 1/2. */
Chris@82 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@82 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 45 {
Chris@82 46 INT m;
/* W starts at W + 28 * mb and moves on by 28 per iteration; ri and ii move on by ms. */
Chris@82 47 for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
Chris@82 48 E T1, T3j, T1G, T3u, Te, T1B, T3i, T3t, T1y, T2i, T2a, T2M, T37, T2V, Tz;
Chris@82 49 E T2e, T1O, T2t, T39, T2X, TT, T2f, T1V, T2z, T3a, T2Y, T1e, T2h, T23, T2G;
Chris@82 50 E T36, T2U;
/* Inputs 0, 5 and 10 (input 0 has no W multiplier). */
Chris@82 51 {
Chris@82 52 E T7, T1D, Td, T1F;
Chris@82 53 T1 = ri[0];
Chris@82 54 T3j = ii[0];
Chris@82 55 {
Chris@82 56 E T3, T6, T4, T1C, T2, T5;
Chris@82 57 T3 = ri[WS(rs, 5)];
Chris@82 58 T6 = ii[WS(rs, 5)];
Chris@82 59 T2 = W[8];
Chris@82 60 T4 = T2 * T3;
Chris@82 61 T1C = T2 * T6;
Chris@82 62 T5 = W[9];
Chris@82 63 T7 = FMA(T5, T6, T4);
Chris@82 64 T1D = FNMS(T5, T3, T1C);
Chris@82 65 }
Chris@82 66 {
Chris@82 67 E T9, Tc, Ta, T1E, T8, Tb;
Chris@82 68 T9 = ri[WS(rs, 10)];
Chris@82 69 Tc = ii[WS(rs, 10)];
Chris@82 70 T8 = W[18];
Chris@82 71 Ta = T8 * T9;
Chris@82 72 T1E = T8 * Tc;
Chris@82 73 Tb = W[19];
Chris@82 74 Td = FMA(Tb, Tc, Ta);
Chris@82 75 T1F = FNMS(Tb, T9, T1E);
Chris@82 76 }
Chris@82 77 T1G = T1D - T1F;
Chris@82 78 T3u = Td - T7;
Chris@82 79 Te = T7 + Td;
Chris@82 80 T1B = FNMS(KP500000000, Te, T1);
Chris@82 81 T3i = T1D + T1F;
Chris@82 82 T3t = FNMS(KP500000000, T3i, T3j);
Chris@82 83 }
/* Inputs 9, 4 and 14. */
Chris@82 84 {
Chris@82 85 E T1k, T2I, T1w, T28, T1q, T26;
Chris@82 86 {
Chris@82 87 E T1g, T1j, T1h, T2H, T1f, T1i;
Chris@82 88 T1g = ri[WS(rs, 9)];
Chris@82 89 T1j = ii[WS(rs, 9)];
Chris@82 90 T1f = W[16];
Chris@82 91 T1h = T1f * T1g;
Chris@82 92 T2H = T1f * T1j;
Chris@82 93 T1i = W[17];
Chris@82 94 T1k = FMA(T1i, T1j, T1h);
Chris@82 95 T2I = FNMS(T1i, T1g, T2H);
Chris@82 96 }
Chris@82 97 {
Chris@82 98 E T1s, T1v, T1t, T27, T1r, T1u;
Chris@82 99 T1s = ri[WS(rs, 4)];
Chris@82 100 T1v = ii[WS(rs, 4)];
Chris@82 101 T1r = W[6];
Chris@82 102 T1t = T1r * T1s;
Chris@82 103 T27 = T1r * T1v;
Chris@82 104 T1u = W[7];
Chris@82 105 T1w = FMA(T1u, T1v, T1t);
Chris@82 106 T28 = FNMS(T1u, T1s, T27);
Chris@82 107 }
Chris@82 108 {
Chris@82 109 E T1m, T1p, T1n, T25, T1l, T1o;
Chris@82 110 T1m = ri[WS(rs, 14)];
Chris@82 111 T1p = ii[WS(rs, 14)];
Chris@82 112 T1l = W[26];
Chris@82 113 T1n = T1l * T1m;
Chris@82 114 T25 = T1l * T1p;
Chris@82 115 T1o = W[27];
Chris@82 116 T1q = FMA(T1o, T1p, T1n);
Chris@82 117 T26 = FNMS(T1o, T1m, T25);
Chris@82 118 }
Chris@82 119 {
Chris@82 120 E T29, T1x, T24, T2L, T2J, T2K;
Chris@82 121 T29 = T26 - T28;
Chris@82 122 T1x = T1q + T1w;
Chris@82 123 T24 = FNMS(KP500000000, T1x, T1k);
Chris@82 124 T1y = T1k + T1x;
Chris@82 125 T2i = FMA(KP866025403, T29, T24);
Chris@82 126 T2a = FNMS(KP866025403, T29, T24);
Chris@82 127 T2L = T1w - T1q;
Chris@82 128 T2J = T26 + T28;
Chris@82 129 T2K = FNMS(KP500000000, T2J, T2I);
Chris@82 130 T2M = FMA(KP866025403, T2L, T2K);
Chris@82 131 T37 = T2I + T2J;
Chris@82 132 T2V = FNMS(KP866025403, T2L, T2K);
Chris@82 133 }
Chris@82 134 }
/* Inputs 3, 13 and 8. */
Chris@82 135 {
Chris@82 136 E Tl, T2p, Tx, T1M, Tr, T1K;
Chris@82 137 {
Chris@82 138 E Th, Tk, Ti, T2o, Tg, Tj;
Chris@82 139 Th = ri[WS(rs, 3)];
Chris@82 140 Tk = ii[WS(rs, 3)];
Chris@82 141 Tg = W[4];
Chris@82 142 Ti = Tg * Th;
Chris@82 143 T2o = Tg * Tk;
Chris@82 144 Tj = W[5];
Chris@82 145 Tl = FMA(Tj, Tk, Ti);
Chris@82 146 T2p = FNMS(Tj, Th, T2o);
Chris@82 147 }
Chris@82 148 {
Chris@82 149 E Tt, Tw, Tu, T1L, Ts, Tv;
Chris@82 150 Tt = ri[WS(rs, 13)];
Chris@82 151 Tw = ii[WS(rs, 13)];
Chris@82 152 Ts = W[24];
Chris@82 153 Tu = Ts * Tt;
Chris@82 154 T1L = Ts * Tw;
Chris@82 155 Tv = W[25];
Chris@82 156 Tx = FMA(Tv, Tw, Tu);
Chris@82 157 T1M = FNMS(Tv, Tt, T1L);
Chris@82 158 }
Chris@82 159 {
Chris@82 160 E Tn, Tq, To, T1J, Tm, Tp;
Chris@82 161 Tn = ri[WS(rs, 8)];
Chris@82 162 Tq = ii[WS(rs, 8)];
Chris@82 163 Tm = W[14];
Chris@82 164 To = Tm * Tn;
Chris@82 165 T1J = Tm * Tq;
Chris@82 166 Tp = W[15];
Chris@82 167 Tr = FMA(Tp, Tq, To);
Chris@82 168 T1K = FNMS(Tp, Tn, T1J);
Chris@82 169 }
Chris@82 170 {
Chris@82 171 E T1N, Ty, T1I, T2s, T2q, T2r;
Chris@82 172 T1N = T1K - T1M;
Chris@82 173 Ty = Tr + Tx;
Chris@82 174 T1I = FNMS(KP500000000, Ty, Tl);
Chris@82 175 Tz = Tl + Ty;
Chris@82 176 T2e = FMA(KP866025403, T1N, T1I);
Chris@82 177 T1O = FNMS(KP866025403, T1N, T1I);
Chris@82 178 T2s = Tx - Tr;
Chris@82 179 T2q = T1K + T1M;
Chris@82 180 T2r = FNMS(KP500000000, T2q, T2p);
Chris@82 181 T2t = FMA(KP866025403, T2s, T2r);
Chris@82 182 T39 = T2p + T2q;
Chris@82 183 T2X = FNMS(KP866025403, T2s, T2r);
Chris@82 184 }
Chris@82 185 }
/* Inputs 12, 7 and 2. */
Chris@82 186 {
Chris@82 187 E TF, T2v, TR, T1T, TL, T1R;
Chris@82 188 {
Chris@82 189 E TB, TE, TC, T2u, TA, TD;
Chris@82 190 TB = ri[WS(rs, 12)];
Chris@82 191 TE = ii[WS(rs, 12)];
Chris@82 192 TA = W[22];
Chris@82 193 TC = TA * TB;
Chris@82 194 T2u = TA * TE;
Chris@82 195 TD = W[23];
Chris@82 196 TF = FMA(TD, TE, TC);
Chris@82 197 T2v = FNMS(TD, TB, T2u);
Chris@82 198 }
Chris@82 199 {
Chris@82 200 E TN, TQ, TO, T1S, TM, TP;
Chris@82 201 TN = ri[WS(rs, 7)];
Chris@82 202 TQ = ii[WS(rs, 7)];
Chris@82 203 TM = W[12];
Chris@82 204 TO = TM * TN;
Chris@82 205 T1S = TM * TQ;
Chris@82 206 TP = W[13];
Chris@82 207 TR = FMA(TP, TQ, TO);
Chris@82 208 T1T = FNMS(TP, TN, T1S);
Chris@82 209 }
Chris@82 210 {
Chris@82 211 E TH, TK, TI, T1Q, TG, TJ;
Chris@82 212 TH = ri[WS(rs, 2)];
Chris@82 213 TK = ii[WS(rs, 2)];
Chris@82 214 TG = W[2];
Chris@82 215 TI = TG * TH;
Chris@82 216 T1Q = TG * TK;
Chris@82 217 TJ = W[3];
Chris@82 218 TL = FMA(TJ, TK, TI);
Chris@82 219 T1R = FNMS(TJ, TH, T1Q);
Chris@82 220 }
Chris@82 221 {
Chris@82 222 E T1U, TS, T1P, T2y, T2w, T2x;
Chris@82 223 T1U = T1R - T1T;
Chris@82 224 TS = TL + TR;
Chris@82 225 T1P = FNMS(KP500000000, TS, TF);
Chris@82 226 TT = TF + TS;
Chris@82 227 T2f = FMA(KP866025403, T1U, T1P);
Chris@82 228 T1V = FNMS(KP866025403, T1U, T1P);
Chris@82 229 T2y = TR - TL;
Chris@82 230 T2w = T1R + T1T;
Chris@82 231 T2x = FNMS(KP500000000, T2w, T2v);
Chris@82 232 T2z = FMA(KP866025403, T2y, T2x);
Chris@82 233 T3a = T2v + T2w;
Chris@82 234 T2Y = FNMS(KP866025403, T2y, T2x);
Chris@82 235 }
Chris@82 236 }
/* Inputs 6, 1 and 11. */
Chris@82 237 {
Chris@82 238 E T10, T2C, T1c, T21, T16, T1Z;
Chris@82 239 {
Chris@82 240 E TW, TZ, TX, T2B, TV, TY;
Chris@82 241 TW = ri[WS(rs, 6)];
Chris@82 242 TZ = ii[WS(rs, 6)];
Chris@82 243 TV = W[10];
Chris@82 244 TX = TV * TW;
Chris@82 245 T2B = TV * TZ;
Chris@82 246 TY = W[11];
Chris@82 247 T10 = FMA(TY, TZ, TX);
Chris@82 248 T2C = FNMS(TY, TW, T2B);
Chris@82 249 }
Chris@82 250 {
Chris@82 251 E T18, T1b, T19, T20, T17, T1a;
Chris@82 252 T18 = ri[WS(rs, 1)];
Chris@82 253 T1b = ii[WS(rs, 1)];
Chris@82 254 T17 = W[0];
Chris@82 255 T19 = T17 * T18;
Chris@82 256 T20 = T17 * T1b;
Chris@82 257 T1a = W[1];
Chris@82 258 T1c = FMA(T1a, T1b, T19);
Chris@82 259 T21 = FNMS(T1a, T18, T20);
Chris@82 260 }
Chris@82 261 {
Chris@82 262 E T12, T15, T13, T1Y, T11, T14;
Chris@82 263 T12 = ri[WS(rs, 11)];
Chris@82 264 T15 = ii[WS(rs, 11)];
Chris@82 265 T11 = W[20];
Chris@82 266 T13 = T11 * T12;
Chris@82 267 T1Y = T11 * T15;
Chris@82 268 T14 = W[21];
Chris@82 269 T16 = FMA(T14, T15, T13);
Chris@82 270 T1Z = FNMS(T14, T12, T1Y);
Chris@82 271 }
Chris@82 272 {
Chris@82 273 E T22, T1d, T1X, T2F, T2D, T2E;
Chris@82 274 T22 = T1Z - T21;
Chris@82 275 T1d = T16 + T1c;
Chris@82 276 T1X = FNMS(KP500000000, T1d, T10);
Chris@82 277 T1e = T10 + T1d;
Chris@82 278 T2h = FMA(KP866025403, T22, T1X);
Chris@82 279 T23 = FNMS(KP866025403, T22, T1X);
Chris@82 280 T2F = T1c - T16;
Chris@82 281 T2D = T1Z + T21;
Chris@82 282 T2E = FNMS(KP500000000, T2D, T2C);
Chris@82 283 T2G = FMA(KP866025403, T2F, T2E);
Chris@82 284 T36 = T2C + T2D;
Chris@82 285 T2U = FNMS(KP866025403, T2F, T2E);
Chris@82 286 }
Chris@82 287 }
/* Stores ri[] at offsets 0, 9, 6, 12 and 3. */
Chris@82 288 {
Chris@82 289 E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
Chris@82 290 {
Chris@82 291 E T38, T3b, TU, T1z;
Chris@82 292 T38 = T36 - T37;
Chris@82 293 T3b = T39 - T3a;
Chris@82 294 T3c = FNMS(KP618033988, T3b, T38);
Chris@82 295 T3e = FMA(KP618033988, T38, T3b);
Chris@82 296 Tf = T1 + Te;
Chris@82 297 TU = Tz + TT;
Chris@82 298 T1z = T1e + T1y;
Chris@82 299 T1A = TU + T1z;
Chris@82 300 T33 = FNMS(KP250000000, T1A, Tf);
Chris@82 301 T34 = TU - T1z;
Chris@82 302 }
Chris@82 303 ri[0] = Tf + T1A;
Chris@82 304 T3d = FMA(KP559016994, T34, T33);
Chris@82 305 ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
Chris@82 306 ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
Chris@82 307 T35 = FNMS(KP559016994, T34, T33);
Chris@82 308 ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
Chris@82 309 ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
Chris@82 310 }
/* Stores ii[] at offsets 0, 6, 9, 3 and 12. */
Chris@82 311 {
Chris@82 312 E T3q, T3s, T3k, T3h, T3l, T3m, T3r, T3n;
Chris@82 313 {
Chris@82 314 E T3o, T3p, T3f, T3g;
Chris@82 315 T3o = T1e - T1y;
Chris@82 316 T3p = Tz - TT;
Chris@82 317 T3q = FNMS(KP618033988, T3p, T3o);
Chris@82 318 T3s = FMA(KP618033988, T3o, T3p);
Chris@82 319 T3k = T3i + T3j;
Chris@82 320 T3f = T39 + T3a;
Chris@82 321 T3g = T36 + T37;
Chris@82 322 T3h = T3f + T3g;
Chris@82 323 T3l = FNMS(KP250000000, T3h, T3k);
Chris@82 324 T3m = T3f - T3g;
Chris@82 325 }
Chris@82 326 ii[0] = T3h + T3k;
Chris@82 327 T3r = FMA(KP559016994, T3m, T3l);
Chris@82 328 ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
Chris@82 329 ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
Chris@82 330 T3n = FNMS(KP559016994, T3m, T3l);
Chris@82 331 ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
Chris@82 332 ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
Chris@82 333 }
/* Stores ri[] at offsets 5, 14, 11, 2 and 8. */
Chris@82 334 {
Chris@82 335 E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
Chris@82 336 {
Chris@82 337 E T2W, T2Z, T1W, T2b;
Chris@82 338 T2W = T2U - T2V;
Chris@82 339 T2Z = T2X - T2Y;
Chris@82 340 T30 = FNMS(KP618033988, T2Z, T2W);
Chris@82 341 T32 = FMA(KP618033988, T2W, T2Z);
Chris@82 342 T1H = FNMS(KP866025403, T1G, T1B);
Chris@82 343 T1W = T1O + T1V;
Chris@82 344 T2b = T23 + T2a;
Chris@82 345 T2c = T1W + T2b;
Chris@82 346 T2R = FNMS(KP250000000, T2c, T1H);
Chris@82 347 T2S = T1W - T2b;
Chris@82 348 }
Chris@82 349 ri[WS(rs, 5)] = T1H + T2c;
Chris@82 350 T31 = FMA(KP559016994, T2S, T2R);
Chris@82 351 ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
Chris@82 352 ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
Chris@82 353 T2T = FNMS(KP559016994, T2S, T2R);
Chris@82 354 ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
Chris@82 355 ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
Chris@82 356 }
/* Stores ii[] at offsets 5, 11, 14, 2 and 8. */
Chris@82 357 {
Chris@82 358 E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
Chris@82 359 {
Chris@82 360 E T3O, T3P, T3I, T3J;
Chris@82 361 T3O = T23 - T2a;
Chris@82 362 T3P = T1O - T1V;
Chris@82 363 T3Q = FNMS(KP618033988, T3P, T3O);
Chris@82 364 T3S = FMA(KP618033988, T3O, T3P);
Chris@82 365 T3H = FNMS(KP866025403, T3u, T3t);
Chris@82 366 T3I = T2X + T2Y;
Chris@82 367 T3J = T2U + T2V;
Chris@82 368 T3K = T3I + T3J;
Chris@82 369 T3L = FNMS(KP250000000, T3K, T3H);
Chris@82 370 T3M = T3I - T3J;
Chris@82 371 }
Chris@82 372 ii[WS(rs, 5)] = T3K + T3H;
Chris@82 373 T3R = FMA(KP559016994, T3M, T3L);
Chris@82 374 ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
Chris@82 375 ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
Chris@82 376 T3N = FNMS(KP559016994, T3M, T3L);
Chris@82 377 ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
Chris@82 378 ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
Chris@82 379 }
/* Stores ii[] at offsets 10, 7, 13, 1 and 4. */
Chris@82 380 {
Chris@82 381 E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
Chris@82 382 {
Chris@82 383 E T3C, T3D, T3w, T3x;
Chris@82 384 T3C = T2e - T2f;
Chris@82 385 T3D = T2h - T2i;
Chris@82 386 T3E = FMA(KP618033988, T3D, T3C);
Chris@82 387 T3G = FNMS(KP618033988, T3C, T3D);
Chris@82 388 T3v = FMA(KP866025403, T3u, T3t);
Chris@82 389 T3w = T2t + T2z;
Chris@82 390 T3x = T2G + T2M;
Chris@82 391 T3y = T3w + T3x;
Chris@82 392 T3z = FNMS(KP250000000, T3y, T3v);
Chris@82 393 T3A = T3w - T3x;
Chris@82 394 }
Chris@82 395 ii[WS(rs, 10)] = T3y + T3v;
Chris@82 396 T3F = FNMS(KP559016994, T3A, T3z);
Chris@82 397 ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
Chris@82 398 ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
Chris@82 399 T3B = FMA(KP559016994, T3A, T3z);
Chris@82 400 ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
Chris@82 401 ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
Chris@82 402 }
/* Stores ri[] at offsets 10, 7, 13, 4 and 1. */
Chris@82 403 {
Chris@82 404 E T2O, T2Q, T2d, T2k, T2l, T2m, T2P, T2n;
Chris@82 405 {
Chris@82 406 E T2A, T2N, T2g, T2j;
Chris@82 407 T2A = T2t - T2z;
Chris@82 408 T2N = T2G - T2M;
Chris@82 409 T2O = FMA(KP618033988, T2N, T2A);
Chris@82 410 T2Q = FNMS(KP618033988, T2A, T2N);
Chris@82 411 T2d = FMA(KP866025403, T1G, T1B);
Chris@82 412 T2g = T2e + T2f;
Chris@82 413 T2j = T2h + T2i;
Chris@82 414 T2k = T2g + T2j;
Chris@82 415 T2l = FNMS(KP250000000, T2k, T2d);
Chris@82 416 T2m = T2g - T2j;
Chris@82 417 }
Chris@82 418 ri[WS(rs, 10)] = T2d + T2k;
Chris@82 419 T2P = FNMS(KP559016994, T2m, T2l);
Chris@82 420 ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
Chris@82 421 ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
Chris@82 422 T2n = FMA(KP559016994, T2m, T2l);
Chris@82 423 ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
Chris@82 424 ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
Chris@82 425 }
Chris@82 426 }
Chris@82 427 }
Chris@82 428 }
Chris@82 429
/*
 * Two-entry tw_instr table that desc points at (FMA build).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere; the
 * 15 presumably matches the codelet size and TW_NEXT presumably ends the
 * list -- confirm against the tw_instr definition.
 */
Chris@82 430 static const tw_instr twinstr[] = {
Chris@82 431 {TW_FULL, 0, 15},
Chris@82 432 {TW_NEXT, 1, 0}
Chris@82 433 };
Chris@82 434
/*
 * Descriptor handed to the registration call below: size 15, name "t1_15",
 * the twinstr table and &GENUS. The {72, 28, 112, 0} entries equal the
 * add / multiply / fused multiply-add counts quoted in the generator comment
 * for this FMA variant. NOTE(review): the meaning of the fourth count and of
 * the three trailing zeros is set by ct_desc, declared elsewhere -- confirm.
 */
Chris@82 435 static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, {72, 28, 112, 0}, 0, 0, 0 };
Chris@82 436
/*
 * Registration entry point (FMA build): passes the t1_15 kernel and its
 * descriptor to kdft_dit_register for planner p.
 * NOTE(review): X() is a macro defined elsewhere, presumably a symbol-name
 * wrapper -- confirm.
 */
Chris@82 437 void X(codelet_t1_15) (planner *p) {
Chris@82 438 X(kdft_dit_register) (p, t1_15, &desc);
Chris@82 439 }
Chris@82 440 #else
Chris@82 441
Chris@82 442 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
Chris@82 443
Chris@82 444 /*
Chris@82 445 * This function contains 184 FP additions, 112 FP multiplications,
Chris@82 446 * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
Chris@82 447 * 65 stack variables, 6 constants, and 60 memory accesses
Chris@82 448 */
Chris@82 449 #include "dft/scalar/t.h"
Chris@82 450
/*
 * t1_15 (non-FMA variant): size-15 twiddle codelet emitted by genfft's
 * gen_twiddle without -fma (see the "Generated by" comment above). It is
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * For every m in [mb, me) the loop body:
 *   - loads 15 value pairs from ri[]/ii[] at offset 0 and WS(rs, 1..14);
 *   - combines input k (k = 1..14) with the pair W[2k-2], W[2k-1];
 *     input 0 is used exactly as loaded;
 *   - combines the results in groups of three (constants 1/2, sqrt(3)/2)
 *     and then in groups of five (constants 1/4, sqrt(5)/4, ...);
 *   - stores 15 value pairs back to the same ri[]/ii[] offsets.
 *
 * Parameters:
 *   ri, ii - arrays that are read and then overwritten in place
 *            (presumably real and imaginary parts -- confirm with caller)
 *   W      - table of multipliers; W[0..27] are read on each iteration
 *   rs     - stride passed to WS() to address the 15 elements
 *   mb, me - half-open range of loop iterations
 *   ms     - step added to ri and ii after each iteration
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file. The remarks here assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm against the project headers.
 */
Chris@82 451 static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 452 {
/* Numerically: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4, 1/2, sqrt(3)/2. */
Chris@82 453 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@82 454 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@82 455 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@82 456 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@82 457 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 458 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@82 459 {
Chris@82 460 INT m;
/* W starts at W + 28 * mb and moves on by 28 per iteration; ri and ii move on by ms. */
Chris@82 461 for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
Chris@82 462 E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
Chris@82 463 E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
Chris@82 464 E T24, T2v, T1B, T1R;
/* Inputs 0, 5 and 10 (input 0 has no W multiplier). */
Chris@82 465 {
Chris@82 466 E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
Chris@82 467 T1 = ri[0];
Chris@82 468 T2R = ii[0];
Chris@82 469 {
Chris@82 470 E T3, T5, T2, T4;
Chris@82 471 T3 = ri[WS(rs, 5)];
Chris@82 472 T5 = ii[WS(rs, 5)];
Chris@82 473 T2 = W[8];
Chris@82 474 T4 = W[9];
Chris@82 475 T6 = FMA(T2, T3, T4 * T5);
Chris@82 476 T1o = FNMS(T4, T3, T2 * T5);
Chris@82 477 }
Chris@82 478 {
Chris@82 479 E T8, Ta, T7, T9;
Chris@82 480 T8 = ri[WS(rs, 10)];
Chris@82 481 Ta = ii[WS(rs, 10)];
Chris@82 482 T7 = W[18];
Chris@82 483 T9 = W[19];
Chris@82 484 Tb = FMA(T7, T8, T9 * Ta);
Chris@82 485 T1p = FNMS(T9, T8, T7 * Ta);
Chris@82 486 }
Chris@82 487 T1q = KP866025403 * (T1o - T1p);
Chris@82 488 T34 = KP866025403 * (Tb - T6);
Chris@82 489 Tc = T6 + Tb;
Chris@82 490 Td = T1 + Tc;
Chris@82 491 T1n = FNMS(KP500000000, Tc, T1);
Chris@82 492 T2Q = T1o + T1p;
Chris@82 493 T2S = T2Q + T2R;
Chris@82 494 T35 = FNMS(KP500000000, T2Q, T2R);
Chris@82 495 }
/* Inputs 6, 9, 11, 1, 14 and 4. */
Chris@82 496 {
Chris@82 497 E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
Chris@82 498 E T2i;
Chris@82 499 {
Chris@82 500 E TO, TQ, TN, TP;
Chris@82 501 TO = ri[WS(rs, 6)];
Chris@82 502 TQ = ii[WS(rs, 6)];
Chris@82 503 TN = W[10];
Chris@82 504 TP = W[11];
Chris@82 505 TR = FMA(TN, TO, TP * TQ);
Chris@82 506 T2c = FNMS(TP, TO, TN * TQ);
Chris@82 507 }
Chris@82 508 {
Chris@82 509 E T15, T17, T14, T16;
Chris@82 510 T15 = ri[WS(rs, 9)];
Chris@82 511 T17 = ii[WS(rs, 9)];
Chris@82 512 T14 = W[16];
Chris@82 513 T16 = W[17];
Chris@82 514 T18 = FMA(T14, T15, T16 * T17);
Chris@82 515 T2h = FNMS(T16, T15, T14 * T17);
Chris@82 516 }
Chris@82 517 {
Chris@82 518 E TT, TV, TS, TU;
Chris@82 519 TT = ri[WS(rs, 11)];
Chris@82 520 TV = ii[WS(rs, 11)];
Chris@82 521 TS = W[20];
Chris@82 522 TU = W[21];
Chris@82 523 TW = FMA(TS, TT, TU * TV);
Chris@82 524 T1E = FNMS(TU, TT, TS * TV);
Chris@82 525 }
Chris@82 526 {
Chris@82 527 E TY, T10, TX, TZ;
Chris@82 528 TY = ri[WS(rs, 1)];
Chris@82 529 T10 = ii[WS(rs, 1)];
Chris@82 530 TX = W[0];
Chris@82 531 TZ = W[1];
Chris@82 532 T11 = FMA(TX, TY, TZ * T10);
Chris@82 533 T1F = FNMS(TZ, TY, TX * T10);
Chris@82 534 }
Chris@82 535 T12 = TW + T11;
Chris@82 536 T2d = T1E + T1F;
Chris@82 537 {
Chris@82 538 E T1a, T1c, T19, T1b;
Chris@82 539 T1a = ri[WS(rs, 14)];
Chris@82 540 T1c = ii[WS(rs, 14)];
Chris@82 541 T19 = W[26];
Chris@82 542 T1b = W[27];
Chris@82 543 T1d = FMA(T19, T1a, T1b * T1c);
Chris@82 544 T1J = FNMS(T1b, T1a, T19 * T1c);
Chris@82 545 }
Chris@82 546 {
Chris@82 547 E T1f, T1h, T1e, T1g;
Chris@82 548 T1f = ri[WS(rs, 4)];
Chris@82 549 T1h = ii[WS(rs, 4)];
Chris@82 550 T1e = W[6];
Chris@82 551 T1g = W[7];
Chris@82 552 T1i = FMA(T1e, T1f, T1g * T1h);
Chris@82 553 T1K = FNMS(T1g, T1f, T1e * T1h);
Chris@82 554 }
Chris@82 555 T1j = T1d + T1i;
Chris@82 556 T2i = T1J + T1K;
Chris@82 557 {
Chris@82 558 E T1D, T1G, T2g, T2j;
Chris@82 559 T13 = TR + T12;
Chris@82 560 T1k = T18 + T1j;
Chris@82 561 T1l = T13 + T1k;
Chris@82 562 T2E = T2c + T2d;
Chris@82 563 T2F = T2h + T2i;
Chris@82 564 T2O = T2E + T2F;
Chris@82 565 T1D = FNMS(KP500000000, T12, TR);
Chris@82 566 T1G = KP866025403 * (T1E - T1F);
Chris@82 567 T1H = T1D - T1G;
Chris@82 568 T1T = T1D + T1G;
Chris@82 569 T2g = KP866025403 * (T1i - T1d);
Chris@82 570 T2j = FNMS(KP500000000, T2i, T2h);
Chris@82 571 T2k = T2g + T2j;
Chris@82 572 T2t = T2j - T2g;
Chris@82 573 {
Chris@82 574 E T2b, T2e, T1I, T1L;
Chris@82 575 T2b = KP866025403 * (T11 - TW);
Chris@82 576 T2e = FNMS(KP500000000, T2d, T2c);
Chris@82 577 T2f = T2b + T2e;
Chris@82 578 T2s = T2e - T2b;
Chris@82 579 T1I = FNMS(KP500000000, T1j, T18);
Chris@82 580 T1L = KP866025403 * (T1J - T1K);
Chris@82 581 T1M = T1I - T1L;
Chris@82 582 T1U = T1I + T1L;
Chris@82 583 }
Chris@82 584 }
Chris@82 585 }
/* Inputs 3, 12, 8, 13, 2 and 7. */
Chris@82 586 {
Chris@82 587 E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
Chris@82 588 E T27;
Chris@82 589 {
Chris@82 590 E Tf, Th, Te, Tg;
Chris@82 591 Tf = ri[WS(rs, 3)];
Chris@82 592 Th = ii[WS(rs, 3)];
Chris@82 593 Te = W[4];
Chris@82 594 Tg = W[5];
Chris@82 595 Ti = FMA(Te, Tf, Tg * Th);
Chris@82 596 T21 = FNMS(Tg, Tf, Te * Th);
Chris@82 597 }
Chris@82 598 {
Chris@82 599 E Tw, Ty, Tv, Tx;
Chris@82 600 Tw = ri[WS(rs, 12)];
Chris@82 601 Ty = ii[WS(rs, 12)];
Chris@82 602 Tv = W[22];
Chris@82 603 Tx = W[23];
Chris@82 604 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 605 T26 = FNMS(Tx, Tw, Tv * Ty);
Chris@82 606 }
Chris@82 607 {
Chris@82 608 E Tk, Tm, Tj, Tl;
Chris@82 609 Tk = ri[WS(rs, 8)];
Chris@82 610 Tm = ii[WS(rs, 8)];
Chris@82 611 Tj = W[14];
Chris@82 612 Tl = W[15];
Chris@82 613 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@82 614 T1t = FNMS(Tl, Tk, Tj * Tm);
Chris@82 615 }
Chris@82 616 {
Chris@82 617 E Tp, Tr, To, Tq;
Chris@82 618 Tp = ri[WS(rs, 13)];
Chris@82 619 Tr = ii[WS(rs, 13)];
Chris@82 620 To = W[24];
Chris@82 621 Tq = W[25];
Chris@82 622 Ts = FMA(To, Tp, Tq * Tr);
Chris@82 623 T1u = FNMS(Tq, Tp, To * Tr);
Chris@82 624 }
Chris@82 625 Tt = Tn + Ts;
Chris@82 626 T22 = T1t + T1u;
Chris@82 627 {
Chris@82 628 E TB, TD, TA, TC;
Chris@82 629 TB = ri[WS(rs, 2)];
Chris@82 630 TD = ii[WS(rs, 2)];
Chris@82 631 TA = W[2];
Chris@82 632 TC = W[3];
Chris@82 633 TE = FMA(TA, TB, TC * TD);
Chris@82 634 T1y = FNMS(TC, TB, TA * TD);
Chris@82 635 }
Chris@82 636 {
Chris@82 637 E TG, TI, TF, TH;
Chris@82 638 TG = ri[WS(rs, 7)];
Chris@82 639 TI = ii[WS(rs, 7)];
Chris@82 640 TF = W[12];
Chris@82 641 TH = W[13];
Chris@82 642 TJ = FMA(TF, TG, TH * TI);
Chris@82 643 T1z = FNMS(TH, TG, TF * TI);
Chris@82 644 }
Chris@82 645 TK = TE + TJ;
Chris@82 646 T27 = T1y + T1z;
Chris@82 647 {
Chris@82 648 E T1s, T1v, T25, T28;
Chris@82 649 Tu = Ti + Tt;
Chris@82 650 TL = Tz + TK;
Chris@82 651 TM = Tu + TL;
Chris@82 652 T2H = T21 + T22;
Chris@82 653 T2I = T26 + T27;
Chris@82 654 T2N = T2H + T2I;
Chris@82 655 T1s = FNMS(KP500000000, Tt, Ti);
Chris@82 656 T1v = KP866025403 * (T1t - T1u);
Chris@82 657 T1w = T1s - T1v;
Chris@82 658 T1Q = T1s + T1v;
Chris@82 659 T25 = KP866025403 * (TJ - TE);
Chris@82 660 T28 = FNMS(KP500000000, T27, T26);
Chris@82 661 T29 = T25 + T28;
Chris@82 662 T2w = T28 - T25;
Chris@82 663 {
Chris@82 664 E T20, T23, T1x, T1A;
Chris@82 665 T20 = KP866025403 * (Ts - Tn);
Chris@82 666 T23 = FNMS(KP500000000, T22, T21);
Chris@82 667 T24 = T20 + T23;
Chris@82 668 T2v = T23 - T20;
Chris@82 669 T1x = FNMS(KP500000000, TK, Tz);
Chris@82 670 T1A = KP866025403 * (T1y - T1z);
Chris@82 671 T1B = T1x - T1A;
Chris@82 672 T1R = T1x + T1A;
Chris@82 673 }
Chris@82 674 }
Chris@82 675 }
/* Stores ri[] at offsets 0, 9, 6, 12 and 3. */
Chris@82 676 {
Chris@82 677 E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
Chris@82 678 T2C = KP559016994 * (TM - T1l);
Chris@82 679 T1m = TM + T1l;
Chris@82 680 T2B = FNMS(KP250000000, T1m, Td);
Chris@82 681 T2G = T2E - T2F;
Chris@82 682 T2J = T2H - T2I;
Chris@82 683 T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
Chris@82 684 T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
Chris@82 685 ri[0] = Td + T1m;
Chris@82 686 T2L = T2C + T2B;
Chris@82 687 ri[WS(rs, 9)] = T2L - T2M;
Chris@82 688 ri[WS(rs, 6)] = T2L + T2M;
Chris@82 689 T2D = T2B - T2C;
Chris@82 690 ri[WS(rs, 12)] = T2D - T2K;
Chris@82 691 ri[WS(rs, 3)] = T2D + T2K;
Chris@82 692 }
/* Stores ii[] at offsets 0, 6, 9, 3 and 12. */
Chris@82 693 {
Chris@82 694 E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
Chris@82 695 T2U = KP559016994 * (T2N - T2O);
Chris@82 696 T2P = T2N + T2O;
Chris@82 697 T2T = FNMS(KP250000000, T2P, T2S);
Chris@82 698 T2W = T13 - T1k;
Chris@82 699 T2X = Tu - TL;
Chris@82 700 T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
Chris@82 701 T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
Chris@82 702 ii[0] = T2P + T2S;
Chris@82 703 T2Z = T2U + T2T;
Chris@82 704 ii[WS(rs, 6)] = T2Z - T30;
Chris@82 705 ii[WS(rs, 9)] = T30 + T2Z;
Chris@82 706 T2V = T2T - T2U;
Chris@82 707 ii[WS(rs, 3)] = T2V - T2Y;
Chris@82 708 ii[WS(rs, 12)] = T2Y + T2V;
Chris@82 709 }
/* Stores ri[] at offsets 5, 14, 11, 2 and 8. */
Chris@82 710 {
Chris@82 711 E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
Chris@82 712 {
Chris@82 713 E T2u, T2x, T1C, T1N;
Chris@82 714 T2u = T2s - T2t;
Chris@82 715 T2x = T2v - T2w;
Chris@82 716 T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
Chris@82 717 T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
Chris@82 718 T1r = T1n - T1q;
Chris@82 719 T1C = T1w + T1B;
Chris@82 720 T1N = T1H + T1M;
Chris@82 721 T1O = T1C + T1N;
Chris@82 722 T2p = FNMS(KP250000000, T1O, T1r);
Chris@82 723 T2q = KP559016994 * (T1C - T1N);
Chris@82 724 }
Chris@82 725 ri[WS(rs, 5)] = T1r + T1O;
Chris@82 726 T2z = T2q + T2p;
Chris@82 727 ri[WS(rs, 14)] = T2z - T2A;
Chris@82 728 ri[WS(rs, 11)] = T2z + T2A;
Chris@82 729 T2r = T2p - T2q;
Chris@82 730 ri[WS(rs, 2)] = T2r - T2y;
Chris@82 731 ri[WS(rs, 8)] = T2r + T2y;
Chris@82 732 }
/* Stores ii[] at offsets 5, 11, 14, 2 and 8. */
Chris@82 733 {
Chris@82 734 E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
Chris@82 735 {
Chris@82 736 E T3f, T3g, T3j, T3k;
Chris@82 737 T3f = T1H - T1M;
Chris@82 738 T3g = T1w - T1B;
Chris@82 739 T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
Chris@82 740 T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
Chris@82 741 T3i = T35 - T34;
Chris@82 742 T3j = T2v + T2w;
Chris@82 743 T3k = T2s + T2t;
Chris@82 744 T3l = T3j + T3k;
Chris@82 745 T3m = FNMS(KP250000000, T3l, T3i);
Chris@82 746 T3n = KP559016994 * (T3j - T3k);
Chris@82 747 }
Chris@82 748 ii[WS(rs, 5)] = T3l + T3i;
Chris@82 749 T3p = T3n + T3m;
Chris@82 750 ii[WS(rs, 11)] = T3p - T3q;
Chris@82 751 ii[WS(rs, 14)] = T3q + T3p;
Chris@82 752 T3o = T3m - T3n;
Chris@82 753 ii[WS(rs, 2)] = T3h + T3o;
Chris@82 754 ii[WS(rs, 8)] = T3o - T3h;
Chris@82 755 }
/* Stores ii[] at offsets 10, 7, 13, 1 and 4. */
Chris@82 756 {
Chris@82 757 E T3c, T3d, T36, T37, T33, T38, T3e, T39;
Chris@82 758 {
Chris@82 759 E T3a, T3b, T31, T32;
Chris@82 760 T3a = T1Q - T1R;
Chris@82 761 T3b = T1T - T1U;
Chris@82 762 T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
Chris@82 763 T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
Chris@82 764 T36 = T34 + T35;
Chris@82 765 T31 = T24 + T29;
Chris@82 766 T32 = T2f + T2k;
Chris@82 767 T37 = T31 + T32;
Chris@82 768 T33 = KP559016994 * (T31 - T32);
Chris@82 769 T38 = FNMS(KP250000000, T37, T36);
Chris@82 770 }
Chris@82 771 ii[WS(rs, 10)] = T37 + T36;
Chris@82 772 T3e = T38 - T33;
Chris@82 773 ii[WS(rs, 7)] = T3d + T3e;
Chris@82 774 ii[WS(rs, 13)] = T3e - T3d;
Chris@82 775 T39 = T33 + T38;
Chris@82 776 ii[WS(rs, 1)] = T39 - T3c;
Chris@82 777 ii[WS(rs, 4)] = T3c + T39;
Chris@82 778 }
/* Stores ri[] at offsets 10, 7, 13, 4 and 1. */
Chris@82 779 {
Chris@82 780 E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
Chris@82 781 {
Chris@82 782 E T2a, T2l, T1S, T1V;
Chris@82 783 T2a = T24 - T29;
Chris@82 784 T2l = T2f - T2k;
Chris@82 785 T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
Chris@82 786 T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
Chris@82 787 T1P = T1n + T1q;
Chris@82 788 T1S = T1Q + T1R;
Chris@82 789 T1V = T1T + T1U;
Chris@82 790 T1W = T1S + T1V;
Chris@82 791 T1X = KP559016994 * (T1S - T1V);
Chris@82 792 T1Y = FNMS(KP250000000, T1W, T1P);
Chris@82 793 }
Chris@82 794 ri[WS(rs, 10)] = T1P + T1W;
Chris@82 795 T2n = T1Y - T1X;
Chris@82 796 ri[WS(rs, 7)] = T2n - T2o;
Chris@82 797 ri[WS(rs, 13)] = T2n + T2o;
Chris@82 798 T1Z = T1X + T1Y;
Chris@82 799 ri[WS(rs, 4)] = T1Z - T2m;
Chris@82 800 ri[WS(rs, 1)] = T1Z + T2m;
Chris@82 801 }
Chris@82 802 }
Chris@82 803 }
Chris@82 804 }
Chris@82 805
/*
 * Two-entry tw_instr table that desc points at (non-FMA build).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared elsewhere; the
 * 15 presumably matches the codelet size and TW_NEXT presumably ends the
 * list -- confirm against the tw_instr definition.
 */
Chris@82 806 static const tw_instr twinstr[] = {
Chris@82 807 {TW_FULL, 0, 15},
Chris@82 808 {TW_NEXT, 1, 0}
Chris@82 809 };
Chris@82 810
/*
 * Descriptor handed to the registration call below: size 15, name "t1_15",
 * the twinstr table and &GENUS. The {128, 56, 56, 0} entries equal the
 * add / multiply / fused multiply-add counts quoted in the generator comment
 * for this non-FMA variant. NOTE(review): the meaning of the fourth count and
 * of the three trailing zeros is set by ct_desc, declared elsewhere -- confirm.
 */
Chris@82 811 static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, {128, 56, 56, 0}, 0, 0, 0 };
Chris@82 812
/*
 * Registration entry point (non-FMA build): passes the t1_15 kernel and its
 * descriptor to kdft_dit_register for planner p.
 * NOTE(review): X() is a macro defined elsewhere, presumably a symbol-name
 * wrapper -- confirm.
 */
Chris@82 813 void X(codelet_t1_15) (planner *p) {
Chris@82 814 X(kdft_dit_register) (p, t1_15, &desc);
Chris@82 815 }
Chris@82 816 #endif