annotate src/fftw-3.3.5/dft/scalar/codelets/t1_15.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:36:16 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-dft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include t.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 184 FP additions, 140 FP multiplications,
Chris@42 32 * (or, 72 additions, 28 multiplications, 112 fused multiply/add),
Chris@42 33 * 89 stack variables, 6 constants, and 60 memory accesses
Chris@42 34 */
Chris@42 35 #include "t.h"
Chris@42 36
/*
 * t1_15 (HAVE_FMA variant): in-place 15-input twiddle codelet.
 *
 * For each m in [mb, me) the loop body reads the 15 complex values
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..14, multiplies inputs 1..14 by the
 * twiddle pair (W[2k-2], W[2k-1]), combines them, and stores the 15 results
 * back to the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays; read and overwritten in place.
 *           Both are advanced by ms after every iteration.
 *   W       twiddle table; offset by mb * 28 on entry and advanced by 28
 *           per iteration (W[0]..W[27] are read each time).
 *   rs      stride handed to WS() to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): the formulas quoted in the comments below assume
 * FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b; these macros,
 * together with DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE, are defined
 * outside this file -- confirm against the FFTW headers.
 */
Chris@42 37 static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
/*
 * Numeric constants used by the butterflies. By value:
 * KP951056516 = sin(2*pi/5), KP559016994 = sqrt(5)/4,
 * KP618033988 = (sqrt(5) - 1)/2, KP866025403 = sqrt(3)/2.
 */
Chris@42 39 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 40 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 41 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 42 DK(KP618033988, +0.618033988749894848204586834365638117720309180);
Chris@42 43 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 44 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 45 {
Chris@42 46 INT m;
/* One 15-input transform per iteration; ri/ii step by ms, W by 28 values. */
Chris@42 47 for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
/* These temporaries are set inside the nested scope below and consumed by
 * the final real-output stores at the end of the loop body. */
Chris@42 48 E T2d, T2O, T2Q, T2m, T2k, T2l, T2P, T2n;
Chris@42 49 {
Chris@42 50 E T1G, T3u, T3k, T3t, T1B, Tf, T37, T1y, T2V, T2M, T2a, T2i, T39, Tz, T2X;
Chris@42 51 E T2t, T1O, T2e, T3a, TT, T10, T2Y, T2z, T1V, T2f, T2C, T12, T15, T14, T21;
Chris@42 52 E T1c, T1Y, T13;
Chris@42 53 {
Chris@42 54 E T2I, T1k, T1m, T1p, T1o, T28, T1w, T25, T1n;
Chris@42 55 {
Chris@42 56 E T1, T3j, T9, Tc, Tb, T1D, T7, T1E, Ta, T1j, T1i, T1h;
/* Input 0 is used as-is (no twiddle multiplication). */
Chris@42 57 T1 = ri[0];
Chris@42 58 T3j = ii[0];
Chris@42 59 {
Chris@42 60 E T3, T6, T2, T5, T1C, T4, T8;
/* Inputs 5 (W[8], W[9]) and 10 (W[18], W[19]). For a twiddle (wr, wi) and
 * input (re, im) the products formed are wr*re + wi*im and wr*im - wi*re. */
Chris@42 61 T3 = ri[WS(rs, 5)];
Chris@42 62 T6 = ii[WS(rs, 5)];
Chris@42 63 T2 = W[8];
Chris@42 64 T5 = W[9];
Chris@42 65 T9 = ri[WS(rs, 10)];
Chris@42 66 Tc = ii[WS(rs, 10)];
Chris@42 67 T1C = T2 * T6;
Chris@42 68 T4 = T2 * T3;
Chris@42 69 T8 = W[18];
Chris@42 70 Tb = W[19];
Chris@42 71 T1D = FNMS(T5, T3, T1C);
Chris@42 72 T7 = FMA(T5, T6, T4);
Chris@42 73 T1E = T8 * Tc;
Chris@42 74 Ta = T8 * T9;
Chris@42 75 }
Chris@42 76 {
Chris@42 77 E T1g, T1F, Td, T1f, T3i, Te, T2H;
/* Load input 9 (W[16], W[17]) and finish combining inputs 0, 5 and 10
 * (sums plus KP500000000-weighted terms; differences kept in T1G / T3u). */
Chris@42 78 T1g = ri[WS(rs, 9)];
Chris@42 79 T1j = ii[WS(rs, 9)];
Chris@42 80 T1F = FNMS(Tb, T9, T1E);
Chris@42 81 Td = FMA(Tb, Tc, Ta);
Chris@42 82 T1f = W[16];
Chris@42 83 T1i = W[17];
Chris@42 84 T1G = T1D - T1F;
Chris@42 85 T3i = T1D + T1F;
Chris@42 86 T3u = Td - T7;
Chris@42 87 Te = T7 + Td;
Chris@42 88 T2H = T1f * T1j;
Chris@42 89 T1h = T1f * T1g;
Chris@42 90 T3k = T3i + T3j;
Chris@42 91 T3t = FNMS(KP500000000, T3i, T3j);
Chris@42 92 T1B = FNMS(KP500000000, Te, T1);
Chris@42 93 Tf = T1 + Te;
Chris@42 94 T2I = FNMS(T1i, T1g, T2H);
Chris@42 95 }
Chris@42 96 T1k = FMA(T1i, T1j, T1h);
Chris@42 97 {
Chris@42 98 E T1s, T1v, T1r, T1u, T27, T1t, T1l;
/* Inputs 4 (W[6], W[7]) and 14 (W[26], W[27]). */
Chris@42 99 T1s = ri[WS(rs, 4)];
Chris@42 100 T1v = ii[WS(rs, 4)];
Chris@42 101 T1r = W[6];
Chris@42 102 T1u = W[7];
Chris@42 103 T1m = ri[WS(rs, 14)];
Chris@42 104 T1p = ii[WS(rs, 14)];
Chris@42 105 T27 = T1r * T1v;
Chris@42 106 T1t = T1r * T1s;
Chris@42 107 T1l = W[26];
Chris@42 108 T1o = W[27];
Chris@42 109 T28 = FNMS(T1u, T1s, T27);
Chris@42 110 T1w = FMA(T1u, T1v, T1t);
Chris@42 111 T25 = T1l * T1p;
Chris@42 112 T1n = T1l * T1m;
Chris@42 113 }
Chris@42 114 }
Chris@42 115 {
Chris@42 116 E Tl, T2p, Tn, Tq, Tp, T1M, Tx, T1J, To;
Chris@42 117 {
Chris@42 118 E Th, Tk, T26, T1q, Tg, Tj;
/* Load input 3 (W[4], W[5]) and combine inputs 9, 14 and 4. */
Chris@42 119 Th = ri[WS(rs, 3)];
Chris@42 120 Tk = ii[WS(rs, 3)];
Chris@42 121 T26 = FNMS(T1o, T1m, T25);
Chris@42 122 T1q = FMA(T1o, T1p, T1n);
Chris@42 123 Tg = W[4];
Chris@42 124 Tj = W[5];
Chris@42 125 {
Chris@42 126 E T29, T2J, T1x, T2L;
Chris@42 127 T29 = T26 - T28;
Chris@42 128 T2J = T26 + T28;
Chris@42 129 T1x = T1q + T1w;
Chris@42 130 T2L = T1w - T1q;
Chris@42 131 {
Chris@42 132 E T2o, Ti, T2K, T24;
Chris@42 133 T2o = Tg * Tk;
Chris@42 134 Ti = Tg * Th;
Chris@42 135 T2K = FNMS(KP500000000, T2J, T2I);
Chris@42 136 T37 = T2I + T2J;
Chris@42 137 T24 = FNMS(KP500000000, T1x, T1k);
Chris@42 138 T1y = T1k + T1x;
Chris@42 139 Tl = FMA(Tj, Tk, Ti);
Chris@42 140 T2V = FNMS(KP866025403, T2L, T2K);
Chris@42 141 T2M = FMA(KP866025403, T2L, T2K);
Chris@42 142 T2a = FNMS(KP866025403, T29, T24);
Chris@42 143 T2i = FMA(KP866025403, T29, T24);
Chris@42 144 T2p = FNMS(Tj, Th, T2o);
Chris@42 145 }
Chris@42 146 }
Chris@42 147 }
Chris@42 148 {
Chris@42 149 E Tt, Tw, Ts, Tv, T1L, Tu, Tm;
/* Inputs 13 (W[24], W[25]) and 8 (W[14], W[15]). */
Chris@42 150 Tt = ri[WS(rs, 13)];
Chris@42 151 Tw = ii[WS(rs, 13)];
Chris@42 152 Ts = W[24];
Chris@42 153 Tv = W[25];
Chris@42 154 Tn = ri[WS(rs, 8)];
Chris@42 155 Tq = ii[WS(rs, 8)];
Chris@42 156 T1L = Ts * Tw;
Chris@42 157 Tu = Ts * Tt;
Chris@42 158 Tm = W[14];
Chris@42 159 Tp = W[15];
Chris@42 160 T1M = FNMS(Tv, Tt, T1L);
Chris@42 161 Tx = FMA(Tv, Tw, Tu);
Chris@42 162 T1J = Tm * Tq;
Chris@42 163 To = Tm * Tn;
Chris@42 164 }
Chris@42 165 {
Chris@42 166 E TF, T2v, TH, TK, TJ, T1T, TR, T1Q, TI;
Chris@42 167 {
Chris@42 168 E TB, TE, T1K, Tr, TA, TD;
/* Load input 12 (W[22], W[23]) and combine inputs 3, 8 and 13. */
Chris@42 169 TB = ri[WS(rs, 12)];
Chris@42 170 TE = ii[WS(rs, 12)];
Chris@42 171 T1K = FNMS(Tp, Tn, T1J);
Chris@42 172 Tr = FMA(Tp, Tq, To);
Chris@42 173 TA = W[22];
Chris@42 174 TD = W[23];
Chris@42 175 {
Chris@42 176 E T1N, T2q, Ty, T2s;
Chris@42 177 T1N = T1K - T1M;
Chris@42 178 T2q = T1K + T1M;
Chris@42 179 Ty = Tr + Tx;
Chris@42 180 T2s = Tx - Tr;
Chris@42 181 {
Chris@42 182 E T2u, TC, T2r, T1I;
Chris@42 183 T2u = TA * TE;
Chris@42 184 TC = TA * TB;
Chris@42 185 T2r = FNMS(KP500000000, T2q, T2p);
Chris@42 186 T39 = T2p + T2q;
Chris@42 187 T1I = FNMS(KP500000000, Ty, Tl);
Chris@42 188 Tz = Tl + Ty;
Chris@42 189 TF = FMA(TD, TE, TC);
Chris@42 190 T2X = FNMS(KP866025403, T2s, T2r);
Chris@42 191 T2t = FMA(KP866025403, T2s, T2r);
Chris@42 192 T1O = FNMS(KP866025403, T1N, T1I);
Chris@42 193 T2e = FMA(KP866025403, T1N, T1I);
Chris@42 194 T2v = FNMS(TD, TB, T2u);
Chris@42 195 }
Chris@42 196 }
Chris@42 197 }
Chris@42 198 {
Chris@42 199 E TN, TQ, TM, TP, T1S, TO, TG;
/* Inputs 7 (W[12], W[13]) and 2 (W[2], W[3]). */
Chris@42 200 TN = ri[WS(rs, 7)];
Chris@42 201 TQ = ii[WS(rs, 7)];
Chris@42 202 TM = W[12];
Chris@42 203 TP = W[13];
Chris@42 204 TH = ri[WS(rs, 2)];
Chris@42 205 TK = ii[WS(rs, 2)];
Chris@42 206 T1S = TM * TQ;
Chris@42 207 TO = TM * TN;
Chris@42 208 TG = W[2];
Chris@42 209 TJ = W[3];
Chris@42 210 T1T = FNMS(TP, TN, T1S);
Chris@42 211 TR = FMA(TP, TQ, TO);
Chris@42 212 T1Q = TG * TK;
Chris@42 213 TI = TG * TH;
Chris@42 214 }
Chris@42 215 {
Chris@42 216 E TW, TZ, T1R, TL, TV, TY;
/* Load input 6 (W[10], W[11]) and combine inputs 12, 2 and 7. */
Chris@42 217 TW = ri[WS(rs, 6)];
Chris@42 218 TZ = ii[WS(rs, 6)];
Chris@42 219 T1R = FNMS(TJ, TH, T1Q);
Chris@42 220 TL = FMA(TJ, TK, TI);
Chris@42 221 TV = W[10];
Chris@42 222 TY = W[11];
Chris@42 223 {
Chris@42 224 E T1U, T2w, TS, T2y;
Chris@42 225 T1U = T1R - T1T;
Chris@42 226 T2w = T1R + T1T;
Chris@42 227 TS = TL + TR;
Chris@42 228 T2y = TR - TL;
Chris@42 229 {
Chris@42 230 E T2B, TX, T2x, T1P;
Chris@42 231 T2B = TV * TZ;
Chris@42 232 TX = TV * TW;
Chris@42 233 T2x = FNMS(KP500000000, T2w, T2v);
Chris@42 234 T3a = T2v + T2w;
Chris@42 235 T1P = FNMS(KP500000000, TS, TF);
Chris@42 236 TT = TF + TS;
Chris@42 237 T10 = FMA(TY, TZ, TX);
Chris@42 238 T2Y = FNMS(KP866025403, T2y, T2x);
Chris@42 239 T2z = FMA(KP866025403, T2y, T2x);
Chris@42 240 T1V = FNMS(KP866025403, T1U, T1P);
Chris@42 241 T2f = FMA(KP866025403, T1U, T1P);
Chris@42 242 T2C = FNMS(TY, TW, T2B);
Chris@42 243 }
Chris@42 244 }
Chris@42 245 }
Chris@42 246 {
Chris@42 247 E T18, T1b, T17, T1a, T20, T19, T11;
/* Inputs 1 (W[0], W[1]) and 11 (W[20], W[21]). */
Chris@42 248 T18 = ri[WS(rs, 1)];
Chris@42 249 T1b = ii[WS(rs, 1)];
Chris@42 250 T17 = W[0];
Chris@42 251 T1a = W[1];
Chris@42 252 T12 = ri[WS(rs, 11)];
Chris@42 253 T15 = ii[WS(rs, 11)];
Chris@42 254 T20 = T17 * T1b;
Chris@42 255 T19 = T17 * T18;
Chris@42 256 T11 = W[20];
Chris@42 257 T14 = W[21];
Chris@42 258 T21 = FNMS(T1a, T18, T20);
Chris@42 259 T1c = FMA(T1a, T1b, T19);
Chris@42 260 T1Y = T11 * T15;
Chris@42 261 T13 = T11 * T12;
Chris@42 262 }
Chris@42 263 }
Chris@42 264 }
Chris@42 265 }
/* Second stage: finish the {6, 11, 1} group, then combine the five
 * three-input groups and store the outputs. */
Chris@42 266 {
Chris@42 267 E T2G, T2h, T3J, T3I, T32, T30, T1H, T1W, T3P, T3O, T2b;
Chris@42 268 {
Chris@42 269 E T3f, T3b, T1Z, T16, T3p, TU;
Chris@42 270 T3f = T39 + T3a;
Chris@42 271 T3b = T39 - T3a;
Chris@42 272 T1Z = FNMS(T14, T12, T1Y);
Chris@42 273 T16 = FMA(T14, T15, T13);
Chris@42 274 T3p = Tz - TT;
Chris@42 275 TU = Tz + TT;
Chris@42 276 {
Chris@42 277 E T3g, T2U, T23, T3c, T3e, T3q, T3s, T1A, T34, T3r, T3n;
Chris@42 278 {
Chris@42 279 E T22, T1d, T2F, T2E, T36, T2D;
Chris@42 280 T22 = T1Z - T21;
Chris@42 281 T2D = T1Z + T21;
Chris@42 282 T1d = T16 + T1c;
Chris@42 283 T2F = T1c - T16;
Chris@42 284 T2E = FNMS(KP500000000, T2D, T2C);
Chris@42 285 T36 = T2C + T2D;
Chris@42 286 {
Chris@42 287 E T1e, T1X, T38, T1z, T3o;
Chris@42 288 T1e = T10 + T1d;
Chris@42 289 T1X = FNMS(KP500000000, T1d, T10);
Chris@42 290 T38 = T36 - T37;
Chris@42 291 T3g = T36 + T37;
Chris@42 292 T2G = FMA(KP866025403, T2F, T2E);
Chris@42 293 T2U = FNMS(KP866025403, T2F, T2E);
Chris@42 294 T1z = T1e + T1y;
Chris@42 295 T3o = T1e - T1y;
Chris@42 296 T2h = FMA(KP866025403, T22, T1X);
Chris@42 297 T23 = FNMS(KP866025403, T22, T1X);
Chris@42 298 T3c = FNMS(KP618033988, T3b, T38);
Chris@42 299 T3e = FMA(KP618033988, T38, T3b);
Chris@42 300 T3q = FNMS(KP618033988, T3p, T3o);
Chris@42 301 T3s = FMA(KP618033988, T3o, T3p);
Chris@42 302 T1A = TU + T1z;
Chris@42 303 T34 = TU - T1z;
Chris@42 304 }
Chris@42 305 }
Chris@42 306 {
Chris@42 307 E T2W, T33, T3m, T3h, T2Z, T3d, T35, T3l;
/* Stores ri[0], ii[0] and the real outputs 3, 12, 6 and 9. */
Chris@42 308 T3J = T2U + T2V;
Chris@42 309 T2W = T2U - T2V;
Chris@42 310 ri[0] = Tf + T1A;
Chris@42 311 T33 = FNMS(KP250000000, T1A, Tf);
Chris@42 312 T3m = T3f - T3g;
Chris@42 313 T3h = T3f + T3g;
Chris@42 314 T2Z = T2X - T2Y;
Chris@42 315 T3I = T2X + T2Y;
Chris@42 316 T3d = FMA(KP559016994, T34, T33);
Chris@42 317 T35 = FNMS(KP559016994, T34, T33);
Chris@42 318 ii[0] = T3h + T3k;
Chris@42 319 T3l = FNMS(KP250000000, T3h, T3k);
Chris@42 320 ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
Chris@42 321 ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
Chris@42 322 ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
Chris@42 323 ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
Chris@42 324 T3r = FMA(KP559016994, T3m, T3l);
Chris@42 325 T3n = FNMS(KP559016994, T3m, T3l);
Chris@42 326 T32 = FMA(KP618033988, T2W, T2Z);
Chris@42 327 T30 = FNMS(KP618033988, T2Z, T2W);
Chris@42 328 }
/* Imaginary outputs 12, 3, 9 and 6. */
Chris@42 329 ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
Chris@42 330 ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
Chris@42 331 ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
Chris@42 332 ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
Chris@42 333 T2d = FMA(KP866025403, T1G, T1B);
Chris@42 334 T1H = FNMS(KP866025403, T1G, T1B);
Chris@42 335 T1W = T1O + T1V;
Chris@42 336 T3P = T1O - T1V;
Chris@42 337 T3O = T23 - T2a;
Chris@42 338 T2b = T23 + T2a;
Chris@42 339 }
Chris@42 340 }
/* Real and imaginary outputs 5, 8, 2, 11, 14, then imaginary outputs
 * 10, 4, 1, 13 and 7. */
Chris@42 341 {
Chris@42 342 E T3H, T3v, T2S, T3Q, T3S, T2R, T2c;
Chris@42 343 T3H = FNMS(KP866025403, T3u, T3t);
Chris@42 344 T3v = FMA(KP866025403, T3u, T3t);
Chris@42 345 T2c = T1W + T2b;
Chris@42 346 T2S = T1W - T2b;
Chris@42 347 T3Q = FNMS(KP618033988, T3P, T3O);
Chris@42 348 T3S = FMA(KP618033988, T3O, T3P);
Chris@42 349 ri[WS(rs, 5)] = T1H + T2c;
Chris@42 350 T2R = FNMS(KP250000000, T2c, T1H);
Chris@42 351 {
Chris@42 352 E T2g, T2j, T3G, T3E, T2A, T2N, T3y, T3A, T3M, T3L, T3z, T3F, T3B;
Chris@42 353 {
Chris@42 354 E T3C, T3D, T31, T2T, T3K;
Chris@42 355 T2g = T2e + T2f;
Chris@42 356 T3C = T2e - T2f;
Chris@42 357 T3D = T2h - T2i;
Chris@42 358 T2j = T2h + T2i;
Chris@42 359 T31 = FMA(KP559016994, T2S, T2R);
Chris@42 360 T2T = FNMS(KP559016994, T2S, T2R);
Chris@42 361 T3K = T3I + T3J;
Chris@42 362 T3M = T3I - T3J;
Chris@42 363 ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
Chris@42 364 ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
Chris@42 365 ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
Chris@42 366 ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
Chris@42 367 ii[WS(rs, 5)] = T3K + T3H;
Chris@42 368 T3L = FNMS(KP250000000, T3K, T3H);
Chris@42 369 T3G = FNMS(KP618033988, T3C, T3D);
Chris@42 370 T3E = FMA(KP618033988, T3D, T3C);
Chris@42 371 }
Chris@42 372 {
Chris@42 373 E T3N, T3R, T3w, T3x;
Chris@42 374 T3N = FNMS(KP559016994, T3M, T3L);
Chris@42 375 T3R = FMA(KP559016994, T3M, T3L);
Chris@42 376 T3w = T2t + T2z;
Chris@42 377 T2A = T2t - T2z;
Chris@42 378 T2N = T2G - T2M;
Chris@42 379 T3x = T2G + T2M;
Chris@42 380 ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
Chris@42 381 ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
Chris@42 382 ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
Chris@42 383 ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
Chris@42 384 T3y = T3w + T3x;
Chris@42 385 T3A = T3w - T3x;
Chris@42 386 }
Chris@42 387 ii[WS(rs, 10)] = T3y + T3v;
Chris@42 388 T3z = FNMS(KP250000000, T3y, T3v);
Chris@42 389 T2O = FMA(KP618033988, T2N, T2A);
Chris@42 390 T2Q = FNMS(KP618033988, T2A, T2N);
Chris@42 391 T3F = FNMS(KP559016994, T3A, T3z);
Chris@42 392 T3B = FMA(KP559016994, T3A, T3z);
Chris@42 393 ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
Chris@42 394 ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
Chris@42 395 ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
Chris@42 396 ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
Chris@42 397 T2m = T2g - T2j;
Chris@42 398 T2k = T2g + T2j;
Chris@42 399 }
Chris@42 400 }
Chris@42 401 }
Chris@42 402 }
/* Remaining real outputs 10, 1, 4, 13 and 7, built from the loop-level
 * temporaries T2d, T2k, T2m, T2O and T2Q. */
Chris@42 403 ri[WS(rs, 10)] = T2d + T2k;
Chris@42 404 T2l = FNMS(KP250000000, T2k, T2d);
Chris@42 405 T2P = FNMS(KP559016994, T2m, T2l);
Chris@42 406 T2n = FMA(KP559016994, T2m, T2l);
Chris@42 407 ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
Chris@42 408 ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
Chris@42 409 ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
Chris@42 410 ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
Chris@42 411 }
Chris@42 412 }
Chris@42 413 }
Chris@42 414
/*
 * Twiddle-instruction table referenced by the codelet descriptor `desc`.
 * NOTE(review): presumably {TW_FULL, 0, 15} requests the full twiddle set
 * for a size-15 codelet and {TW_NEXT, 1, 0} terminates the list; tw_instr
 * and the TW_* constants are declared outside this file -- confirm there.
 */
Chris@42 415 static const tw_instr twinstr[] = {
Chris@42 416 {TW_FULL, 0, 15},
Chris@42 417 {TW_NEXT, 1, 0}
Chris@42 418 };
Chris@42 419
/*
 * Descriptor passed to the planner together with t1_15 (HAVE_FMA build).
 * The {72, 28, 112, 0} entry repeats the counts from the generated header
 * comment above: 72 additions, 28 multiplications, 112 fused multiply/adds.
 * NOTE(review): the meaning of the remaining fields (15, name string,
 * twinstr, &GENUS and the trailing zeros) comes from the ct_desc
 * declaration, which is not visible here.
 */
Chris@42 420 static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, {72, 28, 112, 0}, 0, 0, 0 };
Chris@42 421
/*
 * Public entry point (HAVE_FMA build): registers the t1_15 kernel and its
 * descriptor with planner `p` by calling X(kdft_dit_register).
 * X() is a name-wrapping macro defined outside this file.
 */
Chris@42 422 void X(codelet_t1_15) (planner *p) {
Chris@42 423 X(kdft_dit_register) (p, t1_15, &desc);
Chris@42 424 }
Chris@42 425 #else /* HAVE_FMA */
Chris@42 426
Chris@42 427 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include t.h */
Chris@42 428
Chris@42 429 /*
Chris@42 430 * This function contains 184 FP additions, 112 FP multiplications,
Chris@42 431 * (or, 128 additions, 56 multiplications, 56 fused multiply/add),
Chris@42 432 * 65 stack variables, 6 constants, and 60 memory accesses
Chris@42 433 */
Chris@42 434 #include "t.h"
Chris@42 435
/*
 * t1_15 (non-FMA variant): in-place 15-input twiddle codelet.
 *
 * For each m in [mb, me) the loop body reads the 15 complex values
 * ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..14, multiplies inputs 1..14 by the
 * twiddle pair (W[2k-2], W[2k-1]), combines them, and stores the 15 results
 * back to the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays; read and overwritten in place.
 *           Both are advanced by ms after every iteration.
 *   W       twiddle table; offset by mb * 28 on entry and advanced by 28
 *           per iteration (W[0]..W[27] are read each time).
 *   rs      stride handed to WS() to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): the formulas quoted in the comments below assume
 * FMA(a, b, c) == a * b + c and FNMS(a, b, c) == c - a * b; these macros,
 * together with DK, WS, E, R, INT and MAKE_VOLATILE_STRIDE, are defined
 * outside this file -- confirm against the FFTW headers.
 */
Chris@42 436 static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 437 {
/*
 * Numeric constants used by the butterflies. By value:
 * KP587785252 = sin(pi/5), KP951056516 = sin(2*pi/5),
 * KP559016994 = sqrt(5)/4, KP866025403 = sqrt(3)/2.
 */
Chris@42 438 DK(KP587785252, +0.587785252292473129168705954639072768597652438);
Chris@42 439 DK(KP951056516, +0.951056516295153572116439333379382143405698634);
Chris@42 440 DK(KP250000000, +0.250000000000000000000000000000000000000000000);
Chris@42 441 DK(KP559016994, +0.559016994374947424102293417182819058860154590);
Chris@42 442 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 443 DK(KP866025403, +0.866025403784438646763723170752936183471402627);
Chris@42 444 {
Chris@42 445 INT m;
/* One 15-input transform per iteration; ri/ii step by ms, W by 28 values. */
Chris@42 446 for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
Chris@42 447 E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
Chris@42 448 E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
Chris@42 449 E T24, T2v, T1B, T1R;
/* Inputs 0, 5 and 10. Input 0 is used as-is; for a twiddle (wr, wi) and
 * input (re, im) the products formed are wr*re + wi*im and wr*im - wi*re.
 * The three values are then combined into sums (Td, T2S),
 * KP500000000-weighted terms (T1n, T35) and KP866025403-scaled
 * differences (T1q, T34). */
Chris@42 450 {
Chris@42 451 E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
Chris@42 452 T1 = ri[0];
Chris@42 453 T2R = ii[0];
Chris@42 454 {
Chris@42 455 E T3, T5, T2, T4;
Chris@42 456 T3 = ri[WS(rs, 5)];
Chris@42 457 T5 = ii[WS(rs, 5)];
Chris@42 458 T2 = W[8];
Chris@42 459 T4 = W[9];
Chris@42 460 T6 = FMA(T2, T3, T4 * T5);
Chris@42 461 T1o = FNMS(T4, T3, T2 * T5);
Chris@42 462 }
Chris@42 463 {
Chris@42 464 E T8, Ta, T7, T9;
Chris@42 465 T8 = ri[WS(rs, 10)];
Chris@42 466 Ta = ii[WS(rs, 10)];
Chris@42 467 T7 = W[18];
Chris@42 468 T9 = W[19];
Chris@42 469 Tb = FMA(T7, T8, T9 * Ta);
Chris@42 470 T1p = FNMS(T9, T8, T7 * Ta);
Chris@42 471 }
Chris@42 472 T1q = KP866025403 * (T1o - T1p);
Chris@42 473 T34 = KP866025403 * (Tb - T6);
Chris@42 474 Tc = T6 + Tb;
Chris@42 475 Td = T1 + Tc;
Chris@42 476 T1n = FNMS(KP500000000, Tc, T1);
Chris@42 477 T2Q = T1o + T1p;
Chris@42 478 T2S = T2Q + T2R;
Chris@42 479 T35 = FNMS(KP500000000, T2Q, T2R);
Chris@42 480 }
/* Twiddle inputs 6, 9, 11, 1, 14 and 4, then combine them as the two
 * three-input groups {6, 11, 1} and {9, 14, 4}. */
Chris@42 481 {
Chris@42 482 E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
Chris@42 483 E T2i;
Chris@42 484 {
Chris@42 485 E TO, TQ, TN, TP;
Chris@42 486 TO = ri[WS(rs, 6)];
Chris@42 487 TQ = ii[WS(rs, 6)];
Chris@42 488 TN = W[10];
Chris@42 489 TP = W[11];
Chris@42 490 TR = FMA(TN, TO, TP * TQ);
Chris@42 491 T2c = FNMS(TP, TO, TN * TQ);
Chris@42 492 }
Chris@42 493 {
Chris@42 494 E T15, T17, T14, T16;
Chris@42 495 T15 = ri[WS(rs, 9)];
Chris@42 496 T17 = ii[WS(rs, 9)];
Chris@42 497 T14 = W[16];
Chris@42 498 T16 = W[17];
Chris@42 499 T18 = FMA(T14, T15, T16 * T17);
Chris@42 500 T2h = FNMS(T16, T15, T14 * T17);
Chris@42 501 }
Chris@42 502 {
Chris@42 503 E TT, TV, TS, TU;
Chris@42 504 TT = ri[WS(rs, 11)];
Chris@42 505 TV = ii[WS(rs, 11)];
Chris@42 506 TS = W[20];
Chris@42 507 TU = W[21];
Chris@42 508 TW = FMA(TS, TT, TU * TV);
Chris@42 509 T1E = FNMS(TU, TT, TS * TV);
Chris@42 510 }
Chris@42 511 {
Chris@42 512 E TY, T10, TX, TZ;
Chris@42 513 TY = ri[WS(rs, 1)];
Chris@42 514 T10 = ii[WS(rs, 1)];
Chris@42 515 TX = W[0];
Chris@42 516 TZ = W[1];
Chris@42 517 T11 = FMA(TX, TY, TZ * T10);
Chris@42 518 T1F = FNMS(TZ, TY, TX * T10);
Chris@42 519 }
Chris@42 520 T12 = TW + T11;
Chris@42 521 T2d = T1E + T1F;
Chris@42 522 {
Chris@42 523 E T1a, T1c, T19, T1b;
Chris@42 524 T1a = ri[WS(rs, 14)];
Chris@42 525 T1c = ii[WS(rs, 14)];
Chris@42 526 T19 = W[26];
Chris@42 527 T1b = W[27];
Chris@42 528 T1d = FMA(T19, T1a, T1b * T1c);
Chris@42 529 T1J = FNMS(T1b, T1a, T19 * T1c);
Chris@42 530 }
Chris@42 531 {
Chris@42 532 E T1f, T1h, T1e, T1g;
Chris@42 533 T1f = ri[WS(rs, 4)];
Chris@42 534 T1h = ii[WS(rs, 4)];
Chris@42 535 T1e = W[6];
Chris@42 536 T1g = W[7];
Chris@42 537 T1i = FMA(T1e, T1f, T1g * T1h);
Chris@42 538 T1K = FNMS(T1g, T1f, T1e * T1h);
Chris@42 539 }
Chris@42 540 T1j = T1d + T1i;
Chris@42 541 T2i = T1J + T1K;
Chris@42 542 {
Chris@42 543 E T1D, T1G, T2g, T2j;
Chris@42 544 T13 = TR + T12;
Chris@42 545 T1k = T18 + T1j;
Chris@42 546 T1l = T13 + T1k;
Chris@42 547 T2E = T2c + T2d;
Chris@42 548 T2F = T2h + T2i;
Chris@42 549 T2O = T2E + T2F;
Chris@42 550 T1D = FNMS(KP500000000, T12, TR);
Chris@42 551 T1G = KP866025403 * (T1E - T1F);
Chris@42 552 T1H = T1D - T1G;
Chris@42 553 T1T = T1D + T1G;
Chris@42 554 T2g = KP866025403 * (T1i - T1d);
Chris@42 555 T2j = FNMS(KP500000000, T2i, T2h);
Chris@42 556 T2k = T2g + T2j;
Chris@42 557 T2t = T2j - T2g;
Chris@42 558 {
Chris@42 559 E T2b, T2e, T1I, T1L;
Chris@42 560 T2b = KP866025403 * (T11 - TW);
Chris@42 561 T2e = FNMS(KP500000000, T2d, T2c);
Chris@42 562 T2f = T2b + T2e;
Chris@42 563 T2s = T2e - T2b;
Chris@42 564 T1I = FNMS(KP500000000, T1j, T18);
Chris@42 565 T1L = KP866025403 * (T1J - T1K);
Chris@42 566 T1M = T1I - T1L;
Chris@42 567 T1U = T1I + T1L;
Chris@42 568 }
Chris@42 569 }
Chris@42 570 }
/* Twiddle inputs 3, 12, 8, 13, 2 and 7, then combine them as the two
 * three-input groups {3, 8, 13} and {12, 2, 7}. */
Chris@42 571 {
Chris@42 572 E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
Chris@42 573 E T27;
Chris@42 574 {
Chris@42 575 E Tf, Th, Te, Tg;
Chris@42 576 Tf = ri[WS(rs, 3)];
Chris@42 577 Th = ii[WS(rs, 3)];
Chris@42 578 Te = W[4];
Chris@42 579 Tg = W[5];
Chris@42 580 Ti = FMA(Te, Tf, Tg * Th);
Chris@42 581 T21 = FNMS(Tg, Tf, Te * Th);
Chris@42 582 }
Chris@42 583 {
Chris@42 584 E Tw, Ty, Tv, Tx;
Chris@42 585 Tw = ri[WS(rs, 12)];
Chris@42 586 Ty = ii[WS(rs, 12)];
Chris@42 587 Tv = W[22];
Chris@42 588 Tx = W[23];
Chris@42 589 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@42 590 T26 = FNMS(Tx, Tw, Tv * Ty);
Chris@42 591 }
Chris@42 592 {
Chris@42 593 E Tk, Tm, Tj, Tl;
Chris@42 594 Tk = ri[WS(rs, 8)];
Chris@42 595 Tm = ii[WS(rs, 8)];
Chris@42 596 Tj = W[14];
Chris@42 597 Tl = W[15];
Chris@42 598 Tn = FMA(Tj, Tk, Tl * Tm);
Chris@42 599 T1t = FNMS(Tl, Tk, Tj * Tm);
Chris@42 600 }
Chris@42 601 {
Chris@42 602 E Tp, Tr, To, Tq;
Chris@42 603 Tp = ri[WS(rs, 13)];
Chris@42 604 Tr = ii[WS(rs, 13)];
Chris@42 605 To = W[24];
Chris@42 606 Tq = W[25];
Chris@42 607 Ts = FMA(To, Tp, Tq * Tr);
Chris@42 608 T1u = FNMS(Tq, Tp, To * Tr);
Chris@42 609 }
Chris@42 610 Tt = Tn + Ts;
Chris@42 611 T22 = T1t + T1u;
Chris@42 612 {
Chris@42 613 E TB, TD, TA, TC;
Chris@42 614 TB = ri[WS(rs, 2)];
Chris@42 615 TD = ii[WS(rs, 2)];
Chris@42 616 TA = W[2];
Chris@42 617 TC = W[3];
Chris@42 618 TE = FMA(TA, TB, TC * TD);
Chris@42 619 T1y = FNMS(TC, TB, TA * TD);
Chris@42 620 }
Chris@42 621 {
Chris@42 622 E TG, TI, TF, TH;
Chris@42 623 TG = ri[WS(rs, 7)];
Chris@42 624 TI = ii[WS(rs, 7)];
Chris@42 625 TF = W[12];
Chris@42 626 TH = W[13];
Chris@42 627 TJ = FMA(TF, TG, TH * TI);
Chris@42 628 T1z = FNMS(TH, TG, TF * TI);
Chris@42 629 }
Chris@42 630 TK = TE + TJ;
Chris@42 631 T27 = T1y + T1z;
Chris@42 632 {
Chris@42 633 E T1s, T1v, T25, T28;
Chris@42 634 Tu = Ti + Tt;
Chris@42 635 TL = Tz + TK;
Chris@42 636 TM = Tu + TL;
Chris@42 637 T2H = T21 + T22;
Chris@42 638 T2I = T26 + T27;
Chris@42 639 T2N = T2H + T2I;
Chris@42 640 T1s = FNMS(KP500000000, Tt, Ti);
Chris@42 641 T1v = KP866025403 * (T1t - T1u);
Chris@42 642 T1w = T1s - T1v;
Chris@42 643 T1Q = T1s + T1v;
Chris@42 644 T25 = KP866025403 * (TJ - TE);
Chris@42 645 T28 = FNMS(KP500000000, T27, T26);
Chris@42 646 T29 = T25 + T28;
Chris@42 647 T2w = T28 - T25;
Chris@42 648 {
Chris@42 649 E T20, T23, T1x, T1A;
Chris@42 650 T20 = KP866025403 * (Ts - Tn);
Chris@42 651 T23 = FNMS(KP500000000, T22, T21);
Chris@42 652 T24 = T20 + T23;
Chris@42 653 T2v = T23 - T20;
Chris@42 654 T1x = FNMS(KP500000000, TK, Tz);
Chris@42 655 T1A = KP866025403 * (T1y - T1z);
Chris@42 656 T1B = T1x - T1A;
Chris@42 657 T1R = T1x + T1A;
Chris@42 658 }
Chris@42 659 }
Chris@42 660 }
/* Real outputs 0, 9, 6, 12 and 3; ri[0] is the sum of all 15 twiddled
 * real parts (Td plus the four group sums folded into T1m). */
Chris@42 661 {
Chris@42 662 E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
Chris@42 663 T2C = KP559016994 * (TM - T1l);
Chris@42 664 T1m = TM + T1l;
Chris@42 665 T2B = FNMS(KP250000000, T1m, Td);
Chris@42 666 T2G = T2E - T2F;
Chris@42 667 T2J = T2H - T2I;
Chris@42 668 T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
Chris@42 669 T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
Chris@42 670 ri[0] = Td + T1m;
Chris@42 671 T2L = T2C + T2B;
Chris@42 672 ri[WS(rs, 9)] = T2L - T2M;
Chris@42 673 ri[WS(rs, 6)] = T2L + T2M;
Chris@42 674 T2D = T2B - T2C;
Chris@42 675 ri[WS(rs, 12)] = T2D - T2K;
Chris@42 676 ri[WS(rs, 3)] = T2D + T2K;
Chris@42 677 }
/* Imaginary outputs 0, 6, 9, 3 and 12. */
Chris@42 678 {
Chris@42 679 E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
Chris@42 680 T2U = KP559016994 * (T2N - T2O);
Chris@42 681 T2P = T2N + T2O;
Chris@42 682 T2T = FNMS(KP250000000, T2P, T2S);
Chris@42 683 T2W = T13 - T1k;
Chris@42 684 T2X = Tu - TL;
Chris@42 685 T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
Chris@42 686 T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
Chris@42 687 ii[0] = T2P + T2S;
Chris@42 688 T2Z = T2U + T2T;
Chris@42 689 ii[WS(rs, 6)] = T2Z - T30;
Chris@42 690 ii[WS(rs, 9)] = T30 + T2Z;
Chris@42 691 T2V = T2T - T2U;
Chris@42 692 ii[WS(rs, 3)] = T2V - T2Y;
Chris@42 693 ii[WS(rs, 12)] = T2Y + T2V;
Chris@42 694 }
/* Real outputs 5, 14, 11, 2 and 8. */
Chris@42 695 {
Chris@42 696 E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
Chris@42 697 {
Chris@42 698 E T2u, T2x, T1C, T1N;
Chris@42 699 T2u = T2s - T2t;
Chris@42 700 T2x = T2v - T2w;
Chris@42 701 T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
Chris@42 702 T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
Chris@42 703 T1r = T1n - T1q;
Chris@42 704 T1C = T1w + T1B;
Chris@42 705 T1N = T1H + T1M;
Chris@42 706 T1O = T1C + T1N;
Chris@42 707 T2p = FNMS(KP250000000, T1O, T1r);
Chris@42 708 T2q = KP559016994 * (T1C - T1N);
Chris@42 709 }
Chris@42 710 ri[WS(rs, 5)] = T1r + T1O;
Chris@42 711 T2z = T2q + T2p;
Chris@42 712 ri[WS(rs, 14)] = T2z - T2A;
Chris@42 713 ri[WS(rs, 11)] = T2z + T2A;
Chris@42 714 T2r = T2p - T2q;
Chris@42 715 ri[WS(rs, 2)] = T2r - T2y;
Chris@42 716 ri[WS(rs, 8)] = T2r + T2y;
Chris@42 717 }
/* Imaginary outputs 5, 11, 14, 2 and 8. */
Chris@42 718 {
Chris@42 719 E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
Chris@42 720 {
Chris@42 721 E T3f, T3g, T3j, T3k;
Chris@42 722 T3f = T1H - T1M;
Chris@42 723 T3g = T1w - T1B;
Chris@42 724 T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
Chris@42 725 T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
Chris@42 726 T3i = T35 - T34;
Chris@42 727 T3j = T2v + T2w;
Chris@42 728 T3k = T2s + T2t;
Chris@42 729 T3l = T3j + T3k;
Chris@42 730 T3m = FNMS(KP250000000, T3l, T3i);
Chris@42 731 T3n = KP559016994 * (T3j - T3k);
Chris@42 732 }
Chris@42 733 ii[WS(rs, 5)] = T3l + T3i;
Chris@42 734 T3p = T3n + T3m;
Chris@42 735 ii[WS(rs, 11)] = T3p - T3q;
Chris@42 736 ii[WS(rs, 14)] = T3q + T3p;
Chris@42 737 T3o = T3m - T3n;
Chris@42 738 ii[WS(rs, 2)] = T3h + T3o;
Chris@42 739 ii[WS(rs, 8)] = T3o - T3h;
Chris@42 740 }
/* Imaginary outputs 10, 7, 13, 1 and 4. */
Chris@42 741 {
Chris@42 742 E T3c, T3d, T36, T37, T33, T38, T3e, T39;
Chris@42 743 {
Chris@42 744 E T3a, T3b, T31, T32;
Chris@42 745 T3a = T1Q - T1R;
Chris@42 746 T3b = T1T - T1U;
Chris@42 747 T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
Chris@42 748 T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
Chris@42 749 T36 = T34 + T35;
Chris@42 750 T31 = T24 + T29;
Chris@42 751 T32 = T2f + T2k;
Chris@42 752 T37 = T31 + T32;
Chris@42 753 T33 = KP559016994 * (T31 - T32);
Chris@42 754 T38 = FNMS(KP250000000, T37, T36);
Chris@42 755 }
Chris@42 756 ii[WS(rs, 10)] = T37 + T36;
Chris@42 757 T3e = T38 - T33;
Chris@42 758 ii[WS(rs, 7)] = T3d + T3e;
Chris@42 759 ii[WS(rs, 13)] = T3e - T3d;
Chris@42 760 T39 = T33 + T38;
Chris@42 761 ii[WS(rs, 1)] = T39 - T3c;
Chris@42 762 ii[WS(rs, 4)] = T3c + T39;
Chris@42 763 }
/* Real outputs 10, 7, 13, 4 and 1. */
Chris@42 764 {
Chris@42 765 E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
Chris@42 766 {
Chris@42 767 E T2a, T2l, T1S, T1V;
Chris@42 768 T2a = T24 - T29;
Chris@42 769 T2l = T2f - T2k;
Chris@42 770 T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
Chris@42 771 T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
Chris@42 772 T1P = T1n + T1q;
Chris@42 773 T1S = T1Q + T1R;
Chris@42 774 T1V = T1T + T1U;
Chris@42 775 T1W = T1S + T1V;
Chris@42 776 T1X = KP559016994 * (T1S - T1V);
Chris@42 777 T1Y = FNMS(KP250000000, T1W, T1P);
Chris@42 778 }
Chris@42 779 ri[WS(rs, 10)] = T1P + T1W;
Chris@42 780 T2n = T1Y - T1X;
Chris@42 781 ri[WS(rs, 7)] = T2n - T2o;
Chris@42 782 ri[WS(rs, 13)] = T2n + T2o;
Chris@42 783 T1Z = T1X + T1Y;
Chris@42 784 ri[WS(rs, 4)] = T1Z - T2m;
Chris@42 785 ri[WS(rs, 1)] = T1Z + T2m;
Chris@42 786 }
Chris@42 787 }
Chris@42 788 }
Chris@42 789 }
Chris@42 790
/*
 * Twiddle-instruction table referenced by the codelet descriptor `desc`
 * (non-FMA build; same contents as the HAVE_FMA branch).
 * NOTE(review): presumably {TW_FULL, 0, 15} requests the full twiddle set
 * for a size-15 codelet and {TW_NEXT, 1, 0} terminates the list; tw_instr
 * and the TW_* constants are declared outside this file -- confirm there.
 */
Chris@42 791 static const tw_instr twinstr[] = {
Chris@42 792 {TW_FULL, 0, 15},
Chris@42 793 {TW_NEXT, 1, 0}
Chris@42 794 };
Chris@42 795
/*
 * Descriptor passed to the planner together with t1_15 (non-FMA build).
 * The {128, 56, 56, 0} entry repeats the counts from the generated header
 * comment above: 128 additions, 56 multiplications, 56 fused multiply/adds.
 * NOTE(review): the meaning of the remaining fields (15, name string,
 * twinstr, &GENUS and the trailing zeros) comes from the ct_desc
 * declaration, which is not visible here.
 */
Chris@42 796 static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, {128, 56, 56, 0}, 0, 0, 0 };
Chris@42 797
/*
 * Public entry point (non-FMA build): registers the t1_15 kernel and its
 * descriptor with planner `p` by calling X(kdft_dit_register).
 * X() is a name-wrapping macro defined outside this file.
 */
Chris@42 798 void X(codelet_t1_15) (planner *p) {
Chris@42 799 X(kdft_dit_register) (p, t1_15, &desc);
Chris@42 800 }
Chris@42 801 #endif /* HAVE_FMA */