annotate src/fftw-3.3.3/dft/scalar/codelets/t1_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 37bf6b4a2645
children
rev   line source
Chris@10 1 /*
Chris@10 2 * Copyright (c) 2003, 2007-11 Matteo Frigo
Chris@10 3 * Copyright (c) 2003, 2007-11 Massachusetts Institute of Technology
Chris@10 4 *
Chris@10 5 * This program is free software; you can redistribute it and/or modify
Chris@10 6 * it under the terms of the GNU General Public License as published by
Chris@10 7 * the Free Software Foundation; either version 2 of the License, or
Chris@10 8 * (at your option) any later version.
Chris@10 9 *
Chris@10 10 * This program is distributed in the hope that it will be useful,
Chris@10 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@10 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@10 13 * GNU General Public License for more details.
Chris@10 14 *
Chris@10 15 * You should have received a copy of the GNU General Public License
Chris@10 16 * along with this program; if not, write to the Free Software
Chris@10 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@10 18 *
Chris@10 19 */
Chris@10 20
Chris@10 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@10 22 /* Generated on Sun Nov 25 07:35:51 EST 2012 */
Chris@10 23
Chris@10 24 #include "codelet-dft.h"
Chris@10 25
Chris@10 26 #ifdef HAVE_FMA
Chris@10 27
Chris@10 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include t.h */
Chris@10 29
Chris@10 30 /*
Chris@10 31 * This function contains 174 FP additions, 100 FP multiplications,
Chris@10 32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
Chris@10 33 * 97 stack variables, 3 constants, and 64 memory accesses
Chris@10 34 */
Chris@10 35 #include "t.h"
Chris@10 36
/*
 * t1_16 (HAVE_FMA variant): in-place 16-point twiddle codelet.
 *
 * For every m in [mb, me) the loop body reads the 16 complex values stored at
 * ri[0]/ii[0] and ri[WS(rs, k)]/ii[WS(rs, k)] for k = 1..15, combines input k
 * (k >= 1) with the twiddle pair (W[2k-2], W[2k-1]), merges the results
 * through sums and differences, and writes 16 results back to the same
 * ri/ii locations.
 *
 * Parameters:
 *   ri, ii - real / imaginary data, read and then overwritten; both pointers
 *            advance by ms after each iteration.
 *   W      - twiddle table; it is first offset by mb * 30 and then advances
 *            by 30 per iteration (W[0]..W[29] are read each time).
 *   rs     - stride handed to WS() to locate element k.
 *   mb, me - half-open iteration range.
 *   ms     - increment applied to ri and ii between iterations.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from the included headers and are not visible here.  The comments below
 * assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
Chris@10 37 static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 38 {
/* Numeric constants: the literals equal cos(pi/8), tan(pi/8) and sqrt(1/2). */
Chris@10 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@10 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 42 {
Chris@10 43 INT m;
Chris@10 44 for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* T3G and T3F live at loop scope because the final two stores (outputs 13
 * and 5 of ii) are issued after all inner scopes have closed. */
Chris@10 45 E T3G, T3F;
Chris@10 46 {
Chris@10 47 E T3z, T3o, T8, T1I, T2o, T35, T2r, T1s, T2w, T36, T2p, T1F, T3k, T1N, T3A;
Chris@10 48 E Tl, T1T, T2V, T1U, Tz, T29, T30, T2c, T11, TB, TE, T2h, T31, T2a, T1e;
Chris@10 49 E TC, T1X, TH, TK, TG, TD, TJ;
Chris@10 50 {
Chris@10 51 E Ta, Td, Tb, T1J, Tg, Tj, Tf, Tc, Ti;
Chris@10 52 {
Chris@10 53 E T1h, T1k, T1n, T2k, T1i, T1q, T1m, T1j, T1p;
Chris@10 54 {
Chris@10 55 E T1, T3n, T3, T6, T2, T5;
/* Input 0 is used untwiddled; input 8 is paired with W[14], W[15]. */
Chris@10 56 T1 = ri[0];
Chris@10 57 T3n = ii[0];
Chris@10 58 T3 = ri[WS(rs, 8)];
Chris@10 59 T6 = ii[WS(rs, 8)];
Chris@10 60 T2 = W[14];
Chris@10 61 T5 = W[15];
Chris@10 62 {
Chris@10 63 E T3l, T4, T1g, T3m, T7;
/* Loads for inputs 15 and 7 are interleaved with the arithmetic for input 8. */
Chris@10 64 T1h = ri[WS(rs, 15)];
Chris@10 65 T1k = ii[WS(rs, 15)];
Chris@10 66 T3l = T2 * T6;
Chris@10 67 T4 = T2 * T3;
Chris@10 68 T1g = W[28];
Chris@10 69 T1n = ri[WS(rs, 7)];
Chris@10 70 T3m = FNMS(T5, T3, T3l);
Chris@10 71 T7 = FMA(T5, T6, T4);
Chris@10 72 T2k = T1g * T1k;
Chris@10 73 T1i = T1g * T1h;
Chris@10 74 T3z = T3n - T3m;
Chris@10 75 T3o = T3m + T3n;
Chris@10 76 T8 = T1 + T7;
Chris@10 77 T1I = T1 - T7;
Chris@10 78 T1q = ii[WS(rs, 7)];
Chris@10 79 T1m = W[12];
Chris@10 80 }
Chris@10 81 T1j = W[29];
Chris@10 82 T1p = W[13];
Chris@10 83 }
Chris@10 84 {
Chris@10 85 E T1u, T1x, T1v, T2s, T1A, T1D, T1z, T1w, T1C;
Chris@10 86 {
Chris@10 87 E T2l, T1l, T2n, T1r, T2m, T1o, T1t;
/* Finish inputs 15 and 7, then start input 3 (W[4], W[5]). */
Chris@10 88 T1u = ri[WS(rs, 3)];
Chris@10 89 T2m = T1m * T1q;
Chris@10 90 T1o = T1m * T1n;
Chris@10 91 T2l = FNMS(T1j, T1h, T2k);
Chris@10 92 T1l = FMA(T1j, T1k, T1i);
Chris@10 93 T2n = FNMS(T1p, T1n, T2m);
Chris@10 94 T1r = FMA(T1p, T1q, T1o);
Chris@10 95 T1x = ii[WS(rs, 3)];
Chris@10 96 T1t = W[4];
Chris@10 97 T2o = T2l - T2n;
Chris@10 98 T35 = T2l + T2n;
Chris@10 99 T2r = T1l - T1r;
Chris@10 100 T1s = T1l + T1r;
Chris@10 101 T1v = T1t * T1u;
Chris@10 102 T2s = T1t * T1x;
Chris@10 103 }
Chris@10 104 T1A = ri[WS(rs, 11)];
Chris@10 105 T1D = ii[WS(rs, 11)];
Chris@10 106 T1z = W[20];
Chris@10 107 T1w = W[5];
Chris@10 108 T1C = W[21];
Chris@10 109 {
Chris@10 110 E T2t, T1y, T2v, T1E, T2u, T1B, T9;
/* Finish inputs 3 and 11, then start input 4 (W[6], W[7]). */
Chris@10 111 Ta = ri[WS(rs, 4)];
Chris@10 112 T2u = T1z * T1D;
Chris@10 113 T1B = T1z * T1A;
Chris@10 114 T2t = FNMS(T1w, T1u, T2s);
Chris@10 115 T1y = FMA(T1w, T1x, T1v);
Chris@10 116 T2v = FNMS(T1C, T1A, T2u);
Chris@10 117 T1E = FMA(T1C, T1D, T1B);
Chris@10 118 Td = ii[WS(rs, 4)];
Chris@10 119 T9 = W[6];
Chris@10 120 T2w = T2t - T2v;
Chris@10 121 T36 = T2t + T2v;
Chris@10 122 T2p = T1y - T1E;
Chris@10 123 T1F = T1y + T1E;
Chris@10 124 Tb = T9 * Ta;
Chris@10 125 T1J = T9 * Td;
Chris@10 126 }
Chris@10 127 Tg = ri[WS(rs, 12)];
Chris@10 128 Tj = ii[WS(rs, 12)];
Chris@10 129 Tf = W[22];
Chris@10 130 Tc = W[7];
Chris@10 131 Ti = W[23];
Chris@10 132 }
Chris@10 133 }
Chris@10 134 {
Chris@10 135 E TQ, TT, TR, T25, TW, TZ, TV, TS, TY;
Chris@10 136 {
Chris@10 137 E To, Tr, Tp, T1P, Tu, Tx, Tt, Tq, Tw;
Chris@10 138 {
Chris@10 139 E T1K, Te, T1M, Tk, T1L, Th, Tn;
/* Finish inputs 4 and 12, then start input 2 (W[2], W[3]). */
Chris@10 140 To = ri[WS(rs, 2)];
Chris@10 141 T1L = Tf * Tj;
Chris@10 142 Th = Tf * Tg;
Chris@10 143 T1K = FNMS(Tc, Ta, T1J);
Chris@10 144 Te = FMA(Tc, Td, Tb);
Chris@10 145 T1M = FNMS(Ti, Tg, T1L);
Chris@10 146 Tk = FMA(Ti, Tj, Th);
Chris@10 147 Tr = ii[WS(rs, 2)];
Chris@10 148 Tn = W[2];
Chris@10 149 T3k = T1K + T1M;
Chris@10 150 T1N = T1K - T1M;
Chris@10 151 T3A = Te - Tk;
Chris@10 152 Tl = Te + Tk;
Chris@10 153 Tp = Tn * To;
Chris@10 154 T1P = Tn * Tr;
Chris@10 155 }
Chris@10 156 Tu = ri[WS(rs, 10)];
Chris@10 157 Tx = ii[WS(rs, 10)];
Chris@10 158 Tt = W[18];
Chris@10 159 Tq = W[3];
Chris@10 160 Tw = W[19];
Chris@10 161 {
Chris@10 162 E T1Q, Ts, T1S, Ty, T1R, Tv, TP;
/* Finish inputs 2 and 10, then start input 1 (W[0], W[1]). */
Chris@10 163 TQ = ri[WS(rs, 1)];
Chris@10 164 T1R = Tt * Tx;
Chris@10 165 Tv = Tt * Tu;
Chris@10 166 T1Q = FNMS(Tq, To, T1P);
Chris@10 167 Ts = FMA(Tq, Tr, Tp);
Chris@10 168 T1S = FNMS(Tw, Tu, T1R);
Chris@10 169 Ty = FMA(Tw, Tx, Tv);
Chris@10 170 TT = ii[WS(rs, 1)];
Chris@10 171 TP = W[0];
Chris@10 172 T1T = T1Q - T1S;
Chris@10 173 T2V = T1Q + T1S;
Chris@10 174 T1U = Ts - Ty;
Chris@10 175 Tz = Ts + Ty;
Chris@10 176 TR = TP * TQ;
Chris@10 177 T25 = TP * TT;
Chris@10 178 }
Chris@10 179 TW = ri[WS(rs, 9)];
Chris@10 180 TZ = ii[WS(rs, 9)];
Chris@10 181 TV = W[16];
Chris@10 182 TS = W[1];
Chris@10 183 TY = W[17];
Chris@10 184 }
Chris@10 185 {
Chris@10 186 E T13, T16, T14, T2d, T19, T1c, T18, T15, T1b;
Chris@10 187 {
Chris@10 188 E T26, TU, T28, T10, T27, TX, T12;
/* Finish inputs 1 and 9, then start input 5 (W[8], W[9]). */
Chris@10 189 T13 = ri[WS(rs, 5)];
Chris@10 190 T27 = TV * TZ;
Chris@10 191 TX = TV * TW;
Chris@10 192 T26 = FNMS(TS, TQ, T25);
Chris@10 193 TU = FMA(TS, TT, TR);
Chris@10 194 T28 = FNMS(TY, TW, T27);
Chris@10 195 T10 = FMA(TY, TZ, TX);
Chris@10 196 T16 = ii[WS(rs, 5)];
Chris@10 197 T12 = W[8];
Chris@10 198 T29 = T26 - T28;
Chris@10 199 T30 = T26 + T28;
Chris@10 200 T2c = TU - T10;
Chris@10 201 T11 = TU + T10;
Chris@10 202 T14 = T12 * T13;
Chris@10 203 T2d = T12 * T16;
Chris@10 204 }
Chris@10 205 T19 = ri[WS(rs, 13)];
Chris@10 206 T1c = ii[WS(rs, 13)];
Chris@10 207 T18 = W[24];
Chris@10 208 T15 = W[9];
Chris@10 209 T1b = W[25];
Chris@10 210 {
Chris@10 211 E T2e, T17, T2g, T1d, T2f, T1a, TA;
/* Finish inputs 5 and 13, then start input 14 (W[26], W[27]). */
Chris@10 212 TB = ri[WS(rs, 14)];
Chris@10 213 T2f = T18 * T1c;
Chris@10 214 T1a = T18 * T19;
Chris@10 215 T2e = FNMS(T15, T13, T2d);
Chris@10 216 T17 = FMA(T15, T16, T14);
Chris@10 217 T2g = FNMS(T1b, T19, T2f);
Chris@10 218 T1d = FMA(T1b, T1c, T1a);
Chris@10 219 TE = ii[WS(rs, 14)];
Chris@10 220 TA = W[26];
Chris@10 221 T2h = T2e - T2g;
Chris@10 222 T31 = T2e + T2g;
Chris@10 223 T2a = T17 - T1d;
Chris@10 224 T1e = T17 + T1d;
Chris@10 225 TC = TA * TB;
Chris@10 226 T1X = TA * TE;
Chris@10 227 }
Chris@10 228 TH = ri[WS(rs, 6)];
Chris@10 229 TK = ii[WS(rs, 6)];
Chris@10 230 TG = W[10];
Chris@10 231 TD = W[27];
Chris@10 232 TJ = W[11];
Chris@10 233 }
Chris@10 234 }
Chris@10 235 }
/* All 16 inputs have been loaded by this point; the remaining code finishes
 * inputs 14 and 6 and then only combines temporaries and stores outputs. */
Chris@10 236 {
Chris@10 237 E T2U, T3u, T2Z, T21, T1W, T34, T2X, T3f, T32, T3t, T1H, T3q, T3e, TO, T3g;
Chris@10 238 E T37, T3r, T3s, T3h, T3i;
Chris@10 239 {
Chris@10 240 E Tm, T1Y, TF, T20, TL, T3p, T1Z, TI;
Chris@10 241 T2U = T8 - Tl;
Chris@10 242 Tm = T8 + Tl;
Chris@10 243 T1Z = TG * TK;
Chris@10 244 TI = TG * TH;
Chris@10 245 T1Y = FNMS(TD, TB, T1X);
Chris@10 246 TF = FMA(TD, TE, TC);
Chris@10 247 T20 = FNMS(TJ, TH, T1Z);
Chris@10 248 TL = FMA(TJ, TK, TI);
Chris@10 249 T3p = T3k + T3o;
Chris@10 250 T3u = T3o - T3k;
Chris@10 251 {
Chris@10 252 E T1f, TM, T1G, T3j, T2W, TN;
Chris@10 253 T2Z = T11 - T1e;
Chris@10 254 T1f = T11 + T1e;
Chris@10 255 T21 = T1Y - T20;
Chris@10 256 T2W = T1Y + T20;
Chris@10 257 T1W = TF - TL;
Chris@10 258 TM = TF + TL;
Chris@10 259 T1G = T1s + T1F;
Chris@10 260 T34 = T1s - T1F;
Chris@10 261 T2X = T2V - T2W;
Chris@10 262 T3j = T2V + T2W;
Chris@10 263 T3f = T30 + T31;
Chris@10 264 T32 = T30 - T31;
Chris@10 265 T3t = TM - Tz;
Chris@10 266 TN = Tz + TM;
Chris@10 267 T3r = T1G - T1f;
Chris@10 268 T1H = T1f + T1G;
Chris@10 269 T3s = T3p - T3j;
Chris@10 270 T3q = T3j + T3p;
Chris@10 271 T3e = Tm - TN;
Chris@10 272 TO = Tm + TN;
Chris@10 273 T3g = T35 + T36;
Chris@10 274 T37 = T35 - T36;
Chris@10 275 }
Chris@10 276 }
/* Outputs 0, 4, 8 and 12 need no constant multiplications. */
Chris@10 277 ii[WS(rs, 12)] = T3s - T3r;
Chris@10 278 ii[WS(rs, 4)] = T3r + T3s;
Chris@10 279 ri[0] = TO + T1H;
Chris@10 280 ri[WS(rs, 8)] = TO - T1H;
Chris@10 281 T3h = T3f - T3g;
Chris@10 282 T3i = T3f + T3g;
Chris@10 283 {
Chris@10 284 E T3a, T2Y, T3x, T3v, T3b, T33;
Chris@10 285 ii[0] = T3i + T3q;
Chris@10 286 ii[WS(rs, 8)] = T3q - T3i;
Chris@10 287 ri[WS(rs, 4)] = T3e + T3h;
Chris@10 288 ri[WS(rs, 12)] = T3e - T3h;
Chris@10 289 T3a = T2U - T2X;
Chris@10 290 T2Y = T2U + T2X;
Chris@10 291 T3x = T3u - T3t;
Chris@10 292 T3v = T3t + T3u;
Chris@10 293 T3b = T32 - T2Z;
Chris@10 294 T33 = T2Z + T32;
Chris@10 295 {
Chris@10 296 E T2E, T1O, T3B, T3H, T2x, T2q, T3C, T23, T2S, T2O, T2K, T2J, T3I, T2H, T2B;
Chris@10 297 E T2j;
Chris@10 298 {
Chris@10 299 E T2F, T1V, T22, T2G, T3c, T38;
Chris@10 300 T2E = T1I + T1N;
Chris@10 301 T1O = T1I - T1N;
Chris@10 302 T3B = T3z - T3A;
Chris@10 303 T3H = T3A + T3z;
Chris@10 304 T3c = T34 + T37;
Chris@10 305 T38 = T34 - T37;
Chris@10 306 T2F = T1U + T1T;
Chris@10 307 T1V = T1T - T1U;
Chris@10 308 {
Chris@10 309 E T3d, T3w, T3y, T39;
Chris@10 310 T3d = T3b - T3c;
Chris@10 311 T3w = T3b + T3c;
Chris@10 312 T3y = T38 - T33;
Chris@10 313 T39 = T33 + T38;
/* Outputs 2, 6, 10 and 14 each use one multiply by KP707106781. */
Chris@10 314 ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
Chris@10 315 ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
Chris@10 316 ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
Chris@10 317 ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
Chris@10 318 ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
Chris@10 319 ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
Chris@10 320 ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
Chris@10 321 ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
Chris@10 322 T22 = T1W + T21;
Chris@10 323 T2G = T1W - T21;
Chris@10 324 }
Chris@10 325 {
Chris@10 326 E T2M, T2N, T2b, T2i;
Chris@10 327 T2x = T2r - T2w;
Chris@10 328 T2M = T2r + T2w;
Chris@10 329 T2N = T2o - T2p;
Chris@10 330 T2q = T2o + T2p;
Chris@10 331 T3C = T1V + T22;
Chris@10 332 T23 = T1V - T22;
Chris@10 333 T2S = FMA(KP414213562, T2M, T2N);
Chris@10 334 T2O = FNMS(KP414213562, T2N, T2M);
Chris@10 335 T2K = T29 - T2a;
Chris@10 336 T2b = T29 + T2a;
Chris@10 337 T2i = T2c - T2h;
Chris@10 338 T2J = T2c + T2h;
Chris@10 339 T3I = T2G - T2F;
Chris@10 340 T2H = T2F + T2G;
Chris@10 341 T2B = FNMS(KP414213562, T2b, T2i);
Chris@10 342 T2j = FMA(KP414213562, T2i, T2b);
Chris@10 343 }
Chris@10 344 }
Chris@10 345 {
Chris@10 346 E T2R, T2L, T3L, T3M;
Chris@10 347 {
Chris@10 348 E T2A, T24, T2C, T2y, T3J, T3K, T2D, T2z;
Chris@10 349 T2A = FNMS(KP707106781, T23, T1O);
Chris@10 350 T24 = FMA(KP707106781, T23, T1O);
Chris@10 351 T2R = FNMS(KP414213562, T2J, T2K);
Chris@10 352 T2L = FMA(KP414213562, T2K, T2J);
Chris@10 353 T2C = FMA(KP414213562, T2q, T2x);
Chris@10 354 T2y = FNMS(KP414213562, T2x, T2q);
Chris@10 355 T3J = FMA(KP707106781, T3I, T3H);
Chris@10 356 T3L = FNMS(KP707106781, T3I, T3H);
Chris@10 357 T3K = T2C - T2B;
Chris@10 358 T2D = T2B + T2C;
Chris@10 359 T3M = T2j + T2y;
Chris@10 360 T2z = T2j - T2y;
/* Odd-indexed outputs are scaled by KP923879532 in their final step. */
Chris@10 361 ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
Chris@10 362 ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
Chris@10 363 ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
Chris@10 364 ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
Chris@10 365 ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
Chris@10 366 ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
Chris@10 367 }
Chris@10 368 {
Chris@10 369 E T2Q, T3D, T3E, T2T, T2I, T2P;
Chris@10 370 T2Q = FNMS(KP707106781, T2H, T2E);
Chris@10 371 T2I = FMA(KP707106781, T2H, T2E);
Chris@10 372 T2P = T2L + T2O;
Chris@10 373 T3G = T2O - T2L;
Chris@10 374 T3F = FNMS(KP707106781, T3C, T3B);
Chris@10 375 T3D = FMA(KP707106781, T3C, T3B);
Chris@10 376 ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
Chris@10 377 ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
Chris@10 378 ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
Chris@10 379 ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
Chris@10 380 T3E = T2R + T2S;
Chris@10 381 T2T = T2R - T2S;
Chris@10 382 ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
Chris@10 383 ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
Chris@10 384 ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
Chris@10 385 ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
Chris@10 386 }
Chris@10 387 }
Chris@10 388 }
Chris@10 389 }
Chris@10 390 }
Chris@10 391 }
/* Last two stores, using the loop-scope temporaries T3G and T3F. */
Chris@10 392 ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
Chris@10 393 ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
Chris@10 394 }
Chris@10 395 }
Chris@10 396 }
Chris@10 397
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared in the project
 * headers; presumably the first entry requests the full twiddle set for
 * size 16 and the second terminates the list -- confirm.
 */
Chris@10 398 static const tw_instr twinstr[] = {
Chris@10 399 {TW_FULL, 0, 16},
Chris@10 400 {TW_NEXT, 1, 0}
Chris@10 401 };
Chris@10 402
/*
 * Codelet descriptor for the FMA variant: 16 (presumably the radix), the
 * name "t1_16", the twiddle instruction table, &GENUS, and the counts
 * {104, 30, 70, 0}, which match the additions / multiplications / fused
 * multiply-adds quoted in the generator's header comment for this variant.
 * NOTE(review): the meaning of the trailing zero fields is fixed by the
 * ct_desc declaration, which is not visible here.
 */
Chris@10 403 static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
Chris@10 404
/*
 * Registration entry point (FMA build): passes the t1_16 kernel and its
 * descriptor to the planner p through X(kdft_dit_register).
 */
Chris@10 405 void X(codelet_t1_16) (planner *p) {
Chris@10 406 X(kdft_dit_register) (p, t1_16, &desc);
Chris@10 407 }
Chris@10 408 #else /* HAVE_FMA */
Chris@10 409
Chris@10 410 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include t.h */
Chris@10 411
Chris@10 412 /*
Chris@10 413 * This function contains 174 FP additions, 84 FP multiplications,
Chris@10 414 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
Chris@10 415 * 52 stack variables, 3 constants, and 64 memory accesses
Chris@10 416 */
Chris@10 417 #include "t.h"
Chris@10 418
/*
 * t1_16 (non-FMA variant): in-place 16-point twiddle codelet.
 *
 * For every m in [mb, me) the loop body reads the 16 complex values stored at
 * ri[0]/ii[0] and ri[WS(rs, k)]/ii[WS(rs, k)] for k = 1..15, combines input k
 * (k >= 1) with the twiddle pair (W[2k-2], W[2k-1]), merges the results
 * through sums and differences, and writes 16 results back to the same
 * ri/ii locations.
 *
 * Parameters:
 *   ri, ii - real / imaginary data, read and then overwritten; both pointers
 *            advance by ms after each iteration.
 *   W      - twiddle table; it is first offset by mb * 30 and then advances
 *            by 30 per iteration (W[0]..W[29] are read each time).
 *   rs     - stride handed to WS() to locate element k.
 *   mb, me - half-open iteration range.
 *   ms     - increment applied to ri and ii between iterations.
 *
 * NOTE(review): E, R, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from the included headers and are not visible here.  The comments below
 * assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
Chris@10 419 static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@10 420 {
/* Numeric constants: the literals equal sin(pi/8), cos(pi/8) and sqrt(1/2). */
Chris@10 421 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@10 422 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@10 423 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@10 424 {
Chris@10 425 INT m;
Chris@10 426 for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
Chris@10 427 E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
Chris@10 428 E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
Chris@10 429 E T2y, T2z, T1O, T2g, T1T, T2h;
/* Inputs 0 and 8: input 0 is used as-is, input 8 is combined with
 * W[14], W[15]; their sums and differences are kept. */
Chris@10 430 {
Chris@10 431 E T1, T2T, T6, T2S;
Chris@10 432 T1 = ri[0];
Chris@10 433 T2T = ii[0];
Chris@10 434 {
Chris@10 435 E T3, T5, T2, T4;
Chris@10 436 T3 = ri[WS(rs, 8)];
Chris@10 437 T5 = ii[WS(rs, 8)];
Chris@10 438 T2 = W[14];
Chris@10 439 T4 = W[15];
Chris@10 440 T6 = FMA(T2, T3, T4 * T5);
Chris@10 441 T2S = FNMS(T4, T3, T2 * T5);
Chris@10 442 }
Chris@10 443 T7 = T1 + T6;
Chris@10 444 T37 = T2T - T2S;
Chris@10 445 T1t = T1 - T6;
Chris@10 446 T2U = T2S + T2T;
Chris@10 447 }
/* Inputs 4 (W[6], W[7]) and 12 (W[22], W[23]). */
Chris@10 448 {
Chris@10 449 E Tc, T1u, Th, T1v;
Chris@10 450 {
Chris@10 451 E T9, Tb, T8, Ta;
Chris@10 452 T9 = ri[WS(rs, 4)];
Chris@10 453 Tb = ii[WS(rs, 4)];
Chris@10 454 T8 = W[6];
Chris@10 455 Ta = W[7];
Chris@10 456 Tc = FMA(T8, T9, Ta * Tb);
Chris@10 457 T1u = FNMS(Ta, T9, T8 * Tb);
Chris@10 458 }
Chris@10 459 {
Chris@10 460 E Te, Tg, Td, Tf;
Chris@10 461 Te = ri[WS(rs, 12)];
Chris@10 462 Tg = ii[WS(rs, 12)];
Chris@10 463 Td = W[22];
Chris@10 464 Tf = W[23];
Chris@10 465 Th = FMA(Td, Te, Tf * Tg);
Chris@10 466 T1v = FNMS(Tf, Te, Td * Tg);
Chris@10 467 }
Chris@10 468 Ti = Tc + Th;
Chris@10 469 T38 = Tc - Th;
Chris@10 470 T1w = T1u - T1v;
Chris@10 471 T2R = T1u + T1v;
Chris@10 472 }
/* Inputs 2 (W[2], W[3]) and 10 (W[18], W[19]). */
Chris@10 473 {
Chris@10 474 E To, T1y, Tt, T1z, T1A, T1B;
Chris@10 475 {
Chris@10 476 E Tl, Tn, Tk, Tm;
Chris@10 477 Tl = ri[WS(rs, 2)];
Chris@10 478 Tn = ii[WS(rs, 2)];
Chris@10 479 Tk = W[2];
Chris@10 480 Tm = W[3];
Chris@10 481 To = FMA(Tk, Tl, Tm * Tn);
Chris@10 482 T1y = FNMS(Tm, Tl, Tk * Tn);
Chris@10 483 }
Chris@10 484 {
Chris@10 485 E Tq, Ts, Tp, Tr;
Chris@10 486 Tq = ri[WS(rs, 10)];
Chris@10 487 Ts = ii[WS(rs, 10)];
Chris@10 488 Tp = W[18];
Chris@10 489 Tr = W[19];
Chris@10 490 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@10 491 T1z = FNMS(Tr, Tq, Tp * Ts);
Chris@10 492 }
Chris@10 493 Tu = To + Tt;
Chris@10 494 T2s = T1y + T1z;
Chris@10 495 T1A = T1y - T1z;
Chris@10 496 T1B = To - Tt;
Chris@10 497 T1C = T1A - T1B;
Chris@10 498 T2c = T1B + T1A;
Chris@10 499 }
/* Inputs 14 (W[26], W[27]) and 6 (W[10], W[11]). */
Chris@10 500 {
Chris@10 501 E Tz, T1E, TE, T1F, T1D, T1G;
Chris@10 502 {
Chris@10 503 E Tw, Ty, Tv, Tx;
Chris@10 504 Tw = ri[WS(rs, 14)];
Chris@10 505 Ty = ii[WS(rs, 14)];
Chris@10 506 Tv = W[26];
Chris@10 507 Tx = W[27];
Chris@10 508 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@10 509 T1E = FNMS(Tx, Tw, Tv * Ty);
Chris@10 510 }
Chris@10 511 {
Chris@10 512 E TB, TD, TA, TC;
Chris@10 513 TB = ri[WS(rs, 6)];
Chris@10 514 TD = ii[WS(rs, 6)];
Chris@10 515 TA = W[10];
Chris@10 516 TC = W[11];
Chris@10 517 TE = FMA(TA, TB, TC * TD);
Chris@10 518 T1F = FNMS(TC, TB, TA * TD);
Chris@10 519 }
Chris@10 520 TF = Tz + TE;
Chris@10 521 T2t = T1E + T1F;
Chris@10 522 T1D = Tz - TE;
Chris@10 523 T1G = T1E - T1F;
Chris@10 524 T1H = T1D + T1G;
Chris@10 525 T2d = T1D - T1G;
Chris@10 526 }
/* Inputs 15, 11, 7 and 3, each combined with its twiddle pair, then merged
 * into the sums/differences consumed by the output stages. */
Chris@10 527 {
Chris@10 528 E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
Chris@10 529 {
Chris@10 530 E T16, T18, T15, T17;
Chris@10 531 T16 = ri[WS(rs, 15)];
Chris@10 532 T18 = ii[WS(rs, 15)];
Chris@10 533 T15 = W[28];
Chris@10 534 T17 = W[29];
Chris@10 535 T19 = FMA(T15, T16, T17 * T18);
Chris@10 536 T20 = FNMS(T17, T16, T15 * T18);
Chris@10 537 }
Chris@10 538 {
Chris@10 539 E T1m, T1o, T1l, T1n;
Chris@10 540 T1m = ri[WS(rs, 11)];
Chris@10 541 T1o = ii[WS(rs, 11)];
Chris@10 542 T1l = W[20];
Chris@10 543 T1n = W[21];
Chris@10 544 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@10 545 T1X = FNMS(T1n, T1m, T1l * T1o);
Chris@10 546 }
Chris@10 547 {
Chris@10 548 E T1b, T1d, T1a, T1c;
Chris@10 549 T1b = ri[WS(rs, 7)];
Chris@10 550 T1d = ii[WS(rs, 7)];
Chris@10 551 T1a = W[12];
Chris@10 552 T1c = W[13];
Chris@10 553 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@10 554 T21 = FNMS(T1c, T1b, T1a * T1d);
Chris@10 555 }
Chris@10 556 {
Chris@10 557 E T1h, T1j, T1g, T1i;
Chris@10 558 T1h = ri[WS(rs, 3)];
Chris@10 559 T1j = ii[WS(rs, 3)];
Chris@10 560 T1g = W[4];
Chris@10 561 T1i = W[5];
Chris@10 562 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@10 563 T1W = FNMS(T1i, T1h, T1g * T1j);
Chris@10 564 }
Chris@10 565 T1f = T19 + T1e;
Chris@10 566 T1q = T1k + T1p;
Chris@10 567 T2B = T1f - T1q;
Chris@10 568 T2C = T20 + T21;
Chris@10 569 T2D = T1W + T1X;
Chris@10 570 T2E = T2C - T2D;
Chris@10 571 {
Chris@10 572 E T1V, T1Y, T22, T23;
Chris@10 573 T1V = T19 - T1e;
Chris@10 574 T1Y = T1W - T1X;
Chris@10 575 T1Z = T1V - T1Y;
Chris@10 576 T2j = T1V + T1Y;
Chris@10 577 T22 = T20 - T21;
Chris@10 578 T23 = T1k - T1p;
Chris@10 579 T24 = T22 + T23;
Chris@10 580 T2k = T22 - T23;
Chris@10 581 }
Chris@10 582 }
/* Inputs 1, 13, 9 and 5, handled the same way as the previous group. */
Chris@10 583 {
Chris@10 584 E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
Chris@10 585 {
Chris@10 586 E TJ, TL, TI, TK;
Chris@10 587 TJ = ri[WS(rs, 1)];
Chris@10 588 TL = ii[WS(rs, 1)];
Chris@10 589 TI = W[0];
Chris@10 590 TK = W[1];
Chris@10 591 TM = FMA(TI, TJ, TK * TL);
Chris@10 592 T1K = FNMS(TK, TJ, TI * TL);
Chris@10 593 }
Chris@10 594 {
Chris@10 595 E TZ, T11, TY, T10;
Chris@10 596 TZ = ri[WS(rs, 13)];
Chris@10 597 T11 = ii[WS(rs, 13)];
Chris@10 598 TY = W[24];
Chris@10 599 T10 = W[25];
Chris@10 600 T12 = FMA(TY, TZ, T10 * T11);
Chris@10 601 T1R = FNMS(T10, TZ, TY * T11);
Chris@10 602 }
Chris@10 603 {
Chris@10 604 E TO, TQ, TN, TP;
Chris@10 605 TO = ri[WS(rs, 9)];
Chris@10 606 TQ = ii[WS(rs, 9)];
Chris@10 607 TN = W[16];
Chris@10 608 TP = W[17];
Chris@10 609 TR = FMA(TN, TO, TP * TQ);
Chris@10 610 T1L = FNMS(TP, TO, TN * TQ);
Chris@10 611 }
Chris@10 612 {
Chris@10 613 E TU, TW, TT, TV;
Chris@10 614 TU = ri[WS(rs, 5)];
Chris@10 615 TW = ii[WS(rs, 5)];
Chris@10 616 TT = W[8];
Chris@10 617 TV = W[9];
Chris@10 618 TX = FMA(TT, TU, TV * TW);
Chris@10 619 T1Q = FNMS(TV, TU, TT * TW);
Chris@10 620 }
Chris@10 621 TS = TM + TR;
Chris@10 622 T13 = TX + T12;
Chris@10 623 T2w = TS - T13;
Chris@10 624 T2x = T1K + T1L;
Chris@10 625 T2y = T1Q + T1R;
Chris@10 626 T2z = T2x - T2y;
Chris@10 627 {
Chris@10 628 E T1M, T1N, T1P, T1S;
Chris@10 629 T1M = T1K - T1L;
Chris@10 630 T1N = TX - T12;
Chris@10 631 T1O = T1M + T1N;
Chris@10 632 T2g = T1M - T1N;
Chris@10 633 T1P = TM - TR;
Chris@10 634 T1S = T1Q - T1R;
Chris@10 635 T1T = T1P - T1S;
Chris@10 636 T2h = T1P + T1S;
Chris@10 637 }
Chris@10 638 }
/* Output stage for elements 3, 7, 11 and 15. */
Chris@10 639 {
Chris@10 640 E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
Chris@10 641 {
Chris@10 642 E T1x, T1I, T3e, T3f;
Chris@10 643 T1x = T1t - T1w;
Chris@10 644 T1I = KP707106781 * (T1C - T1H);
Chris@10 645 T1J = T1x + T1I;
Chris@10 646 T27 = T1x - T1I;
Chris@10 647 T3e = KP707106781 * (T2d - T2c);
Chris@10 648 T3f = T38 + T37;
Chris@10 649 T3g = T3e + T3f;
Chris@10 650 T3i = T3f - T3e;
Chris@10 651 }
Chris@10 652 {
Chris@10 653 E T1U, T25, T28, T29;
Chris@10 654 T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
Chris@10 655 T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
Chris@10 656 T26 = T1U + T25;
Chris@10 657 T3h = T25 - T1U;
Chris@10 658 T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
Chris@10 659 T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
Chris@10 660 T2a = T28 - T29;
Chris@10 661 T3d = T28 + T29;
Chris@10 662 }
Chris@10 663 ri[WS(rs, 11)] = T1J - T26;
Chris@10 664 ii[WS(rs, 11)] = T3g - T3d;
Chris@10 665 ri[WS(rs, 3)] = T1J + T26;
Chris@10 666 ii[WS(rs, 3)] = T3d + T3g;
Chris@10 667 ri[WS(rs, 15)] = T27 - T2a;
Chris@10 668 ii[WS(rs, 15)] = T3i - T3h;
Chris@10 669 ri[WS(rs, 7)] = T27 + T2a;
Chris@10 670 ii[WS(rs, 7)] = T3h + T3i;
Chris@10 671 }
/* Output stage for elements 2, 6, 10 and 14. */
Chris@10 672 {
Chris@10 673 E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
Chris@10 674 {
Chris@10 675 E T2r, T2u, T30, T31;
Chris@10 676 T2r = T7 - Ti;
Chris@10 677 T2u = T2s - T2t;
Chris@10 678 T2v = T2r + T2u;
Chris@10 679 T2H = T2r - T2u;
Chris@10 680 T30 = TF - Tu;
Chris@10 681 T31 = T2U - T2R;
Chris@10 682 T32 = T30 + T31;
Chris@10 683 T34 = T31 - T30;
Chris@10 684 }
Chris@10 685 {
Chris@10 686 E T2A, T2F, T2I, T2J;
Chris@10 687 T2A = T2w + T2z;
Chris@10 688 T2F = T2B - T2E;
Chris@10 689 T2G = KP707106781 * (T2A + T2F);
Chris@10 690 T33 = KP707106781 * (T2F - T2A);
Chris@10 691 T2I = T2z - T2w;
Chris@10 692 T2J = T2B + T2E;
Chris@10 693 T2K = KP707106781 * (T2I - T2J);
Chris@10 694 T2Z = KP707106781 * (T2I + T2J);
Chris@10 695 }
Chris@10 696 ri[WS(rs, 10)] = T2v - T2G;
Chris@10 697 ii[WS(rs, 10)] = T32 - T2Z;
Chris@10 698 ri[WS(rs, 2)] = T2v + T2G;
Chris@10 699 ii[WS(rs, 2)] = T2Z + T32;
Chris@10 700 ri[WS(rs, 14)] = T2H - T2K;
Chris@10 701 ii[WS(rs, 14)] = T34 - T33;
Chris@10 702 ri[WS(rs, 6)] = T2H + T2K;
Chris@10 703 ii[WS(rs, 6)] = T33 + T34;
Chris@10 704 }
/* Output stage for elements 1, 5, 9 and 13. */
Chris@10 705 {
Chris@10 706 E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
Chris@10 707 {
Chris@10 708 E T2b, T2e, T36, T39;
Chris@10 709 T2b = T1t + T1w;
Chris@10 710 T2e = KP707106781 * (T2c + T2d);
Chris@10 711 T2f = T2b + T2e;
Chris@10 712 T2n = T2b - T2e;
Chris@10 713 T36 = KP707106781 * (T1C + T1H);
Chris@10 714 T39 = T37 - T38;
Chris@10 715 T3a = T36 + T39;
Chris@10 716 T3c = T39 - T36;
Chris@10 717 }
Chris@10 718 {
Chris@10 719 E T2i, T2l, T2o, T2p;
Chris@10 720 T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
Chris@10 721 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
Chris@10 722 T2m = T2i + T2l;
Chris@10 723 T3b = T2l - T2i;
Chris@10 724 T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
Chris@10 725 T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
Chris@10 726 T2q = T2o - T2p;
Chris@10 727 T35 = T2o + T2p;
Chris@10 728 }
Chris@10 729 ri[WS(rs, 9)] = T2f - T2m;
Chris@10 730 ii[WS(rs, 9)] = T3a - T35;
Chris@10 731 ri[WS(rs, 1)] = T2f + T2m;
Chris@10 732 ii[WS(rs, 1)] = T35 + T3a;
Chris@10 733 ri[WS(rs, 13)] = T2n - T2q;
Chris@10 734 ii[WS(rs, 13)] = T3c - T3b;
Chris@10 735 ri[WS(rs, 5)] = T2n + T2q;
Chris@10 736 ii[WS(rs, 5)] = T3b + T3c;
Chris@10 737 }
/* Output stage for elements 0, 4, 8 and 12 (additions/subtractions only). */
Chris@10 738 {
Chris@10 739 E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
Chris@10 740 {
Chris@10 741 E Tj, TG, T2Q, T2V;
Chris@10 742 Tj = T7 + Ti;
Chris@10 743 TG = Tu + TF;
Chris@10 744 TH = Tj + TG;
Chris@10 745 T2L = Tj - TG;
Chris@10 746 T2Q = T2s + T2t;
Chris@10 747 T2V = T2R + T2U;
Chris@10 748 T2W = T2Q + T2V;
Chris@10 749 T2Y = T2V - T2Q;
Chris@10 750 }
Chris@10 751 {
Chris@10 752 E T14, T1r, T2M, T2N;
Chris@10 753 T14 = TS + T13;
Chris@10 754 T1r = T1f + T1q;
Chris@10 755 T1s = T14 + T1r;
Chris@10 756 T2X = T1r - T14;
Chris@10 757 T2M = T2x + T2y;
Chris@10 758 T2N = T2C + T2D;
Chris@10 759 T2O = T2M - T2N;
Chris@10 760 T2P = T2M + T2N;
Chris@10 761 }
Chris@10 762 ri[WS(rs, 8)] = TH - T1s;
Chris@10 763 ii[WS(rs, 8)] = T2W - T2P;
Chris@10 764 ri[0] = TH + T1s;
Chris@10 765 ii[0] = T2P + T2W;
Chris@10 766 ri[WS(rs, 12)] = T2L - T2O;
Chris@10 767 ii[WS(rs, 12)] = T2Y - T2X;
Chris@10 768 ri[WS(rs, 4)] = T2L + T2O;
Chris@10 769 ii[WS(rs, 4)] = T2X + T2Y;
Chris@10 770 }
Chris@10 771 }
Chris@10 772 }
Chris@10 773 }
Chris@10 774
/*
 * Twiddle instruction table referenced by the codelet descriptor below.
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared in the project
 * headers; presumably the first entry requests the full twiddle set for
 * size 16 and the second terminates the list -- confirm.
 */
Chris@10 775 static const tw_instr twinstr[] = {
Chris@10 776 {TW_FULL, 0, 16},
Chris@10 777 {TW_NEXT, 1, 0}
Chris@10 778 };
Chris@10 779
/*
 * Codelet descriptor for the non-FMA variant: 16 (presumably the radix), the
 * name "t1_16", the twiddle instruction table, &GENUS, and the counts
 * {136, 46, 38, 0}, which match the additions / multiplications / fused
 * multiply-adds quoted in the generator's header comment for this variant.
 * NOTE(review): the meaning of the trailing zero fields is fixed by the
 * ct_desc declaration, which is not visible here.
 */
Chris@10 780 static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
Chris@10 781
/*
 * Registration entry point (non-FMA build): passes the t1_16 kernel and its
 * descriptor to the planner p through X(kdft_dit_register).
 */
Chris@10 782 void X(codelet_t1_16) (planner *p) {
Chris@10 783 X(kdft_dit_register) (p, t1_16, &desc);
Chris@10 784 }
Chris@10 785 #endif /* HAVE_FMA */