annotate src/fftw-3.3.8/dft/scalar/codelets/t1_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:04:15 EDT 2018 */
Chris@82 23
Chris@82 24 #include "dft/codelet-dft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 174 FP additions, 100 FP multiplications,
Chris@82 32 * (or, 104 additions, 30 multiplications, 70 fused multiply/add),
Chris@82 33 * 60 stack variables, 3 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "dft/scalar/t.h"
Chris@82 36
/*
 * t1_16, FMA variant (the one compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the loop reads the 16 complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..15, scales inputs k = 1..15 by the
 * twiddle pair (W[2k-2], W[2k-1]), combines them through a fixed network of
 * sums and differences, and writes 16 results back to the same ri/ii slots.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays; read and overwritten in place, and
 *           advanced by ms after each iteration.
 *   W       twiddle table; the loop starts at W + mb*30 and consumes 30
 *           reals (15 pairs) per iteration.
 *   rs      stride token handed to WS() to address element k.
 *   mb, me  half-open range of iterations.
 *   ms      pointer increment applied to ri and ii between iterations.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from project headers that are not visible here.  The comments below
 * assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 * Presumably this is one pass of a size-16 DFT (it is registered through
 * kdft_dit_register) -- confirm against the FFTW sources.
 */
Chris@82 37 static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Constants: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and sqrt(1/2). */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 42 {
Chris@82 43 INT m;
/* One pass per m: W skips mb*30 reals up front, then 30 per iteration;
   ri and ii move by ms.  MAKE_VOLATILE_STRIDE is an opaque project macro
   -- TODO confirm its purpose. */
Chris@82 44 for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values that outlive the load groups below and feed the output stages. */
Chris@82 45 E T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N;
Chris@82 46 E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
Chris@82 47 E T1W, T21;
/* Inputs 0 and 8.  Input 0 is used as is; input 8 is scaled by
   (W[14], W[15]): T7 = W[14]*re + W[15]*im, T3m = W[14]*im - W[15]*re.
   Keeps the sum and difference of the pair (real: T8, T1I; imag: T3o, T3z). */
Chris@82 48 {
Chris@82 49 E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
Chris@82 50 T1 = ri[0];
Chris@82 51 T3n = ii[0];
Chris@82 52 T3 = ri[WS(rs, 8)];
Chris@82 53 T6 = ii[WS(rs, 8)];
Chris@82 54 T2 = W[14];
Chris@82 55 T4 = T2 * T3;
Chris@82 56 T3l = T2 * T6;
Chris@82 57 T5 = W[15];
Chris@82 58 T7 = FMA(T5, T6, T4);
Chris@82 59 T3m = FNMS(T5, T3, T3l);
Chris@82 60 T8 = T1 + T7;
Chris@82 61 T3z = T3n - T3m;
Chris@82 62 T1I = T1 - T7;
Chris@82 63 T3o = T3m + T3n;
Chris@82 64 }
/* Inputs 15 and 7, scaled by (W[28], W[29]) and (W[12], W[13]). */
Chris@82 65 {
Chris@82 66 E T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m;
Chris@82 67 T1h = ri[WS(rs, 15)];
Chris@82 68 T1k = ii[WS(rs, 15)];
Chris@82 69 T1g = W[28];
Chris@82 70 T1i = T1g * T1h;
Chris@82 71 T2k = T1g * T1k;
Chris@82 72 T1n = ri[WS(rs, 7)];
Chris@82 73 T1q = ii[WS(rs, 7)];
Chris@82 74 T1m = W[12];
Chris@82 75 T1o = T1m * T1n;
Chris@82 76 T2m = T1m * T1q;
Chris@82 77 {
Chris@82 78 E T1l, T2l, T1r, T2n, T1j, T1p;
Chris@82 79 T1j = W[29];
Chris@82 80 T1l = FMA(T1j, T1k, T1i);
Chris@82 81 T2l = FNMS(T1j, T1h, T2k);
Chris@82 82 T1p = W[13];
Chris@82 83 T1r = FMA(T1p, T1q, T1o);
Chris@82 84 T2n = FNMS(T1p, T1n, T2m);
Chris@82 85 T1s = T1l + T1r;
Chris@82 86 T35 = T2l + T2n;
Chris@82 87 T2o = T2l - T2n;
Chris@82 88 T2r = T1l - T1r;
Chris@82 89 }
Chris@82 90 }
/* Inputs 3 and 11, scaled by (W[4], W[5]) and (W[20], W[21]). */
Chris@82 91 {
Chris@82 92 E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
Chris@82 93 T1u = ri[WS(rs, 3)];
Chris@82 94 T1x = ii[WS(rs, 3)];
Chris@82 95 T1t = W[4];
Chris@82 96 T1v = T1t * T1u;
Chris@82 97 T2s = T1t * T1x;
Chris@82 98 T1A = ri[WS(rs, 11)];
Chris@82 99 T1D = ii[WS(rs, 11)];
Chris@82 100 T1z = W[20];
Chris@82 101 T1B = T1z * T1A;
Chris@82 102 T2u = T1z * T1D;
Chris@82 103 {
Chris@82 104 E T1y, T2t, T1E, T2v, T1w, T1C;
Chris@82 105 T1w = W[5];
Chris@82 106 T1y = FMA(T1w, T1x, T1v);
Chris@82 107 T2t = FNMS(T1w, T1u, T2s);
Chris@82 108 T1C = W[21];
Chris@82 109 T1E = FMA(T1C, T1D, T1B);
Chris@82 110 T2v = FNMS(T1C, T1A, T2u);
Chris@82 111 T1F = T1y + T1E;
Chris@82 112 T36 = T2t + T2v;
Chris@82 113 T2p = T1y - T1E;
Chris@82 114 T2w = T2t - T2v;
Chris@82 115 }
Chris@82 116 }
/* Inputs 4 and 12, scaled by (W[6], W[7]) and (W[22], W[23]). */
Chris@82 117 {
Chris@82 118 E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
Chris@82 119 Ta = ri[WS(rs, 4)];
Chris@82 120 Td = ii[WS(rs, 4)];
Chris@82 121 T9 = W[6];
Chris@82 122 Tb = T9 * Ta;
Chris@82 123 T1J = T9 * Td;
Chris@82 124 Tg = ri[WS(rs, 12)];
Chris@82 125 Tj = ii[WS(rs, 12)];
Chris@82 126 Tf = W[22];
Chris@82 127 Th = Tf * Tg;
Chris@82 128 T1L = Tf * Tj;
Chris@82 129 {
Chris@82 130 E Te, T1K, Tk, T1M, Tc, Ti;
Chris@82 131 Tc = W[7];
Chris@82 132 Te = FMA(Tc, Td, Tb);
Chris@82 133 T1K = FNMS(Tc, Ta, T1J);
Chris@82 134 Ti = W[23];
Chris@82 135 Tk = FMA(Ti, Tj, Th);
Chris@82 136 T1M = FNMS(Ti, Tg, T1L);
Chris@82 137 Tl = Te + Tk;
Chris@82 138 T3A = Te - Tk;
Chris@82 139 T1N = T1K - T1M;
Chris@82 140 T3k = T1K + T1M;
Chris@82 141 }
Chris@82 142 }
/* Inputs 2 and 10, scaled by (W[2], W[3]) and (W[18], W[19]). */
Chris@82 143 {
Chris@82 144 E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
Chris@82 145 To = ri[WS(rs, 2)];
Chris@82 146 Tr = ii[WS(rs, 2)];
Chris@82 147 Tn = W[2];
Chris@82 148 Tp = Tn * To;
Chris@82 149 T1P = Tn * Tr;
Chris@82 150 Tu = ri[WS(rs, 10)];
Chris@82 151 Tx = ii[WS(rs, 10)];
Chris@82 152 Tt = W[18];
Chris@82 153 Tv = Tt * Tu;
Chris@82 154 T1R = Tt * Tx;
Chris@82 155 {
Chris@82 156 E Ts, T1Q, Ty, T1S, Tq, Tw;
Chris@82 157 Tq = W[3];
Chris@82 158 Ts = FMA(Tq, Tr, Tp);
Chris@82 159 T1Q = FNMS(Tq, To, T1P);
Chris@82 160 Tw = W[19];
Chris@82 161 Ty = FMA(Tw, Tx, Tv);
Chris@82 162 T1S = FNMS(Tw, Tu, T1R);
Chris@82 163 Tz = Ts + Ty;
Chris@82 164 T2V = T1Q + T1S;
Chris@82 165 T1T = T1Q - T1S;
Chris@82 166 T1U = Ts - Ty;
Chris@82 167 }
Chris@82 168 }
/* Inputs 1 and 9, scaled by (W[0], W[1]) and (W[16], W[17]). */
Chris@82 169 {
Chris@82 170 E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
Chris@82 171 TQ = ri[WS(rs, 1)];
Chris@82 172 TT = ii[WS(rs, 1)];
Chris@82 173 TP = W[0];
Chris@82 174 TR = TP * TQ;
Chris@82 175 T25 = TP * TT;
Chris@82 176 TW = ri[WS(rs, 9)];
Chris@82 177 TZ = ii[WS(rs, 9)];
Chris@82 178 TV = W[16];
Chris@82 179 TX = TV * TW;
Chris@82 180 T27 = TV * TZ;
Chris@82 181 {
Chris@82 182 E TU, T26, T10, T28, TS, TY;
Chris@82 183 TS = W[1];
Chris@82 184 TU = FMA(TS, TT, TR);
Chris@82 185 T26 = FNMS(TS, TQ, T25);
Chris@82 186 TY = W[17];
Chris@82 187 T10 = FMA(TY, TZ, TX);
Chris@82 188 T28 = FNMS(TY, TW, T27);
Chris@82 189 T11 = TU + T10;
Chris@82 190 T30 = T26 + T28;
Chris@82 191 T29 = T26 - T28;
Chris@82 192 T2c = TU - T10;
Chris@82 193 }
Chris@82 194 }
/* Inputs 5 and 13, scaled by (W[8], W[9]) and (W[24], W[25]). */
Chris@82 195 {
Chris@82 196 E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
Chris@82 197 T13 = ri[WS(rs, 5)];
Chris@82 198 T16 = ii[WS(rs, 5)];
Chris@82 199 T12 = W[8];
Chris@82 200 T14 = T12 * T13;
Chris@82 201 T2d = T12 * T16;
Chris@82 202 T19 = ri[WS(rs, 13)];
Chris@82 203 T1c = ii[WS(rs, 13)];
Chris@82 204 T18 = W[24];
Chris@82 205 T1a = T18 * T19;
Chris@82 206 T2f = T18 * T1c;
Chris@82 207 {
Chris@82 208 E T17, T2e, T1d, T2g, T15, T1b;
Chris@82 209 T15 = W[9];
Chris@82 210 T17 = FMA(T15, T16, T14);
Chris@82 211 T2e = FNMS(T15, T13, T2d);
Chris@82 212 T1b = W[25];
Chris@82 213 T1d = FMA(T1b, T1c, T1a);
Chris@82 214 T2g = FNMS(T1b, T19, T2f);
Chris@82 215 T1e = T17 + T1d;
Chris@82 216 T31 = T2e + T2g;
Chris@82 217 T2a = T17 - T1d;
Chris@82 218 T2h = T2e - T2g;
Chris@82 219 }
Chris@82 220 }
/* Inputs 14 and 6, scaled by (W[26], W[27]) and (W[10], W[11]). */
Chris@82 221 {
Chris@82 222 E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
Chris@82 223 TB = ri[WS(rs, 14)];
Chris@82 224 TE = ii[WS(rs, 14)];
Chris@82 225 TA = W[26];
Chris@82 226 TC = TA * TB;
Chris@82 227 T1X = TA * TE;
Chris@82 228 TH = ri[WS(rs, 6)];
Chris@82 229 TK = ii[WS(rs, 6)];
Chris@82 230 TG = W[10];
Chris@82 231 TI = TG * TH;
Chris@82 232 T1Z = TG * TK;
Chris@82 233 {
Chris@82 234 E TF, T1Y, TL, T20, TD, TJ;
Chris@82 235 TD = W[27];
Chris@82 236 TF = FMA(TD, TE, TC);
Chris@82 237 T1Y = FNMS(TD, TB, T1X);
Chris@82 238 TJ = W[11];
Chris@82 239 TL = FMA(TJ, TK, TI);
Chris@82 240 T20 = FNMS(TJ, TH, T1Z);
Chris@82 241 TM = TF + TL;
Chris@82 242 T2W = T1Y + T20;
Chris@82 243 T1W = TF - TL;
Chris@82 244 T21 = T1Y - T20;
Chris@82 245 }
Chris@82 246 }
/* Outputs 0, 4, 8 and 12: built from sums and differences only, with no
   constant multiplies. */
Chris@82 247 {
Chris@82 248 E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
Chris@82 249 {
Chris@82 250 E Tm, TN, T3j, T3p;
Chris@82 251 Tm = T8 + Tl;
Chris@82 252 TN = Tz + TM;
Chris@82 253 TO = Tm + TN;
Chris@82 254 T3e = Tm - TN;
Chris@82 255 T3j = T2V + T2W;
Chris@82 256 T3p = T3k + T3o;
Chris@82 257 T3q = T3j + T3p;
Chris@82 258 T3s = T3p - T3j;
Chris@82 259 }
Chris@82 260 {
Chris@82 261 E T1f, T1G, T3f, T3g;
Chris@82 262 T1f = T11 + T1e;
Chris@82 263 T1G = T1s + T1F;
Chris@82 264 T1H = T1f + T1G;
Chris@82 265 T3r = T1G - T1f;
Chris@82 266 T3f = T30 + T31;
Chris@82 267 T3g = T35 + T36;
Chris@82 268 T3h = T3f - T3g;
Chris@82 269 T3i = T3f + T3g;
Chris@82 270 }
Chris@82 271 ri[WS(rs, 8)] = TO - T1H;
Chris@82 272 ii[WS(rs, 8)] = T3q - T3i;
Chris@82 273 ri[0] = TO + T1H;
Chris@82 274 ii[0] = T3i + T3q;
Chris@82 275 ri[WS(rs, 12)] = T3e - T3h;
Chris@82 276 ii[WS(rs, 12)] = T3s - T3r;
Chris@82 277 ri[WS(rs, 4)] = T3e + T3h;
Chris@82 278 ii[WS(rs, 4)] = T3r + T3s;
Chris@82 279 }
/* Outputs 2, 6, 10 and 14: each is a base term plus or minus a second term
   scaled by KP707106781. */
Chris@82 280 {
Chris@82 281 E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
Chris@82 282 {
Chris@82 283 E T2U, T2X, T3t, T3u;
Chris@82 284 T2U = T8 - Tl;
Chris@82 285 T2X = T2V - T2W;
Chris@82 286 T2Y = T2U + T2X;
Chris@82 287 T3a = T2U - T2X;
Chris@82 288 T3t = TM - Tz;
Chris@82 289 T3u = T3o - T3k;
Chris@82 290 T3v = T3t + T3u;
Chris@82 291 T3x = T3u - T3t;
Chris@82 292 }
Chris@82 293 {
Chris@82 294 E T2Z, T32, T34, T37;
Chris@82 295 T2Z = T11 - T1e;
Chris@82 296 T32 = T30 - T31;
Chris@82 297 T33 = T2Z + T32;
Chris@82 298 T3b = T32 - T2Z;
Chris@82 299 T34 = T1s - T1F;
Chris@82 300 T37 = T35 - T36;
Chris@82 301 T38 = T34 - T37;
Chris@82 302 T3c = T34 + T37;
Chris@82 303 }
Chris@82 304 {
Chris@82 305 E T39, T3w, T3d, T3y;
Chris@82 306 T39 = T33 + T38;
Chris@82 307 ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
Chris@82 308 ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
Chris@82 309 T3w = T3b + T3c;
Chris@82 310 ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
Chris@82 311 ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
Chris@82 312 T3d = T3b - T3c;
Chris@82 313 ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
Chris@82 314 ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
Chris@82 315 T3y = T38 - T33;
Chris@82 316 ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
Chris@82 317 ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
Chris@82 318 }
Chris@82 319 }
/* Odd outputs 1, 3, ..., 15: intermediate terms are formed with
   KP414213562 and KP707106781, then combined with a KP923879532 scaling
   in the final FMA/FNMS of each output. */
Chris@82 320 {
Chris@82 321 E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
Chris@82 322 E T2C;
Chris@82 323 {
Chris@82 324 E T1V, T22, T2b, T2i;
Chris@82 325 T1O = T1I - T1N;
Chris@82 326 T3B = T3z - T3A;
Chris@82 327 T3H = T3A + T3z;
Chris@82 328 T2E = T1I + T1N;
Chris@82 329 T1V = T1T - T1U;
Chris@82 330 T22 = T1W + T21;
Chris@82 331 T23 = T1V - T22;
Chris@82 332 T3C = T1V + T22;
Chris@82 333 {
Chris@82 334 E T2M, T2N, T2F, T2G;
Chris@82 335 T2M = T2r + T2w;
Chris@82 336 T2N = T2o - T2p;
Chris@82 337 T2O = FNMS(KP414213562, T2N, T2M);
Chris@82 338 T2S = FMA(KP414213562, T2M, T2N);
Chris@82 339 T2F = T1U + T1T;
Chris@82 340 T2G = T1W - T21;
Chris@82 341 T2H = T2F + T2G;
Chris@82 342 T3I = T2G - T2F;
Chris@82 343 }
Chris@82 344 T2b = T29 + T2a;
Chris@82 345 T2i = T2c - T2h;
Chris@82 346 T2j = FMA(KP414213562, T2i, T2b);
Chris@82 347 T2B = FNMS(KP414213562, T2b, T2i);
Chris@82 348 {
Chris@82 349 E T2J, T2K, T2q, T2x;
Chris@82 350 T2J = T2c + T2h;
Chris@82 351 T2K = T29 - T2a;
Chris@82 352 T2L = FMA(KP414213562, T2K, T2J);
Chris@82 353 T2R = FNMS(KP414213562, T2J, T2K);
Chris@82 354 T2q = T2o + T2p;
Chris@82 355 T2x = T2r - T2w;
Chris@82 356 T2y = FNMS(KP414213562, T2x, T2q);
Chris@82 357 T2C = FMA(KP414213562, T2q, T2x);
Chris@82 358 }
Chris@82 359 }
/* Outputs 3 and 11. */
Chris@82 360 {
Chris@82 361 E T24, T2z, T3J, T3K;
Chris@82 362 T24 = FMA(KP707106781, T23, T1O);
Chris@82 363 T2z = T2j - T2y;
Chris@82 364 ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
Chris@82 365 ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
Chris@82 366 T3J = FMA(KP707106781, T3I, T3H);
Chris@82 367 T3K = T2C - T2B;
Chris@82 368 ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
Chris@82 369 ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
Chris@82 370 }
/* Outputs 7 and 15. */
Chris@82 371 {
Chris@82 372 E T2A, T2D, T3L, T3M;
Chris@82 373 T2A = FNMS(KP707106781, T23, T1O);
Chris@82 374 T2D = T2B + T2C;
Chris@82 375 ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
Chris@82 376 ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
Chris@82 377 T3L = FNMS(KP707106781, T3I, T3H);
Chris@82 378 T3M = T2j + T2y;
Chris@82 379 ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
Chris@82 380 ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
Chris@82 381 }
/* Outputs 1 and 9. */
Chris@82 382 {
Chris@82 383 E T2I, T2P, T3D, T3E;
Chris@82 384 T2I = FMA(KP707106781, T2H, T2E);
Chris@82 385 T2P = T2L + T2O;
Chris@82 386 ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
Chris@82 387 ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
Chris@82 388 T3D = FMA(KP707106781, T3C, T3B);
Chris@82 389 T3E = T2R + T2S;
Chris@82 390 ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
Chris@82 391 ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
Chris@82 392 }
/* Outputs 5 and 13. */
Chris@82 393 {
Chris@82 394 E T2Q, T2T, T3F, T3G;
Chris@82 395 T2Q = FNMS(KP707106781, T2H, T2E);
Chris@82 396 T2T = T2R - T2S;
Chris@82 397 ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
Chris@82 398 ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
Chris@82 399 T3F = FNMS(KP707106781, T3C, T3B);
Chris@82 400 T3G = T2O - T2L;
Chris@82 401 ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
Chris@82 402 ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
Chris@82 403 }
Chris@82 404 }
Chris@82 405 }
Chris@82 406 }
Chris@82 407 }
Chris@82 408
/* Twiddle instruction table referenced by the codelet descriptor `desc`.
   NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared in project headers
   not visible here; presumably the first entry requests the full twiddle set
   for size 16 and the second terminates/advances the list -- confirm. */
Chris@82 409 static const tw_instr twinstr[] = {
Chris@82 410 {TW_FULL, 0, 16},
Chris@82 411 {TW_NEXT, 1, 0}
Chris@82 412 };
Chris@82 413
/* Codelet descriptor for the FMA variant: 16 (presumably the radix), the
   codelet name, the twiddle instruction table, &GENUS, and the operation
   counts {104, 30, 70, 0}, which match the "104 additions, 30
   multiplications, 70 fused multiply/add" figures in the generator's
   op-count comment.  The three trailing zeros are ct_desc fields whose
   meaning is not visible here -- TODO confirm. */
Chris@82 414 static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {104, 30, 70, 0}, 0, 0, 0 };
Chris@82 415
/* Registration entry point: hands t1_16 and its descriptor to the planner p
   through X(kdft_dit_register).  X() is a project macro not visible here
   (presumably it adds the library's symbol prefix -- confirm). */
Chris@82 416 void X(codelet_t1_16) (planner *p) {
Chris@82 417 X(kdft_dit_register) (p, t1_16, &desc);
Chris@82 418 }
Chris@82 419 #else
Chris@82 420
Chris@82 421 /* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
Chris@82 422
Chris@82 423 /*
Chris@82 424 * This function contains 174 FP additions, 84 FP multiplications,
Chris@82 425 * (or, 136 additions, 46 multiplications, 38 fused multiply/add),
Chris@82 426 * 52 stack variables, 3 constants, and 64 memory accesses
Chris@82 427 */
Chris@82 428 #include "dft/scalar/t.h"
Chris@82 429
/*
 * t1_16, non-FMA variant (the one compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the loop reads the 16 complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..15, scales inputs k = 1..15 by the
 * twiddle pair (W[2k-2], W[2k-1]), combines them through a fixed network of
 * sums and differences, and writes 16 results back to the same ri/ii slots.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays; read and overwritten in place, and
 *           advanced by ms after each iteration.
 *   W       twiddle table; the loop starts at W + mb*30 and consumes 30
 *           reals (15 pairs) per iteration.
 *   rs      stride token handed to WS() to address element k.
 *   mb, me  half-open range of iterations.
 *   ms      pointer increment applied to ri and ii between iterations.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from project headers that are not visible here.  The comments below
 * assume FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 * Presumably this is one pass of a size-16 DFT (it is registered through
 * kdft_dit_register) -- confirm against the FFTW sources.
 */
Chris@82 430 static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 431 {
/* Constants: sin(pi/8), cos(pi/8), and sqrt(1/2). */
Chris@82 432 DK(KP382683432, +0.382683432365089771728459984030398866761344562);
Chris@82 433 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 434 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 435 {
Chris@82 436 INT m;
/* One pass per m: W skips mb*30 reals up front, then 30 per iteration;
   ri and ii move by ms.  MAKE_VOLATILE_STRIDE is an opaque project macro
   -- TODO confirm its purpose. */
Chris@82 437 for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values that outlive the load groups below and feed the output stages. */
Chris@82 438 E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
Chris@82 439 E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
Chris@82 440 E T2y, T2z, T1O, T2g, T1T, T2h;
/* Inputs 0 and 8.  Input 0 is used as is; input 8 is scaled by
   (W[14], W[15]): T6 = W[14]*re + W[15]*im, T2S = W[14]*im - W[15]*re.
   Keeps the sum and difference of the pair (real: T7, T1t; imag: T2U, T37). */
Chris@82 441 {
Chris@82 442 E T1, T2T, T6, T2S;
Chris@82 443 T1 = ri[0];
Chris@82 444 T2T = ii[0];
Chris@82 445 {
Chris@82 446 E T3, T5, T2, T4;
Chris@82 447 T3 = ri[WS(rs, 8)];
Chris@82 448 T5 = ii[WS(rs, 8)];
Chris@82 449 T2 = W[14];
Chris@82 450 T4 = W[15];
Chris@82 451 T6 = FMA(T2, T3, T4 * T5);
Chris@82 452 T2S = FNMS(T4, T3, T2 * T5);
Chris@82 453 }
Chris@82 454 T7 = T1 + T6;
Chris@82 455 T37 = T2T - T2S;
Chris@82 456 T1t = T1 - T6;
Chris@82 457 T2U = T2S + T2T;
Chris@82 458 }
/* Inputs 4 and 12, scaled by (W[6], W[7]) and (W[22], W[23]). */
Chris@82 459 {
Chris@82 460 E Tc, T1u, Th, T1v;
Chris@82 461 {
Chris@82 462 E T9, Tb, T8, Ta;
Chris@82 463 T9 = ri[WS(rs, 4)];
Chris@82 464 Tb = ii[WS(rs, 4)];
Chris@82 465 T8 = W[6];
Chris@82 466 Ta = W[7];
Chris@82 467 Tc = FMA(T8, T9, Ta * Tb);
Chris@82 468 T1u = FNMS(Ta, T9, T8 * Tb);
Chris@82 469 }
Chris@82 470 {
Chris@82 471 E Te, Tg, Td, Tf;
Chris@82 472 Te = ri[WS(rs, 12)];
Chris@82 473 Tg = ii[WS(rs, 12)];
Chris@82 474 Td = W[22];
Chris@82 475 Tf = W[23];
Chris@82 476 Th = FMA(Td, Te, Tf * Tg);
Chris@82 477 T1v = FNMS(Tf, Te, Td * Tg);
Chris@82 478 }
Chris@82 479 Ti = Tc + Th;
Chris@82 480 T38 = Tc - Th;
Chris@82 481 T1w = T1u - T1v;
Chris@82 482 T2R = T1u + T1v;
Chris@82 483 }
/* Inputs 2 and 10, scaled by (W[2], W[3]) and (W[18], W[19]). */
Chris@82 484 {
Chris@82 485 E To, T1y, Tt, T1z, T1A, T1B;
Chris@82 486 {
Chris@82 487 E Tl, Tn, Tk, Tm;
Chris@82 488 Tl = ri[WS(rs, 2)];
Chris@82 489 Tn = ii[WS(rs, 2)];
Chris@82 490 Tk = W[2];
Chris@82 491 Tm = W[3];
Chris@82 492 To = FMA(Tk, Tl, Tm * Tn);
Chris@82 493 T1y = FNMS(Tm, Tl, Tk * Tn);
Chris@82 494 }
Chris@82 495 {
Chris@82 496 E Tq, Ts, Tp, Tr;
Chris@82 497 Tq = ri[WS(rs, 10)];
Chris@82 498 Ts = ii[WS(rs, 10)];
Chris@82 499 Tp = W[18];
Chris@82 500 Tr = W[19];
Chris@82 501 Tt = FMA(Tp, Tq, Tr * Ts);
Chris@82 502 T1z = FNMS(Tr, Tq, Tp * Ts);
Chris@82 503 }
Chris@82 504 Tu = To + Tt;
Chris@82 505 T2s = T1y + T1z;
Chris@82 506 T1A = T1y - T1z;
Chris@82 507 T1B = To - Tt;
Chris@82 508 T1C = T1A - T1B;
Chris@82 509 T2c = T1B + T1A;
Chris@82 510 }
/* Inputs 14 and 6, scaled by (W[26], W[27]) and (W[10], W[11]). */
Chris@82 511 {
Chris@82 512 E Tz, T1E, TE, T1F, T1D, T1G;
Chris@82 513 {
Chris@82 514 E Tw, Ty, Tv, Tx;
Chris@82 515 Tw = ri[WS(rs, 14)];
Chris@82 516 Ty = ii[WS(rs, 14)];
Chris@82 517 Tv = W[26];
Chris@82 518 Tx = W[27];
Chris@82 519 Tz = FMA(Tv, Tw, Tx * Ty);
Chris@82 520 T1E = FNMS(Tx, Tw, Tv * Ty);
Chris@82 521 }
Chris@82 522 {
Chris@82 523 E TB, TD, TA, TC;
Chris@82 524 TB = ri[WS(rs, 6)];
Chris@82 525 TD = ii[WS(rs, 6)];
Chris@82 526 TA = W[10];
Chris@82 527 TC = W[11];
Chris@82 528 TE = FMA(TA, TB, TC * TD);
Chris@82 529 T1F = FNMS(TC, TB, TA * TD);
Chris@82 530 }
Chris@82 531 TF = Tz + TE;
Chris@82 532 T2t = T1E + T1F;
Chris@82 533 T1D = Tz - TE;
Chris@82 534 T1G = T1E - T1F;
Chris@82 535 T1H = T1D + T1G;
Chris@82 536 T2d = T1D - T1G;
Chris@82 537 }
/* Inputs 15, 11, 7 and 3, scaled by (W[28], W[29]), (W[20], W[21]),
   (W[12], W[13]) and (W[4], W[5]). */
Chris@82 538 {
Chris@82 539 E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
Chris@82 540 {
Chris@82 541 E T16, T18, T15, T17;
Chris@82 542 T16 = ri[WS(rs, 15)];
Chris@82 543 T18 = ii[WS(rs, 15)];
Chris@82 544 T15 = W[28];
Chris@82 545 T17 = W[29];
Chris@82 546 T19 = FMA(T15, T16, T17 * T18);
Chris@82 547 T20 = FNMS(T17, T16, T15 * T18);
Chris@82 548 }
Chris@82 549 {
Chris@82 550 E T1m, T1o, T1l, T1n;
Chris@82 551 T1m = ri[WS(rs, 11)];
Chris@82 552 T1o = ii[WS(rs, 11)];
Chris@82 553 T1l = W[20];
Chris@82 554 T1n = W[21];
Chris@82 555 T1p = FMA(T1l, T1m, T1n * T1o);
Chris@82 556 T1X = FNMS(T1n, T1m, T1l * T1o);
Chris@82 557 }
Chris@82 558 {
Chris@82 559 E T1b, T1d, T1a, T1c;
Chris@82 560 T1b = ri[WS(rs, 7)];
Chris@82 561 T1d = ii[WS(rs, 7)];
Chris@82 562 T1a = W[12];
Chris@82 563 T1c = W[13];
Chris@82 564 T1e = FMA(T1a, T1b, T1c * T1d);
Chris@82 565 T21 = FNMS(T1c, T1b, T1a * T1d);
Chris@82 566 }
Chris@82 567 {
Chris@82 568 E T1h, T1j, T1g, T1i;
Chris@82 569 T1h = ri[WS(rs, 3)];
Chris@82 570 T1j = ii[WS(rs, 3)];
Chris@82 571 T1g = W[4];
Chris@82 572 T1i = W[5];
Chris@82 573 T1k = FMA(T1g, T1h, T1i * T1j);
Chris@82 574 T1W = FNMS(T1i, T1h, T1g * T1j);
Chris@82 575 }
Chris@82 576 T1f = T19 + T1e;
Chris@82 577 T1q = T1k + T1p;
Chris@82 578 T2B = T1f - T1q;
Chris@82 579 T2C = T20 + T21;
Chris@82 580 T2D = T1W + T1X;
Chris@82 581 T2E = T2C - T2D;
Chris@82 582 {
Chris@82 583 E T1V, T1Y, T22, T23;
Chris@82 584 T1V = T19 - T1e;
Chris@82 585 T1Y = T1W - T1X;
Chris@82 586 T1Z = T1V - T1Y;
Chris@82 587 T2j = T1V + T1Y;
Chris@82 588 T22 = T20 - T21;
Chris@82 589 T23 = T1k - T1p;
Chris@82 590 T24 = T22 + T23;
Chris@82 591 T2k = T22 - T23;
Chris@82 592 }
Chris@82 593 }
/* Inputs 1, 13, 9 and 5, scaled by (W[0], W[1]), (W[24], W[25]),
   (W[16], W[17]) and (W[8], W[9]). */
Chris@82 594 {
Chris@82 595 E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
Chris@82 596 {
Chris@82 597 E TJ, TL, TI, TK;
Chris@82 598 TJ = ri[WS(rs, 1)];
Chris@82 599 TL = ii[WS(rs, 1)];
Chris@82 600 TI = W[0];
Chris@82 601 TK = W[1];
Chris@82 602 TM = FMA(TI, TJ, TK * TL);
Chris@82 603 T1K = FNMS(TK, TJ, TI * TL);
Chris@82 604 }
Chris@82 605 {
Chris@82 606 E TZ, T11, TY, T10;
Chris@82 607 TZ = ri[WS(rs, 13)];
Chris@82 608 T11 = ii[WS(rs, 13)];
Chris@82 609 TY = W[24];
Chris@82 610 T10 = W[25];
Chris@82 611 T12 = FMA(TY, TZ, T10 * T11);
Chris@82 612 T1R = FNMS(T10, TZ, TY * T11);
Chris@82 613 }
Chris@82 614 {
Chris@82 615 E TO, TQ, TN, TP;
Chris@82 616 TO = ri[WS(rs, 9)];
Chris@82 617 TQ = ii[WS(rs, 9)];
Chris@82 618 TN = W[16];
Chris@82 619 TP = W[17];
Chris@82 620 TR = FMA(TN, TO, TP * TQ);
Chris@82 621 T1L = FNMS(TP, TO, TN * TQ);
Chris@82 622 }
Chris@82 623 {
Chris@82 624 E TU, TW, TT, TV;
Chris@82 625 TU = ri[WS(rs, 5)];
Chris@82 626 TW = ii[WS(rs, 5)];
Chris@82 627 TT = W[8];
Chris@82 628 TV = W[9];
Chris@82 629 TX = FMA(TT, TU, TV * TW);
Chris@82 630 T1Q = FNMS(TV, TU, TT * TW);
Chris@82 631 }
Chris@82 632 TS = TM + TR;
Chris@82 633 T13 = TX + T12;
Chris@82 634 T2w = TS - T13;
Chris@82 635 T2x = T1K + T1L;
Chris@82 636 T2y = T1Q + T1R;
Chris@82 637 T2z = T2x - T2y;
Chris@82 638 {
Chris@82 639 E T1M, T1N, T1P, T1S;
Chris@82 640 T1M = T1K - T1L;
Chris@82 641 T1N = TX - T12;
Chris@82 642 T1O = T1M + T1N;
Chris@82 643 T2g = T1M - T1N;
Chris@82 644 T1P = TM - TR;
Chris@82 645 T1S = T1Q - T1R;
Chris@82 646 T1T = T1P - T1S;
Chris@82 647 T2h = T1P + T1S;
Chris@82 648 }
Chris@82 649 }
/* Outputs 3, 7, 11 and 15: use KP707106781 plus the KP923879532 and
   KP382683432 pair. */
Chris@82 650 {
Chris@82 651 E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
Chris@82 652 {
Chris@82 653 E T1x, T1I, T3e, T3f;
Chris@82 654 T1x = T1t - T1w;
Chris@82 655 T1I = KP707106781 * (T1C - T1H);
Chris@82 656 T1J = T1x + T1I;
Chris@82 657 T27 = T1x - T1I;
Chris@82 658 T3e = KP707106781 * (T2d - T2c);
Chris@82 659 T3f = T38 + T37;
Chris@82 660 T3g = T3e + T3f;
Chris@82 661 T3i = T3f - T3e;
Chris@82 662 }
Chris@82 663 {
Chris@82 664 E T1U, T25, T28, T29;
Chris@82 665 T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
Chris@82 666 T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
Chris@82 667 T26 = T1U + T25;
Chris@82 668 T3h = T25 - T1U;
Chris@82 669 T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
Chris@82 670 T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
Chris@82 671 T2a = T28 - T29;
Chris@82 672 T3d = T28 + T29;
Chris@82 673 }
Chris@82 674 ri[WS(rs, 11)] = T1J - T26;
Chris@82 675 ii[WS(rs, 11)] = T3g - T3d;
Chris@82 676 ri[WS(rs, 3)] = T1J + T26;
Chris@82 677 ii[WS(rs, 3)] = T3d + T3g;
Chris@82 678 ri[WS(rs, 15)] = T27 - T2a;
Chris@82 679 ii[WS(rs, 15)] = T3i - T3h;
Chris@82 680 ri[WS(rs, 7)] = T27 + T2a;
Chris@82 681 ii[WS(rs, 7)] = T3h + T3i;
Chris@82 682 }
/* Outputs 2, 6, 10 and 14: the only constant used is KP707106781. */
Chris@82 683 {
Chris@82 684 E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
Chris@82 685 {
Chris@82 686 E T2r, T2u, T30, T31;
Chris@82 687 T2r = T7 - Ti;
Chris@82 688 T2u = T2s - T2t;
Chris@82 689 T2v = T2r + T2u;
Chris@82 690 T2H = T2r - T2u;
Chris@82 691 T30 = TF - Tu;
Chris@82 692 T31 = T2U - T2R;
Chris@82 693 T32 = T30 + T31;
Chris@82 694 T34 = T31 - T30;
Chris@82 695 }
Chris@82 696 {
Chris@82 697 E T2A, T2F, T2I, T2J;
Chris@82 698 T2A = T2w + T2z;
Chris@82 699 T2F = T2B - T2E;
Chris@82 700 T2G = KP707106781 * (T2A + T2F);
Chris@82 701 T33 = KP707106781 * (T2F - T2A);
Chris@82 702 T2I = T2z - T2w;
Chris@82 703 T2J = T2B + T2E;
Chris@82 704 T2K = KP707106781 * (T2I - T2J);
Chris@82 705 T2Z = KP707106781 * (T2I + T2J);
Chris@82 706 }
Chris@82 707 ri[WS(rs, 10)] = T2v - T2G;
Chris@82 708 ii[WS(rs, 10)] = T32 - T2Z;
Chris@82 709 ri[WS(rs, 2)] = T2v + T2G;
Chris@82 710 ii[WS(rs, 2)] = T2Z + T32;
Chris@82 711 ri[WS(rs, 14)] = T2H - T2K;
Chris@82 712 ii[WS(rs, 14)] = T34 - T33;
Chris@82 713 ri[WS(rs, 6)] = T2H + T2K;
Chris@82 714 ii[WS(rs, 6)] = T33 + T34;
Chris@82 715 }
/* Outputs 1, 5, 9 and 13: use KP707106781 plus the KP382683432 and
   KP923879532 pair. */
Chris@82 716 {
Chris@82 717 E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
Chris@82 718 {
Chris@82 719 E T2b, T2e, T36, T39;
Chris@82 720 T2b = T1t + T1w;
Chris@82 721 T2e = KP707106781 * (T2c + T2d);
Chris@82 722 T2f = T2b + T2e;
Chris@82 723 T2n = T2b - T2e;
Chris@82 724 T36 = KP707106781 * (T1C + T1H);
Chris@82 725 T39 = T37 - T38;
Chris@82 726 T3a = T36 + T39;
Chris@82 727 T3c = T39 - T36;
Chris@82 728 }
Chris@82 729 {
Chris@82 730 E T2i, T2l, T2o, T2p;
Chris@82 731 T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
Chris@82 732 T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
Chris@82 733 T2m = T2i + T2l;
Chris@82 734 T3b = T2l - T2i;
Chris@82 735 T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
Chris@82 736 T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
Chris@82 737 T2q = T2o - T2p;
Chris@82 738 T35 = T2o + T2p;
Chris@82 739 }
Chris@82 740 ri[WS(rs, 9)] = T2f - T2m;
Chris@82 741 ii[WS(rs, 9)] = T3a - T35;
Chris@82 742 ri[WS(rs, 1)] = T2f + T2m;
Chris@82 743 ii[WS(rs, 1)] = T35 + T3a;
Chris@82 744 ri[WS(rs, 13)] = T2n - T2q;
Chris@82 745 ii[WS(rs, 13)] = T3c - T3b;
Chris@82 746 ri[WS(rs, 5)] = T2n + T2q;
Chris@82 747 ii[WS(rs, 5)] = T3b + T3c;
Chris@82 748 }
/* Outputs 0, 4, 8 and 12: built from sums and differences only, with no
   constant multiplies. */
Chris@82 749 {
Chris@82 750 E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
Chris@82 751 {
Chris@82 752 E Tj, TG, T2Q, T2V;
Chris@82 753 Tj = T7 + Ti;
Chris@82 754 TG = Tu + TF;
Chris@82 755 TH = Tj + TG;
Chris@82 756 T2L = Tj - TG;
Chris@82 757 T2Q = T2s + T2t;
Chris@82 758 T2V = T2R + T2U;
Chris@82 759 T2W = T2Q + T2V;
Chris@82 760 T2Y = T2V - T2Q;
Chris@82 761 }
Chris@82 762 {
Chris@82 763 E T14, T1r, T2M, T2N;
Chris@82 764 T14 = TS + T13;
Chris@82 765 T1r = T1f + T1q;
Chris@82 766 T1s = T14 + T1r;
Chris@82 767 T2X = T1r - T14;
Chris@82 768 T2M = T2x + T2y;
Chris@82 769 T2N = T2C + T2D;
Chris@82 770 T2O = T2M - T2N;
Chris@82 771 T2P = T2M + T2N;
Chris@82 772 }
Chris@82 773 ri[WS(rs, 8)] = TH - T1s;
Chris@82 774 ii[WS(rs, 8)] = T2W - T2P;
Chris@82 775 ri[0] = TH + T1s;
Chris@82 776 ii[0] = T2P + T2W;
Chris@82 777 ri[WS(rs, 12)] = T2L - T2O;
Chris@82 778 ii[WS(rs, 12)] = T2Y - T2X;
Chris@82 779 ri[WS(rs, 4)] = T2L + T2O;
Chris@82 780 ii[WS(rs, 4)] = T2X + T2Y;
Chris@82 781 }
Chris@82 782 }
Chris@82 783 }
Chris@82 784 }
Chris@82 785
/* Twiddle instruction table referenced by the codelet descriptor `desc`.
   NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared in project headers
   not visible here; presumably the first entry requests the full twiddle set
   for size 16 and the second terminates/advances the list -- confirm. */
Chris@82 786 static const tw_instr twinstr[] = {
Chris@82 787 {TW_FULL, 0, 16},
Chris@82 788 {TW_NEXT, 1, 0}
Chris@82 789 };
Chris@82 790
/* Codelet descriptor for the non-FMA variant: 16 (presumably the radix), the
   codelet name, the twiddle instruction table, &GENUS, and the operation
   counts {136, 46, 38, 0}, which match the "136 additions, 46
   multiplications, 38 fused multiply/add" figures in the generator's
   op-count comment.  The three trailing zeros are ct_desc fields whose
   meaning is not visible here -- TODO confirm. */
Chris@82 791 static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, {136, 46, 38, 0}, 0, 0, 0 };
Chris@82 792
/* Registration entry point: hands t1_16 and its descriptor to the planner p
   through X(kdft_dit_register).  X() is a project macro not visible here
   (presumably it adds the library's symbol prefix -- confirm). */
Chris@82 793 void X(codelet_t1_16) (planner *p) {
Chris@82 794 X(kdft_dit_register) (p, t1_16, &desc);
Chris@82 795 }
Chris@82 796 #endif