annotate src/fftw-3.3.8/rdft/scalar/r2cf/hc2cfdft2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents d0c2a83c1364
children
rev   line source
Chris@82 1 /*
Chris@82 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@82 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@82 4 *
Chris@82 5 * This program is free software; you can redistribute it and/or modify
Chris@82 6 * it under the terms of the GNU General Public License as published by
Chris@82 7 * the Free Software Foundation; either version 2 of the License, or
Chris@82 8 * (at your option) any later version.
Chris@82 9 *
Chris@82 10 * This program is distributed in the hope that it will be useful,
Chris@82 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@82 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@82 13 * GNU General Public License for more details.
Chris@82 14 *
Chris@82 15 * You should have received a copy of the GNU General Public License
Chris@82 16 * along with this program; if not, write to the Free Software
Chris@82 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@82 18 *
Chris@82 19 */
Chris@82 20
Chris@82 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@82 22 /* Generated on Thu May 24 08:07:16 EDT 2018 */
Chris@82 23
Chris@82 24 #include "rdft/codelet-rdft.h"
Chris@82 25
Chris@82 26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
Chris@82 27
Chris@82 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include rdft/scalar/hc2cf.h */
Chris@82 29
Chris@82 30 /*
Chris@82 31 * This function contains 228 FP additions, 166 FP multiplications,
Chris@82 32 * (or, 136 additions, 74 multiplications, 92 fused multiply/add),
Chris@82 33 * 91 stack variables, 4 constants, and 64 memory accesses
Chris@82 34 */
Chris@82 35 #include "rdft/scalar/hc2cf.h"
Chris@82 36
/*
 * hc2cfdft2_16, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated by genfft; see the
 * "Generated by" comment above for the exact generator options.
 *
 * The transform is done in place: in every loop iteration all 32 input values
 * are read from Rp, Ip, Rm and Im before any of the 32 results is stored back
 * into the same four arrays.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written at offsets 0 and WS(rs, 1..7); both
 *           pointers advance by +ms after each iteration.
 *   Rm, Im  arrays read and written at the same offsets; both pointers advance
 *           by -ms after each iteration.
 *   W       read-only twiddle table; the loop starts at W + (mb - 1) * 8 and
 *           consumes 8 values (W[0]..W[7]) per iteration.
 *   rs      stride handed to WS() to form the element offsets.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration pointer step (see Rp/Ip/Rm/Im above).
 *
 * Returns nothing; results are stored through Rp, Ip, Rm and Im.
 */
Chris@82 37 static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 38 {
/* Numeric constants: 0.9238... equals cos(pi/8), 0.4142... equals sqrt(2) - 1,
   0.7071... equals sqrt(1/2), and 0.5 scales every stored result. */
Chris@82 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@82 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@82 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@82 42 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 43 {
Chris@82 44 INT m;
/* One pass per m in [mb, me).  The increment clause steps the four data
   pointers (Rp/Ip forward, Rm/Im backward by ms), moves W on by 8 and
   evaluates MAKE_VOLATILE_STRIDE(64, rs). */
Chris@82 45 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 46 E T1, T2, Tw, Ty, Th, Tj, T4, T5, TY, T6, Tk, T1o, T1d, Tz, T1j;
Chris@82 47 E Tq, TF, T18, TR, TL, T13, T1A, T1K, T1E, T1H, Tc, T25, T2k, T29, T2h;
/* Load the eight twiddle values W[0]..W[7] and precompute the products and
   combinations of them that the input stages below multiply by. */
Chris@82 48 {
Chris@82 49 E Tx, TE, Ti, TK, Tp, TQ, Tb, T3;
Chris@82 50 T1 = W[0];
Chris@82 51 T2 = W[2];
Chris@82 52 T3 = T1 * T2;
Chris@82 53 Tw = W[6];
Chris@82 54 Tx = T1 * Tw;
Chris@82 55 Ty = W[7];
Chris@82 56 TE = T1 * Ty;
Chris@82 57 Th = W[4];
Chris@82 58 Ti = T1 * Th;
Chris@82 59 TK = T2 * Th;
Chris@82 60 Tj = W[5];
Chris@82 61 Tp = T1 * Tj;
Chris@82 62 TQ = T2 * Tj;
Chris@82 63 T4 = W[1];
Chris@82 64 T5 = W[3];
Chris@82 65 Tb = T1 * T5;
Chris@82 66 TY = FNMS(T4, T5, T3);
Chris@82 67 T6 = FMA(T4, T5, T3);
Chris@82 68 Tk = FNMS(T4, Tj, Ti);
Chris@82 69 T1o = FNMS(T4, Th, Tp);
Chris@82 70 T1d = FMA(T5, Th, TQ);
Chris@82 71 Tz = FMA(T4, Ty, Tx);
Chris@82 72 T1j = FMA(T4, Tj, Ti);
Chris@82 73 Tq = FMA(T4, Th, Tp);
Chris@82 74 TF = FNMS(T4, Tw, TE);
Chris@82 75 T18 = FNMS(T5, Tj, TK);
Chris@82 76 TR = FNMS(T5, Th, TQ);
Chris@82 77 TL = FMA(T5, Tj, TK);
/* Further derived factors: TY, T13, T6 and Tc combined with Th (W[4]) and
   Tj (W[5]). */
Chris@82 78 {
Chris@82 79 E T1z, T1D, T24, T28;
Chris@82 80 T1z = TY * Th;
Chris@82 81 T1D = TY * Tj;
Chris@82 82 T13 = FMA(T4, T2, Tb);
Chris@82 83 T1A = FMA(T13, Tj, T1z);
Chris@82 84 T1K = FMA(T13, Th, T1D);
Chris@82 85 T1E = FNMS(T13, Th, T1D);
Chris@82 86 T1H = FNMS(T13, Tj, T1z);
Chris@82 87 T24 = T6 * Th;
Chris@82 88 T28 = T6 * Tj;
Chris@82 89 Tc = FNMS(T4, T2, Tb);
Chris@82 90 T25 = FNMS(Tc, Tj, T24);
Chris@82 91 T2k = FNMS(Tc, Th, T28);
Chris@82 92 T29 = FMA(Tc, Th, T28);
Chris@82 93 T2h = FMA(Tc, Tj, T24);
Chris@82 94 }
Chris@82 95 }
Chris@82 96 {
Chris@82 97 E T1v, T2q, T1s, T2s, T38, T3T, T1Y, T3P, T17, T1h, T2x, T2v, T33, T3Q, T1N;
Chris@82 98 E T3S, Tg, Tu, T3A, T2B, T2D, T3B, T2c, T3L, T2S, T3I, TJ, TV, T3E, T2G;
Chris@82 99 E T2I, T3D, T2n, T3J, T2X, T3M;
/* Read the inputs at offsets 0 and WS(rs, 4) from Rp, Ip, Rm and Im and form
   intermediate terms from them and the twiddle factors. */
Chris@82 100 {
Chris@82 101 E T1t, T1u, T1W, T1m, T1Q, T1S, T1T, T1V, T36, T1r, T34, T1P, T1k, T1l, T1n;
Chris@82 102 E T2r;
Chris@82 103 T1t = Ip[0];
Chris@82 104 T1u = Im[0];
Chris@82 105 T1W = T1t + T1u;
Chris@82 106 T1k = Ip[WS(rs, 4)];
Chris@82 107 T1l = Im[WS(rs, 4)];
Chris@82 108 T1m = T1k - T1l;
Chris@82 109 T1Q = T1k + T1l;
Chris@82 110 {
Chris@82 111 E T1U, T1p, T1q, T1O;
Chris@82 112 T1S = Rm[0];
Chris@82 113 T1T = Rp[0];
Chris@82 114 T1U = T1S - T1T;
Chris@82 115 T1V = T1 * T1U;
Chris@82 116 T36 = T4 * T1U;
Chris@82 117 T1p = Rp[WS(rs, 4)];
Chris@82 118 T1q = Rm[WS(rs, 4)];
Chris@82 119 T1O = T1q - T1p;
Chris@82 120 T1r = T1p + T1q;
Chris@82 121 T34 = Tj * T1O;
Chris@82 122 T1P = Th * T1O;
Chris@82 123 }
Chris@82 124 T1v = T1t - T1u;
Chris@82 125 T2q = T1T + T1S;
Chris@82 126 T1n = T1j * T1m;
Chris@82 127 T1s = FNMS(T1o, T1r, T1n);
Chris@82 128 T2r = T1j * T1r;
Chris@82 129 T2s = FMA(T1o, T1m, T2r);
Chris@82 130 {
Chris@82 131 E T35, T37, T1R, T1X;
Chris@82 132 T35 = FMA(Th, T1Q, T34);
Chris@82 133 T37 = FMA(T1, T1W, T36);
Chris@82 134 T38 = T35 + T37;
Chris@82 135 T3T = T37 - T35;
Chris@82 136 T1R = FNMS(Tj, T1Q, T1P);
Chris@82 137 T1X = FNMS(T4, T1W, T1V);
Chris@82 138 T1Y = T1R + T1X;
Chris@82 139 T3P = T1X - T1R;
Chris@82 140 }
Chris@82 141 }
/* Same for the inputs at offsets WS(rs, 2) and WS(rs, 6). */
Chris@82 142 {
Chris@82 143 E T11, T1F, T16, T2Z, T1C, T1b, T1L, T1g, T31, T1J;
Chris@82 144 {
Chris@82 145 E TZ, T10, T14, T15, T1B;
Chris@82 146 TZ = Ip[WS(rs, 2)];
Chris@82 147 T10 = Im[WS(rs, 2)];
Chris@82 148 T11 = TZ - T10;
Chris@82 149 T1F = TZ + T10;
Chris@82 150 T14 = Rp[WS(rs, 2)];
Chris@82 151 T15 = Rm[WS(rs, 2)];
Chris@82 152 T1B = T15 - T14;
Chris@82 153 T16 = T14 + T15;
Chris@82 154 T2Z = T1E * T1B;
Chris@82 155 T1C = T1A * T1B;
Chris@82 156 }
Chris@82 157 {
Chris@82 158 E T19, T1a, T1e, T1f, T1I;
Chris@82 159 T19 = Ip[WS(rs, 6)];
Chris@82 160 T1a = Im[WS(rs, 6)];
Chris@82 161 T1b = T19 - T1a;
Chris@82 162 T1L = T19 + T1a;
Chris@82 163 T1e = Rp[WS(rs, 6)];
Chris@82 164 T1f = Rm[WS(rs, 6)];
Chris@82 165 T1I = T1f - T1e;
Chris@82 166 T1g = T1e + T1f;
Chris@82 167 T31 = T1K * T1I;
Chris@82 168 T1J = T1H * T1I;
Chris@82 169 }
Chris@82 170 {
Chris@82 171 E T12, T1c, T2w, T2u;
Chris@82 172 T12 = TY * T11;
Chris@82 173 T17 = FNMS(T13, T16, T12);
Chris@82 174 T1c = T18 * T1b;
Chris@82 175 T1h = FNMS(T1d, T1g, T1c);
Chris@82 176 T2w = T18 * T1g;
Chris@82 177 T2x = FMA(T1d, T1b, T2w);
Chris@82 178 T2u = TY * T16;
Chris@82 179 T2v = FMA(T13, T11, T2u);
Chris@82 180 {
Chris@82 181 E T30, T32, T1G, T1M;
Chris@82 182 T30 = FMA(T1A, T1F, T2Z);
Chris@82 183 T32 = FMA(T1H, T1L, T31);
Chris@82 184 T33 = T30 + T32;
Chris@82 185 T3Q = T30 - T32;
Chris@82 186 T1G = FNMS(T1E, T1F, T1C);
Chris@82 187 T1M = FNMS(T1K, T1L, T1J);
Chris@82 188 T1N = T1G + T1M;
Chris@82 189 T3S = T1G - T1M;
Chris@82 190 }
Chris@82 191 }
Chris@82 192 }
/* Same for the inputs at offsets WS(rs, 1) and WS(rs, 5). */
Chris@82 193 {
Chris@82 194 E T9, T22, Ta, T2O, Tf, T20, T21, T2A, Tn, T2a, To, T2Q, Tt, T26, T27;
Chris@82 195 E T2C;
Chris@82 196 {
Chris@82 197 E T7, T8, Td, Te;
Chris@82 198 T7 = Ip[WS(rs, 1)];
Chris@82 199 T8 = Im[WS(rs, 1)];
Chris@82 200 T9 = T7 - T8;
Chris@82 201 T22 = T7 + T8;
Chris@82 202 Ta = T6 * T9;
Chris@82 203 T2O = T2 * T22;
Chris@82 204 Td = Rp[WS(rs, 1)];
Chris@82 205 Te = Rm[WS(rs, 1)];
Chris@82 206 Tf = Td + Te;
Chris@82 207 T20 = Td - Te;
Chris@82 208 T21 = T2 * T20;
Chris@82 209 T2A = T6 * Tf;
Chris@82 210 }
Chris@82 211 {
Chris@82 212 E Tl, Tm, Tr, Ts;
Chris@82 213 Tl = Ip[WS(rs, 5)];
Chris@82 214 Tm = Im[WS(rs, 5)];
Chris@82 215 Tn = Tl - Tm;
Chris@82 216 T2a = Tl + Tm;
Chris@82 217 To = Tk * Tn;
Chris@82 218 T2Q = T25 * T2a;
Chris@82 219 Tr = Rp[WS(rs, 5)];
Chris@82 220 Ts = Rm[WS(rs, 5)];
Chris@82 221 Tt = Tr + Ts;
Chris@82 222 T26 = Tr - Ts;
Chris@82 223 T27 = T25 * T26;
Chris@82 224 T2C = Tk * Tt;
Chris@82 225 }
Chris@82 226 Tg = FNMS(Tc, Tf, Ta);
Chris@82 227 Tu = FNMS(Tq, Tt, To);
Chris@82 228 T3A = Tg - Tu;
Chris@82 229 T2B = FMA(Tc, T9, T2A);
Chris@82 230 T2D = FMA(Tq, Tn, T2C);
Chris@82 231 T3B = T2B - T2D;
Chris@82 232 {
Chris@82 233 E T23, T2b, T2P, T2R;
Chris@82 234 T23 = FMA(T5, T22, T21);
Chris@82 235 T2b = FMA(T29, T2a, T27);
Chris@82 236 T2c = T23 + T2b;
Chris@82 237 T3L = T2b - T23;
Chris@82 238 T2P = FNMS(T5, T20, T2O);
Chris@82 239 T2R = FNMS(T29, T26, T2Q);
Chris@82 240 T2S = T2P + T2R;
Chris@82 241 T3I = T2R - T2P;
Chris@82 242 }
Chris@82 243 }
/* Same for the inputs at offsets WS(rs, 7) and WS(rs, 3). */
Chris@82 244 {
Chris@82 245 E TC, T2f, TD, T2T, TI, T2d, T2e, T2F, TO, T2l, TP, T2V, TU, T2i, T2j;
Chris@82 246 E T2H;
Chris@82 247 {
Chris@82 248 E TA, TB, TG, TH;
Chris@82 249 TA = Ip[WS(rs, 7)];
Chris@82 250 TB = Im[WS(rs, 7)];
Chris@82 251 TC = TA - TB;
Chris@82 252 T2f = TA + TB;
Chris@82 253 TD = Tz * TC;
Chris@82 254 T2T = Tw * T2f;
Chris@82 255 TG = Rp[WS(rs, 7)];
Chris@82 256 TH = Rm[WS(rs, 7)];
Chris@82 257 TI = TG + TH;
Chris@82 258 T2d = TG - TH;
Chris@82 259 T2e = Tw * T2d;
Chris@82 260 T2F = Tz * TI;
Chris@82 261 }
Chris@82 262 {
Chris@82 263 E TM, TN, TS, TT;
Chris@82 264 TM = Ip[WS(rs, 3)];
Chris@82 265 TN = Im[WS(rs, 3)];
Chris@82 266 TO = TM - TN;
Chris@82 267 T2l = TM + TN;
Chris@82 268 TP = TL * TO;
Chris@82 269 T2V = T2h * T2l;
Chris@82 270 TS = Rp[WS(rs, 3)];
Chris@82 271 TT = Rm[WS(rs, 3)];
Chris@82 272 TU = TS + TT;
Chris@82 273 T2i = TS - TT;
Chris@82 274 T2j = T2h * T2i;
Chris@82 275 T2H = TL * TU;
Chris@82 276 }
Chris@82 277 TJ = FNMS(TF, TI, TD);
Chris@82 278 TV = FNMS(TR, TU, TP);
Chris@82 279 T3E = TJ - TV;
Chris@82 280 T2G = FMA(TF, TC, T2F);
Chris@82 281 T2I = FMA(TR, TO, T2H);
Chris@82 282 T3D = T2G - T2I;
Chris@82 283 {
Chris@82 284 E T2g, T2m, T2U, T2W;
Chris@82 285 T2g = FMA(Ty, T2f, T2e);
Chris@82 286 T2m = FMA(T2k, T2l, T2j);
Chris@82 287 T2n = T2g + T2m;
Chris@82 288 T3J = T2m - T2g;
Chris@82 289 T2U = FNMS(Ty, T2d, T2T);
Chris@82 290 T2W = FNMS(T2k, T2i, T2V);
Chris@82 291 T2X = T2U + T2W;
Chris@82 292 T3M = T2U - T2W;
Chris@82 293 }
Chris@82 294 }
/* Combine the intermediates and store the first 16 results: Rp/Ip at offsets
   0 and WS(rs, 2|4|6), Rm/Im at WS(rs, 1|3|5|7).  Every stored value carries
   the KP500000000 factor. */
Chris@82 295 {
Chris@82 296 E TX, T3o, T3i, T3s, T3l, T3t, T1x, T3e, T2p, T2M, T2K, T3d, T3a, T3c, T2z;
Chris@82 297 E T3n;
Chris@82 298 {
Chris@82 299 E Tv, TW, T3g, T3h;
Chris@82 300 Tv = Tg + Tu;
Chris@82 301 TW = TJ + TV;
Chris@82 302 TX = Tv + TW;
Chris@82 303 T3o = Tv - TW;
Chris@82 304 T3g = T2X - T2S;
Chris@82 305 T3h = T2c - T2n;
Chris@82 306 T3i = T3g + T3h;
Chris@82 307 T3s = T3g - T3h;
Chris@82 308 }
Chris@82 309 {
Chris@82 310 E T3j, T3k, T1i, T1w;
Chris@82 311 T3j = T1Y - T1N;
Chris@82 312 T3k = T38 - T33;
Chris@82 313 T3l = T3j - T3k;
Chris@82 314 T3t = T3j + T3k;
Chris@82 315 T1i = T17 + T1h;
Chris@82 316 T1w = T1s + T1v;
Chris@82 317 T1x = T1i + T1w;
Chris@82 318 T3e = T1w - T1i;
Chris@82 319 }
Chris@82 320 {
Chris@82 321 E T1Z, T2o, T2E, T2J;
Chris@82 322 T1Z = T1N + T1Y;
Chris@82 323 T2o = T2c + T2n;
Chris@82 324 T2p = T1Z - T2o;
Chris@82 325 T2M = T2o + T1Z;
Chris@82 326 T2E = T2B + T2D;
Chris@82 327 T2J = T2G + T2I;
Chris@82 328 T2K = T2E + T2J;
Chris@82 329 T3d = T2J - T2E;
Chris@82 330 }
Chris@82 331 {
Chris@82 332 E T2Y, T39, T2t, T2y;
Chris@82 333 T2Y = T2S + T2X;
Chris@82 334 T39 = T33 + T38;
Chris@82 335 T3a = T2Y - T39;
Chris@82 336 T3c = T2Y + T39;
Chris@82 337 T2t = T2q + T2s;
Chris@82 338 T2y = T2v + T2x;
Chris@82 339 T2z = T2t + T2y;
Chris@82 340 T3n = T2t - T2y;
Chris@82 341 }
Chris@82 342 {
Chris@82 343 E T1y, T3b, T2L, T2N;
Chris@82 344 T1y = TX + T1x;
Chris@82 345 Ip[0] = KP500000000 * (T1y + T2p);
Chris@82 346 Im[WS(rs, 7)] = KP500000000 * (T2p - T1y);
Chris@82 347 T3b = T2z + T2K;
Chris@82 348 Rm[WS(rs, 7)] = KP500000000 * (T3b - T3c);
Chris@82 349 Rp[0] = KP500000000 * (T3b + T3c);
Chris@82 350 T2L = T2z - T2K;
Chris@82 351 Rm[WS(rs, 3)] = KP500000000 * (T2L - T2M);
Chris@82 352 Rp[WS(rs, 4)] = KP500000000 * (T2L + T2M);
Chris@82 353 T2N = T1x - TX;
Chris@82 354 Ip[WS(rs, 4)] = KP500000000 * (T2N + T3a);
Chris@82 355 Im[WS(rs, 3)] = KP500000000 * (T3a - T2N);
Chris@82 356 }
Chris@82 357 {
Chris@82 358 E T3f, T3m, T3v, T3w;
Chris@82 359 T3f = T3d + T3e;
Chris@82 360 T3m = T3i + T3l;
Chris@82 361 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3m, T3f));
Chris@82 362 Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3m, T3f)));
Chris@82 363 T3v = T3n + T3o;
Chris@82 364 T3w = T3s + T3t;
Chris@82 365 Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3w, T3v));
Chris@82 366 Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3w, T3v));
Chris@82 367 }
Chris@82 368 {
Chris@82 369 E T3p, T3q, T3r, T3u;
Chris@82 370 T3p = T3n - T3o;
Chris@82 371 T3q = T3l - T3i;
Chris@82 372 Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3q, T3p));
Chris@82 373 Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3q, T3p));
Chris@82 374 T3r = T3e - T3d;
Chris@82 375 T3u = T3s - T3t;
Chris@82 376 Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3u, T3r));
Chris@82 377 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3u, T3r)));
Chris@82 378 }
Chris@82 379 }
/* Store the remaining 16 results: Rp/Ip at WS(rs, 1|3|5|7), Rm/Im at offsets
   0 and WS(rs, 2|4|6).  These use the KP414213562, KP707106781 and
   KP923879532 constants in addition to the KP500000000 factor. */
Chris@82 380 {
Chris@82 381 E T3z, T4b, T4g, T4q, T4j, T4r, T3G, T4m, T3O, T46, T3Z, T4l, T42, T4c, T3V;
Chris@82 382 E T47;
Chris@82 383 {
Chris@82 384 E T3x, T3y, T4e, T4f;
Chris@82 385 T3x = T1v - T1s;
Chris@82 386 T3y = T2v - T2x;
Chris@82 387 T3z = T3x - T3y;
Chris@82 388 T4b = T3y + T3x;
Chris@82 389 T4e = T3I - T3J;
Chris@82 390 T4f = T3M - T3L;
Chris@82 391 T4g = FMA(KP414213562, T4f, T4e);
Chris@82 392 T4q = FNMS(KP414213562, T4e, T4f);
Chris@82 393 }
Chris@82 394 {
Chris@82 395 E T4h, T4i, T3C, T3F;
Chris@82 396 T4h = T3Q + T3P;
Chris@82 397 T4i = T3T - T3S;
Chris@82 398 T4j = FMA(KP414213562, T4i, T4h);
Chris@82 399 T4r = FNMS(KP414213562, T4h, T4i);
Chris@82 400 T3C = T3A - T3B;
Chris@82 401 T3F = T3D + T3E;
Chris@82 402 T3G = T3C + T3F;
Chris@82 403 T4m = T3C - T3F;
Chris@82 404 }
Chris@82 405 {
Chris@82 406 E T3K, T3N, T3X, T3Y;
Chris@82 407 T3K = T3I + T3J;
Chris@82 408 T3N = T3L + T3M;
Chris@82 409 T3O = FMA(KP414213562, T3N, T3K);
Chris@82 410 T46 = FNMS(KP414213562, T3K, T3N);
Chris@82 411 T3X = T2q - T2s;
Chris@82 412 T3Y = T17 - T1h;
Chris@82 413 T3Z = T3X + T3Y;
Chris@82 414 T4l = T3X - T3Y;
Chris@82 415 }
Chris@82 416 {
Chris@82 417 E T40, T41, T3R, T3U;
Chris@82 418 T40 = T3B + T3A;
Chris@82 419 T41 = T3D - T3E;
Chris@82 420 T42 = T40 + T41;
Chris@82 421 T4c = T41 - T40;
Chris@82 422 T3R = T3P - T3Q;
Chris@82 423 T3U = T3S + T3T;
Chris@82 424 T3V = FNMS(KP414213562, T3U, T3R);
Chris@82 425 T47 = FMA(KP414213562, T3R, T3U);
Chris@82 426 }
Chris@82 427 {
Chris@82 428 E T3H, T3W, T49, T4a;
Chris@82 429 T3H = FMA(KP707106781, T3G, T3z);
Chris@82 430 T3W = T3O + T3V;
Chris@82 431 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3W, T3H));
Chris@82 432 Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3H)));
Chris@82 433 T49 = FMA(KP707106781, T42, T3Z);
Chris@82 434 T4a = T46 + T47;
Chris@82 435 Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T4a, T49));
Chris@82 436 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T4a, T49));
Chris@82 437 }
Chris@82 438 {
Chris@82 439 E T43, T44, T45, T48;
Chris@82 440 T43 = FNMS(KP707106781, T42, T3Z);
Chris@82 441 T44 = T3V - T3O;
Chris@82 442 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T44, T43));
Chris@82 443 Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T44, T43));
Chris@82 444 T45 = FNMS(KP707106781, T3G, T3z);
Chris@82 445 T48 = T46 - T47;
Chris@82 446 Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T48, T45));
Chris@82 447 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T48, T45)));
Chris@82 448 }
Chris@82 449 {
Chris@82 450 E T4d, T4k, T4t, T4u;
Chris@82 451 T4d = FNMS(KP707106781, T4c, T4b);
Chris@82 452 T4k = T4g - T4j;
Chris@82 453 Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T4k, T4d));
Chris@82 454 Im[0] = -(KP500000000 * (FNMS(KP923879532, T4k, T4d)));
Chris@82 455 T4t = FNMS(KP707106781, T4m, T4l);
Chris@82 456 T4u = T4q + T4r;
Chris@82 457 Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4u, T4t));
Chris@82 458 Rm[0] = KP500000000 * (FMA(KP923879532, T4u, T4t));
Chris@82 459 }
Chris@82 460 {
Chris@82 461 E T4n, T4o, T4p, T4s;
Chris@82 462 T4n = FMA(KP707106781, T4m, T4l);
Chris@82 463 T4o = T4g + T4j;
Chris@82 464 Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4o, T4n));
Chris@82 465 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4o, T4n));
Chris@82 466 T4p = FMA(KP707106781, T4c, T4b);
Chris@82 467 T4s = T4q - T4r;
Chris@82 468 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4s, T4p));
Chris@82 469 Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4s, T4p)));
Chris@82 470 }
Chris@82 471 }
Chris@82 472 }
Chris@82 473 }
Chris@82 474 }
Chris@82 475 }
Chris@82 476
/* Twiddle instruction table referenced by the descriptor below: four TW_CEXP
   entries followed by one TW_NEXT entry.
   NOTE(review): the third field (1, 3, 9, 15) presumably selects which twiddle
   exponent is generated for each entry -- confirm against the tw_instr
   definition, which is not visible in this file. */
Chris@82 477 static const tw_instr twinstr[] = {
Chris@82 478 {TW_CEXP, 1, 1},
Chris@82 479 {TW_CEXP, 1, 3},
Chris@82 480 {TW_CEXP, 1, 9},
Chris@82 481 {TW_CEXP, 1, 15},
Chris@82 482 {TW_NEXT, 1, 0}
Chris@82 483 };
Chris@82 484
/* Descriptor for the FMA variant: 16, the codelet name, the twiddle table and
   &GENUS.  The trailing {136, 74, 92, 0} repeats the operation counts from the
   header comment of this variant (136 additions, 74 multiplications, 92 fused
   multiply/add). */
Chris@82 485 static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, {136, 74, 92, 0} };
Chris@82 486
/* Registration entry point: passes the FMA-variant hc2cfdft2_16 and its
   descriptor to khc2c_register on planner p, with the HC2C_VIA_DFT flag. */
Chris@82 487 void X(codelet_hc2cfdft2_16) (planner *p) {
Chris@82 488 X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
Chris@82 489 }
Chris@82 490 #else
Chris@82 491
Chris@82 492 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include rdft/scalar/hc2cf.h */
Chris@82 493
Chris@82 494 /*
Chris@82 495 * This function contains 228 FP additions, 124 FP multiplications,
Chris@82 496 * (or, 188 additions, 84 multiplications, 40 fused multiply/add),
Chris@82 497 * 91 stack variables, 4 constants, and 64 memory accesses
Chris@82 498 */
Chris@82 499 #include "rdft/scalar/hc2cf.h"
Chris@82 500
/*
 * hc2cfdft2_16, non-FMA variant (compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated by genfft; see the
 * "Generated by" comment above for the exact generator options.
 *
 * The transform is done in place: in every loop iteration all 32 input values
 * are read from Rp, Ip, Rm and Im before any of the 32 results is stored back
 * into the same four arrays.
 *
 * Parameters:
 *   Rp, Ip  arrays read and written at offsets 0 and WS(rs, 1..7); both
 *           pointers advance by +ms after each iteration.
 *   Rm, Im  arrays read and written at the same offsets; both pointers advance
 *           by -ms after each iteration.
 *   W       read-only twiddle table; the loop starts at W + (mb - 1) * 8 and
 *           consumes 8 values (W[0]..W[7]) per iteration.
 *   rs      stride handed to WS() to form the element offsets.
 *   mb, me  the loop runs for m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration pointer step (see Rp/Ip/Rm/Im above).
 *
 * Returns nothing; results are stored through Rp, Ip, Rm and Im.
 */
Chris@82 501 static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@82 502 {
/* Numeric constants: 0.4619... equals cos(pi/8) / 2, 0.1913... equals
   sin(pi/8) / 2, 0.3535... equals sqrt(2) / 4, plus the plain factor 0.5. */
Chris@82 503 DK(KP461939766, +0.461939766255643378064091594698394143411208313);
Chris@82 504 DK(KP191341716, +0.191341716182544885864229992015199433380672281);
Chris@82 505 DK(KP353553390, +0.353553390593273762200422181052424519642417969);
Chris@82 506 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@82 507 {
Chris@82 508 INT m;
/* One pass per m in [mb, me).  The increment clause steps the four data
   pointers (Rp/Ip forward, Rm/Im backward by ms), moves W on by 8 and
   evaluates MAKE_VOLATILE_STRIDE(64, rs). */
Chris@82 509 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@82 510 E T1, T4, T2, T5, T7, Td, T12, TY, Tk, Ti, Tm, T1l, T1b, TL, T1h;
Chris@82 511 E Ts, TR, T17, Ty, Tz, TA, TE, T1L, T1Q, T1H, T1O, T24, T2d, T20, T2b;
/* Load the eight twiddle values W[0]..W[7] and precompute the products, sums
   and differences of them that the input stages below multiply by. */
Chris@82 512 {
Chris@82 513 E Tl, TP, Tq, TK, Tj, TQ, Tr, TJ;
Chris@82 514 {
Chris@82 515 E T3, Tc, T6, Tb;
Chris@82 516 T1 = W[0];
Chris@82 517 T4 = W[1];
Chris@82 518 T2 = W[2];
Chris@82 519 T5 = W[3];
Chris@82 520 T3 = T1 * T2;
Chris@82 521 Tc = T4 * T2;
Chris@82 522 T6 = T4 * T5;
Chris@82 523 Tb = T1 * T5;
Chris@82 524 T7 = T3 + T6;
Chris@82 525 Td = Tb - Tc;
Chris@82 526 T12 = Tb + Tc;
Chris@82 527 TY = T3 - T6;
Chris@82 528 Tk = W[5];
Chris@82 529 Tl = T4 * Tk;
Chris@82 530 TP = T2 * Tk;
Chris@82 531 Tq = T1 * Tk;
Chris@82 532 TK = T5 * Tk;
Chris@82 533 Ti = W[4];
Chris@82 534 Tj = T1 * Ti;
Chris@82 535 TQ = T5 * Ti;
Chris@82 536 Tr = T4 * Ti;
Chris@82 537 TJ = T2 * Ti;
Chris@82 538 }
Chris@82 539 Tm = Tj - Tl;
Chris@82 540 T1l = Tq - Tr;
Chris@82 541 T1b = TP + TQ;
Chris@82 542 TL = TJ + TK;
Chris@82 543 T1h = Tj + Tl;
Chris@82 544 Ts = Tq + Tr;
Chris@82 545 TR = TP - TQ;
Chris@82 546 T17 = TJ - TK;
Chris@82 547 Ty = W[6];
Chris@82 548 Tz = W[7];
Chris@82 549 TA = FMA(T1, Ty, T4 * Tz);
Chris@82 550 TE = FNMS(T4, Ty, T1 * Tz);
Chris@82 551 {
Chris@82 552 E T1J, T1K, T1F, T1G;
Chris@82 553 T1J = TY * Tk;
Chris@82 554 T1K = T12 * Ti;
Chris@82 555 T1L = T1J - T1K;
Chris@82 556 T1Q = T1J + T1K;
Chris@82 557 T1F = TY * Ti;
Chris@82 558 T1G = T12 * Tk;
Chris@82 559 T1H = T1F + T1G;
Chris@82 560 T1O = T1F - T1G;
Chris@82 561 }
Chris@82 562 {
Chris@82 563 E T22, T23, T1Y, T1Z;
Chris@82 564 T22 = T7 * Tk;
Chris@82 565 T23 = Td * Ti;
Chris@82 566 T24 = T22 + T23;
Chris@82 567 T2d = T22 - T23;
Chris@82 568 T1Y = T7 * Ti;
Chris@82 569 T1Z = Td * Tk;
Chris@82 570 T20 = T1Y - T1Z;
Chris@82 571 T2b = T1Y + T1Z;
Chris@82 572 }
Chris@82 573 }
Chris@82 574 {
Chris@82 575 E T1t, T3i, T2l, T3B, T1E, T3t, T2M, T3x, T1g, T3C, T2J, T3u, T1T, T3w, T2o;
Chris@82 576 E T3j, Tx, T3b, T2C, T3q, T27, T3m, T2s, T3c, TW, T3f, T2F, T3n, T2g, T3p;
Chris@82 577 E T2v, T3e;
/* Read the inputs at offsets WS(rs, 4) and 0 from Rp, Ip, Rm and Im and form
   intermediate terms from them and the twiddle factors. */
Chris@82 578 {
Chris@82 579 E T1k, T1C, T1o, T1B, T1s, T1z, T1y, T2j, T1p, T2k;
Chris@82 580 {
Chris@82 581 E T1i, T1j, T1m, T1n;
Chris@82 582 T1i = Ip[WS(rs, 4)];
Chris@82 583 T1j = Im[WS(rs, 4)];
Chris@82 584 T1k = T1i - T1j;
Chris@82 585 T1C = T1i + T1j;
Chris@82 586 T1m = Rp[WS(rs, 4)];
Chris@82 587 T1n = Rm[WS(rs, 4)];
Chris@82 588 T1o = T1m + T1n;
Chris@82 589 T1B = T1m - T1n;
Chris@82 590 }
Chris@82 591 {
Chris@82 592 E T1q, T1r, T1w, T1x;
Chris@82 593 T1q = Ip[0];
Chris@82 594 T1r = Im[0];
Chris@82 595 T1s = T1q - T1r;
Chris@82 596 T1z = T1q + T1r;
Chris@82 597 T1w = Rm[0];
Chris@82 598 T1x = Rp[0];
Chris@82 599 T1y = T1w - T1x;
Chris@82 600 T2j = T1x + T1w;
Chris@82 601 }
Chris@82 602 T1p = FNMS(T1l, T1o, T1h * T1k);
Chris@82 603 T1t = T1p + T1s;
Chris@82 604 T3i = T1s - T1p;
Chris@82 605 T2k = FMA(T1h, T1o, T1l * T1k);
Chris@82 606 T2l = T2j + T2k;
Chris@82 607 T3B = T2j - T2k;
Chris@82 608 {
Chris@82 609 E T1A, T1D, T2K, T2L;
Chris@82 610 T1A = FNMS(T4, T1z, T1 * T1y);
Chris@82 611 T1D = FMA(Ti, T1B, Tk * T1C);
Chris@82 612 T1E = T1A - T1D;
Chris@82 613 T3t = T1D + T1A;
Chris@82 614 T2K = FNMS(Tk, T1B, Ti * T1C);
Chris@82 615 T2L = FMA(T4, T1y, T1 * T1z);
Chris@82 616 T2M = T2K + T2L;
Chris@82 617 T3x = T2L - T2K;
Chris@82 618 }
Chris@82 619 }
/* Same for the inputs at offsets WS(rs, 2) and WS(rs, 6). */
Chris@82 620 {
Chris@82 621 E T11, T1M, T15, T1I, T1a, T1R, T1e, T1P;
Chris@82 622 {
Chris@82 623 E TZ, T10, T13, T14;
Chris@82 624 TZ = Ip[WS(rs, 2)];
Chris@82 625 T10 = Im[WS(rs, 2)];
Chris@82 626 T11 = TZ - T10;
Chris@82 627 T1M = TZ + T10;
Chris@82 628 T13 = Rp[WS(rs, 2)];
Chris@82 629 T14 = Rm[WS(rs, 2)];
Chris@82 630 T15 = T13 + T14;
Chris@82 631 T1I = T13 - T14;
Chris@82 632 }
Chris@82 633 {
Chris@82 634 E T18, T19, T1c, T1d;
Chris@82 635 T18 = Ip[WS(rs, 6)];
Chris@82 636 T19 = Im[WS(rs, 6)];
Chris@82 637 T1a = T18 - T19;
Chris@82 638 T1R = T18 + T19;
Chris@82 639 T1c = Rp[WS(rs, 6)];
Chris@82 640 T1d = Rm[WS(rs, 6)];
Chris@82 641 T1e = T1c + T1d;
Chris@82 642 T1P = T1c - T1d;
Chris@82 643 }
Chris@82 644 {
Chris@82 645 E T16, T1f, T2H, T2I;
Chris@82 646 T16 = FNMS(T12, T15, TY * T11);
Chris@82 647 T1f = FNMS(T1b, T1e, T17 * T1a);
Chris@82 648 T1g = T16 + T1f;
Chris@82 649 T3C = T16 - T1f;
Chris@82 650 T2H = FNMS(T1L, T1I, T1H * T1M);
Chris@82 651 T2I = FNMS(T1Q, T1P, T1O * T1R);
Chris@82 652 T2J = T2H + T2I;
Chris@82 653 T3u = T2H - T2I;
Chris@82 654 }
Chris@82 655 {
Chris@82 656 E T1N, T1S, T2m, T2n;
Chris@82 657 T1N = FMA(T1H, T1I, T1L * T1M);
Chris@82 658 T1S = FMA(T1O, T1P, T1Q * T1R);
Chris@82 659 T1T = T1N + T1S;
Chris@82 660 T3w = T1S - T1N;
Chris@82 661 T2m = FMA(TY, T15, T12 * T11);
Chris@82 662 T2n = FMA(T17, T1e, T1b * T1a);
Chris@82 663 T2o = T2m + T2n;
Chris@82 664 T3j = T2m - T2n;
Chris@82 665 }
Chris@82 666 }
/* Same for the inputs at offsets WS(rs, 1) and WS(rs, 5). */
Chris@82 667 {
Chris@82 668 E Ta, T1W, Tg, T1V, Tp, T25, Tv, T21;
Chris@82 669 {
Chris@82 670 E T8, T9, Te, Tf;
Chris@82 671 T8 = Ip[WS(rs, 1)];
Chris@82 672 T9 = Im[WS(rs, 1)];
Chris@82 673 Ta = T8 - T9;
Chris@82 674 T1W = T8 + T9;
Chris@82 675 Te = Rp[WS(rs, 1)];
Chris@82 676 Tf = Rm[WS(rs, 1)];
Chris@82 677 Tg = Te + Tf;
Chris@82 678 T1V = Te - Tf;
Chris@82 679 }
Chris@82 680 {
Chris@82 681 E Tn, To, Tt, Tu;
Chris@82 682 Tn = Ip[WS(rs, 5)];
Chris@82 683 To = Im[WS(rs, 5)];
Chris@82 684 Tp = Tn - To;
Chris@82 685 T25 = Tn + To;
Chris@82 686 Tt = Rp[WS(rs, 5)];
Chris@82 687 Tu = Rm[WS(rs, 5)];
Chris@82 688 Tv = Tt + Tu;
Chris@82 689 T21 = Tt - Tu;
Chris@82 690 }
Chris@82 691 {
Chris@82 692 E Th, Tw, T2A, T2B;
Chris@82 693 Th = FNMS(Td, Tg, T7 * Ta);
Chris@82 694 Tw = FNMS(Ts, Tv, Tm * Tp);
Chris@82 695 Tx = Th + Tw;
Chris@82 696 T3b = Th - Tw;
Chris@82 697 T2A = FNMS(T5, T1V, T2 * T1W);
Chris@82 698 T2B = FNMS(T24, T21, T20 * T25);
Chris@82 699 T2C = T2A + T2B;
Chris@82 700 T3q = T2A - T2B;
Chris@82 701 }
Chris@82 702 {
Chris@82 703 E T1X, T26, T2q, T2r;
Chris@82 704 T1X = FMA(T2, T1V, T5 * T1W);
Chris@82 705 T26 = FMA(T20, T21, T24 * T25);
Chris@82 706 T27 = T1X + T26;
Chris@82 707 T3m = T26 - T1X;
Chris@82 708 T2q = FMA(T7, Tg, Td * Ta);
Chris@82 709 T2r = FMA(Tm, Tv, Ts * Tp);
Chris@82 710 T2s = T2q + T2r;
Chris@82 711 T3c = T2q - T2r;
Chris@82 712 }
Chris@82 713 }
/* Same for the inputs at offsets WS(rs, 7) and WS(rs, 3). */
Chris@82 714 {
Chris@82 715 E TD, T29, TH, T28, TO, T2e, TU, T2c;
Chris@82 716 {
Chris@82 717 E TB, TC, TF, TG;
Chris@82 718 TB = Ip[WS(rs, 7)];
Chris@82 719 TC = Im[WS(rs, 7)];
Chris@82 720 TD = TB - TC;
Chris@82 721 T29 = TB + TC;
Chris@82 722 TF = Rp[WS(rs, 7)];
Chris@82 723 TG = Rm[WS(rs, 7)];
Chris@82 724 TH = TF + TG;
Chris@82 725 T28 = TF - TG;
Chris@82 726 }
Chris@82 727 {
Chris@82 728 E TM, TN, TS, TT;
Chris@82 729 TM = Ip[WS(rs, 3)];
Chris@82 730 TN = Im[WS(rs, 3)];
Chris@82 731 TO = TM - TN;
Chris@82 732 T2e = TM + TN;
Chris@82 733 TS = Rp[WS(rs, 3)];
Chris@82 734 TT = Rm[WS(rs, 3)];
Chris@82 735 TU = TS + TT;
Chris@82 736 T2c = TS - TT;
Chris@82 737 }
Chris@82 738 {
Chris@82 739 E TI, TV, T2D, T2E;
Chris@82 740 TI = FNMS(TE, TH, TA * TD);
Chris@82 741 TV = FNMS(TR, TU, TL * TO);
Chris@82 742 TW = TI + TV;
Chris@82 743 T3f = TI - TV;
Chris@82 744 T2D = FNMS(Tz, T28, Ty * T29);
Chris@82 745 T2E = FNMS(T2d, T2c, T2b * T2e);
Chris@82 746 T2F = T2D + T2E;
Chris@82 747 T3n = T2D - T2E;
Chris@82 748 }
Chris@82 749 {
Chris@82 750 E T2a, T2f, T2t, T2u;
Chris@82 751 T2a = FMA(Ty, T28, Tz * T29);
Chris@82 752 T2f = FMA(T2b, T2c, T2d * T2e);
Chris@82 753 T2g = T2a + T2f;
Chris@82 754 T3p = T2f - T2a;
Chris@82 755 T2t = FMA(TA, TH, TE * TD);
Chris@82 756 T2u = FMA(TL, TU, TR * TO);
Chris@82 757 T2v = T2t + T2u;
Chris@82 758 T3e = T2t - T2u;
Chris@82 759 }
Chris@82 760 }
/* Store the first 8 results: Rp/Ip at offsets 0 and WS(rs, 4), Rm/Im at
   WS(rs, 3) and WS(rs, 7), each multiplied by KP500000000. */
Chris@82 761 {
Chris@82 762 E T1v, T2z, T2O, T2Q, T2i, T2y, T2x, T2P;
Chris@82 763 {
Chris@82 764 E TX, T1u, T2G, T2N;
Chris@82 765 TX = Tx + TW;
Chris@82 766 T1u = T1g + T1t;
Chris@82 767 T1v = TX + T1u;
Chris@82 768 T2z = T1u - TX;
Chris@82 769 T2G = T2C + T2F;
Chris@82 770 T2N = T2J + T2M;
Chris@82 771 T2O = T2G - T2N;
Chris@82 772 T2Q = T2G + T2N;
Chris@82 773 }
Chris@82 774 {
Chris@82 775 E T1U, T2h, T2p, T2w;
Chris@82 776 T1U = T1E - T1T;
Chris@82 777 T2h = T27 + T2g;
Chris@82 778 T2i = T1U - T2h;
Chris@82 779 T2y = T2h + T1U;
Chris@82 780 T2p = T2l + T2o;
Chris@82 781 T2w = T2s + T2v;
Chris@82 782 T2x = T2p - T2w;
Chris@82 783 T2P = T2p + T2w;
Chris@82 784 }
Chris@82 785 Ip[0] = KP500000000 * (T1v + T2i);
Chris@82 786 Rp[0] = KP500000000 * (T2P + T2Q);
Chris@82 787 Im[WS(rs, 7)] = KP500000000 * (T2i - T1v);
Chris@82 788 Rm[WS(rs, 7)] = KP500000000 * (T2P - T2Q);
Chris@82 789 Rm[WS(rs, 3)] = KP500000000 * (T2x - T2y);
Chris@82 790 Im[WS(rs, 3)] = KP500000000 * (T2O - T2z);
Chris@82 791 Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
Chris@82 792 Ip[WS(rs, 4)] = KP500000000 * (T2z + T2O);
Chris@82 793 }
/* Store the next 8 results: Rp/Ip at WS(rs, 2) and WS(rs, 6), Rm/Im at
   WS(rs, 1) and WS(rs, 5).  The terms are pre-multiplied by KP500000000 or
   KP353553390 before being added or subtracted. */
Chris@82 794 {
Chris@82 795 E T2T, T35, T33, T39, T2W, T36, T2Z, T37;
Chris@82 796 {
Chris@82 797 E T2R, T2S, T31, T32;
Chris@82 798 T2R = T2v - T2s;
Chris@82 799 T2S = T1t - T1g;
Chris@82 800 T2T = KP500000000 * (T2R + T2S);
Chris@82 801 T35 = KP500000000 * (T2S - T2R);
Chris@82 802 T31 = T2l - T2o;
Chris@82 803 T32 = Tx - TW;
Chris@82 804 T33 = KP500000000 * (T31 - T32);
Chris@82 805 T39 = KP500000000 * (T31 + T32);
Chris@82 806 }
Chris@82 807 {
Chris@82 808 E T2U, T2V, T2X, T2Y;
Chris@82 809 T2U = T2F - T2C;
Chris@82 810 T2V = T27 - T2g;
Chris@82 811 T2W = T2U + T2V;
Chris@82 812 T36 = T2U - T2V;
Chris@82 813 T2X = T1T + T1E;
Chris@82 814 T2Y = T2M - T2J;
Chris@82 815 T2Z = T2X - T2Y;
Chris@82 816 T37 = T2X + T2Y;
Chris@82 817 }
Chris@82 818 {
Chris@82 819 E T30, T3a, T34, T38;
Chris@82 820 T30 = KP353553390 * (T2W + T2Z);
Chris@82 821 Ip[WS(rs, 2)] = T2T + T30;
Chris@82 822 Im[WS(rs, 5)] = T30 - T2T;
Chris@82 823 T3a = KP353553390 * (T36 + T37);
Chris@82 824 Rm[WS(rs, 5)] = T39 - T3a;
Chris@82 825 Rp[WS(rs, 2)] = T39 + T3a;
Chris@82 826 T34 = KP353553390 * (T2Z - T2W);
Chris@82 827 Rm[WS(rs, 1)] = T33 - T34;
Chris@82 828 Rp[WS(rs, 6)] = T33 + T34;
Chris@82 829 T38 = KP353553390 * (T36 - T37);
Chris@82 830 Ip[WS(rs, 6)] = T35 + T38;
Chris@82 831 Im[WS(rs, 1)] = T38 - T35;
Chris@82 832 }
Chris@82 833 }
/* Store the remaining 16 results: Rp/Ip at WS(rs, 1|3|5|7), Rm/Im at offsets
   0 and WS(rs, 2|4|6).  These terms are scaled by KP500000000, KP353553390,
   KP461939766 and KP191341716. */
Chris@82 834 {
Chris@82 835 E T3k, T3Q, T3Z, T3D, T3h, T40, T3X, T45, T3G, T3P, T3s, T3K, T3U, T44, T3z;
Chris@82 836 E T3L;
Chris@82 837 {
Chris@82 838 E T3d, T3g, T3o, T3r;
Chris@82 839 T3k = KP500000000 * (T3i - T3j);
Chris@82 840 T3Q = KP500000000 * (T3j + T3i);
Chris@82 841 T3Z = KP500000000 * (T3B - T3C);
Chris@82 842 T3D = KP500000000 * (T3B + T3C);
Chris@82 843 T3d = T3b - T3c;
Chris@82 844 T3g = T3e + T3f;
Chris@82 845 T3h = KP353553390 * (T3d + T3g);
Chris@82 846 T40 = KP353553390 * (T3d - T3g);
Chris@82 847 {
Chris@82 848 E T3V, T3W, T3E, T3F;
Chris@82 849 T3V = T3u + T3t;
Chris@82 850 T3W = T3x - T3w;
Chris@82 851 T3X = FNMS(KP461939766, T3W, KP191341716 * T3V);
Chris@82 852 T45 = FMA(KP461939766, T3V, KP191341716 * T3W);
Chris@82 853 T3E = T3c + T3b;
Chris@82 854 T3F = T3e - T3f;
Chris@82 855 T3G = KP353553390 * (T3E + T3F);
Chris@82 856 T3P = KP353553390 * (T3F - T3E);
Chris@82 857 }
Chris@82 858 T3o = T3m + T3n;
Chris@82 859 T3r = T3p - T3q;
Chris@82 860 T3s = FMA(KP191341716, T3o, KP461939766 * T3r);
Chris@82 861 T3K = FNMS(KP191341716, T3r, KP461939766 * T3o);
Chris@82 862 {
Chris@82 863 E T3S, T3T, T3v, T3y;
Chris@82 864 T3S = T3n - T3m;
Chris@82 865 T3T = T3q + T3p;
Chris@82 866 T3U = FMA(KP461939766, T3S, KP191341716 * T3T);
Chris@82 867 T44 = FNMS(KP461939766, T3T, KP191341716 * T3S);
Chris@82 868 T3v = T3t - T3u;
Chris@82 869 T3y = T3w + T3x;
Chris@82 870 T3z = FNMS(KP191341716, T3y, KP461939766 * T3v);
Chris@82 871 T3L = FMA(KP191341716, T3v, KP461939766 * T3y);
Chris@82 872 }
Chris@82 873 }
Chris@82 874 {
Chris@82 875 E T3l, T3A, T3N, T3O;
Chris@82 876 T3l = T3h + T3k;
Chris@82 877 T3A = T3s + T3z;
Chris@82 878 Ip[WS(rs, 1)] = T3l + T3A;
Chris@82 879 Im[WS(rs, 6)] = T3A - T3l;
Chris@82 880 T3N = T3D + T3G;
Chris@82 881 T3O = T3K + T3L;
Chris@82 882 Rm[WS(rs, 6)] = T3N - T3O;
Chris@82 883 Rp[WS(rs, 1)] = T3N + T3O;
Chris@82 884 }
Chris@82 885 {
Chris@82 886 E T3H, T3I, T3J, T3M;
Chris@82 887 T3H = T3D - T3G;
Chris@82 888 T3I = T3z - T3s;
Chris@82 889 Rm[WS(rs, 2)] = T3H - T3I;
Chris@82 890 Rp[WS(rs, 5)] = T3H + T3I;
Chris@82 891 T3J = T3k - T3h;
Chris@82 892 T3M = T3K - T3L;
Chris@82 893 Ip[WS(rs, 5)] = T3J + T3M;
Chris@82 894 Im[WS(rs, 2)] = T3M - T3J;
Chris@82 895 }
Chris@82 896 {
Chris@82 897 E T3R, T3Y, T47, T48;
Chris@82 898 T3R = T3P + T3Q;
Chris@82 899 T3Y = T3U + T3X;
Chris@82 900 Ip[WS(rs, 3)] = T3R + T3Y;
Chris@82 901 Im[WS(rs, 4)] = T3Y - T3R;
Chris@82 902 T47 = T3Z + T40;
Chris@82 903 T48 = T44 + T45;
Chris@82 904 Rm[WS(rs, 4)] = T47 - T48;
Chris@82 905 Rp[WS(rs, 3)] = T47 + T48;
Chris@82 906 }
Chris@82 907 {
Chris@82 908 E T41, T42, T43, T46;
Chris@82 909 T41 = T3Z - T40;
Chris@82 910 T42 = T3X - T3U;
Chris@82 911 Rm[0] = T41 - T42;
Chris@82 912 Rp[WS(rs, 7)] = T41 + T42;
Chris@82 913 T43 = T3Q - T3P;
Chris@82 914 T46 = T44 - T45;
Chris@82 915 Ip[WS(rs, 7)] = T43 + T46;
Chris@82 916 Im[0] = T46 - T43;
Chris@82 917 }
Chris@82 918 }
Chris@82 919 }
Chris@82 920 }
Chris@82 921 }
Chris@82 922 }
Chris@82 923
/* Twiddle instruction table referenced by the descriptor below: four TW_CEXP
   entries followed by one TW_NEXT entry.
   NOTE(review): the third field (1, 3, 9, 15) presumably selects which twiddle
   exponent is generated for each entry -- confirm against the tw_instr
   definition, which is not visible in this file. */
Chris@82 924 static const tw_instr twinstr[] = {
Chris@82 925 {TW_CEXP, 1, 1},
Chris@82 926 {TW_CEXP, 1, 3},
Chris@82 927 {TW_CEXP, 1, 9},
Chris@82 928 {TW_CEXP, 1, 15},
Chris@82 929 {TW_NEXT, 1, 0}
Chris@82 930 };
Chris@82 931
/* Descriptor for the non-FMA variant: 16, the codelet name, the twiddle table
   and &GENUS.  The trailing {188, 84, 40, 0} repeats the operation counts from
   the header comment of this variant (188 additions, 84 multiplications,
   40 fused multiply/add). */
Chris@82 932 static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, {188, 84, 40, 0} };
Chris@82 933
/* Registration entry point: passes the non-FMA hc2cfdft2_16 and its descriptor
   to khc2c_register on planner p, with the HC2C_VIA_DFT flag. */
Chris@82 934 void X(codelet_hc2cfdft2_16) (planner *p) {
Chris@82 935 X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
Chris@82 936 }
Chris@82 937 #endif