annotate src/fftw-3.3.5/rdft/scalar/r2cf/hc2cfdft2_16.c @ 83:ae30d91d2ffe

Replace these with versions built using an older toolset (so as to avoid ABI incompatibilities when linking on Ubuntu 14.04 for packaging purposes)
author Chris Cannam
date Fri, 07 Feb 2020 11:51:13 +0000
parents 2cd0e3b3e1fd
children
rev   line source
Chris@42 1 /*
Chris@42 2 * Copyright (c) 2003, 2007-14 Matteo Frigo
Chris@42 3 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
Chris@42 4 *
Chris@42 5 * This program is free software; you can redistribute it and/or modify
Chris@42 6 * it under the terms of the GNU General Public License as published by
Chris@42 7 * the Free Software Foundation; either version 2 of the License, or
Chris@42 8 * (at your option) any later version.
Chris@42 9 *
Chris@42 10 * This program is distributed in the hope that it will be useful,
Chris@42 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
Chris@42 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Chris@42 13 * GNU General Public License for more details.
Chris@42 14 *
Chris@42 15 * You should have received a copy of the GNU General Public License
Chris@42 16 * along with this program; if not, write to the Free Software
Chris@42 17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Chris@42 18 *
Chris@42 19 */
Chris@42 20
Chris@42 21 /* This file was automatically generated --- DO NOT EDIT */
Chris@42 22 /* Generated on Sat Jul 30 16:48:58 EDT 2016 */
Chris@42 23
Chris@42 24 #include "codelet-rdft.h"
Chris@42 25
Chris@42 26 #ifdef HAVE_FMA
Chris@42 27
Chris@42 28 /* Generated by: ../../../genfft/gen_hc2cdft.native -fma -reorder-insns -schedule-for-pipeline -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include hc2cf.h */
Chris@42 29
Chris@42 30 /*
Chris@42 31 * This function contains 228 FP additions, 166 FP multiplications,
Chris@42 32 * (or, 136 additions, 74 multiplications, 92 fused multiply/add),
Chris@42 33 * 103 stack variables, 4 constants, and 64 memory accesses
Chris@42 34 */
Chris@42 35 #include "hc2cf.h"
Chris@42 36
/*
 * hc2cfdft2_16 -- machine-generated codelet body, variant compiled when
 * HAVE_FMA is defined (see the generator command line above: -n 16 -dit
 * -twiddle-log3 -precompute-twiddles).
 *
 * Parameters, as they are used in this function:
 *   Rp, Ip, Rm, Im  arrays that are both read and overwritten.  Elements are
 *                   addressed as X[0] and X[WS(rs, k)] for k = 1..7.
 *   W               twiddle table.  Eight values W[0]..W[7] are read per
 *                   iteration; the pointer starts at W + (mb - 1) * 8 and
 *                   advances by 8 each iteration.
 *   rs              stride handed to WS() for indexing within each array.
 *   mb, me          loop bounds: the body runs for m = mb, ..., me - 1.
 *   ms              per-iteration step: Rp and Ip move by +ms, Rm and Im
 *                   move by -ms.
 *
 * Constants: KP500000000 is 0.5 (every stored result is scaled by it);
 * KP707106781, KP923879532 and KP414213562 are numerically sqrt(1/2),
 * cos(pi/8) and sqrt(2) - 1 respectively.
 *
 * Within one iteration all 32 input elements (Rp/Ip/Rm/Im at indices 0..7)
 * are loaded before the first store, and all 32 are then overwritten.
 */
Chris@42 37 static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 38 {
Chris@42 39 DK(KP923879532, +0.923879532511286756128183189396788286822416626);
Chris@42 40 DK(KP414213562, +0.414213562373095048801688724209698078569671875);
Chris@42 41 DK(KP707106781, +0.707106781186547524400844362104849039284835938);
Chris@42 42 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 43 {
Chris@42 44 INT m;
Chris@42 45 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
/* Declared at loop scope because they are assigned inside the nested blocks
 * but consumed by the last four stores at the bottom of the loop body. */
Chris@42 46 E T4p, T4o, T4n, T4s;
Chris@42 47 {
Chris@42 48 E T1, T2, Tw, Ty, Th, T3, Tx, TE, Ti, TK, Tj, T4, T5;
/* Load the eight stored twiddle values W[0]..W[7]; the other twiddle factors
 * used below are formed from products of these. */
Chris@42 49 T1 = W[0];
Chris@42 50 T2 = W[2];
Chris@42 51 Tw = W[6];
Chris@42 52 Ty = W[7];
Chris@42 53 Th = W[4];
Chris@42 54 T3 = T1 * T2;
Chris@42 55 Tx = T1 * Tw;
Chris@42 56 TE = T1 * Ty;
Chris@42 57 Ti = T1 * Th;
Chris@42 58 TK = T2 * Th;
Chris@42 59 Tj = W[5];
Chris@42 60 T4 = W[1];
Chris@42 61 T5 = W[3];
Chris@42 62 {
Chris@42 63 E T1v, T2q, T1s, T2s, T38, T3T, T1Y, T3P, T17, T1h, T2x, T2v, T33, T3Q, T3S;
Chris@42 64 E T1N, Tv, T3A, T2E, T3B, T3L, T2c, T3I, T2S, TW, T3E, T3J, T2n, T3D, T2J;
Chris@42 65 E T3M, T2X;
Chris@42 66 {
Chris@42 67 E TF, Tk, Tz, TL, T6, TR, Tq, Tc, T2h, T25, T2k, T29, T1G, T1M, T2P;
Chris@42 68 E T2R;
Chris@42 69 {
Chris@42 70 E T18, TY, T1d, T13, T1H, T1A, T1K, T1E, T37, T1R, T35, T1X;
Chris@42 71 {
Chris@42 72 E T1j, T1o, T1W, T1p, T1m, T1Q, T1U, T1q;
Chris@42 73 {
Chris@42 74 E T1k, T1l, T1S, T1T;
Chris@42 75 {
Chris@42 76 E T1t, T28, T24, T1D, T1z, T1u, TQ, Tp, Tb;
Chris@42 77 T1t = Ip[0];
Chris@42 78 TQ = T2 * Tj;
Chris@42 79 Tp = T1 * Tj;
Chris@42 80 TF = FNMS(T4, Tw, TE);
Chris@42 81 T1j = FMA(T4, Tj, Ti);
Chris@42 82 Tk = FNMS(T4, Tj, Ti);
Chris@42 83 Tz = FMA(T4, Ty, Tx);
Chris@42 84 T18 = FNMS(T5, Tj, TK);
Chris@42 85 TL = FMA(T5, Tj, TK);
Chris@42 86 TY = FNMS(T4, T5, T3);
Chris@42 87 T6 = FMA(T4, T5, T3);
Chris@42 88 Tb = T1 * T5;
Chris@42 89 TR = FNMS(T5, Th, TQ);
Chris@42 90 T1d = FMA(T5, Th, TQ);
Chris@42 91 Tq = FMA(T4, Th, Tp);
Chris@42 92 T1o = FNMS(T4, Th, Tp);
Chris@42 93 T28 = T6 * Tj;
Chris@42 94 T24 = T6 * Th;
Chris@42 95 T1D = TY * Tj;
Chris@42 96 T1z = TY * Th;
Chris@42 97 Tc = FNMS(T4, T2, Tb);
Chris@42 98 T13 = FMA(T4, T2, Tb);
Chris@42 99 T1u = Im[0];
Chris@42 100 T1k = Ip[WS(rs, 4)];
Chris@42 101 T2h = FMA(Tc, Tj, T24);
Chris@42 102 T25 = FNMS(Tc, Tj, T24);
Chris@42 103 T2k = FNMS(Tc, Th, T28);
Chris@42 104 T29 = FMA(Tc, Th, T28);
Chris@42 105 T1H = FNMS(T13, Tj, T1z);
Chris@42 106 T1A = FMA(T13, Tj, T1z);
Chris@42 107 T1K = FMA(T13, Th, T1D);
Chris@42 108 T1E = FNMS(T13, Th, T1D);
Chris@42 109 T1W = T1t + T1u;
Chris@42 110 T1v = T1t - T1u;
Chris@42 111 T1l = Im[WS(rs, 4)];
Chris@42 112 }
Chris@42 113 T1S = Rm[0];
Chris@42 114 T1T = Rp[0];
Chris@42 115 T1p = Rp[WS(rs, 4)];
Chris@42 116 T1m = T1k - T1l;
Chris@42 117 T1Q = T1k + T1l;
Chris@42 118 T2q = T1T + T1S;
Chris@42 119 T1U = T1S - T1T;
Chris@42 120 T1q = Rm[WS(rs, 4)];
Chris@42 121 }
Chris@42 122 {
Chris@42 123 E T36, T1V, T1O, T1r, T1n, T1P, T34, T2r;
Chris@42 124 T36 = T4 * T1U;
Chris@42 125 T1V = T1 * T1U;
Chris@42 126 T1O = T1q - T1p;
Chris@42 127 T1r = T1p + T1q;
Chris@42 128 T1n = T1j * T1m;
Chris@42 129 T37 = FMA(T1, T1W, T36);
Chris@42 130 T2r = T1j * T1r;
Chris@42 131 T1P = Th * T1O;
Chris@42 132 T34 = Tj * T1O;
Chris@42 133 T1s = FNMS(T1o, T1r, T1n);
Chris@42 134 T2s = FMA(T1o, T1m, T2r);
Chris@42 135 T1R = FNMS(Tj, T1Q, T1P);
Chris@42 136 T35 = FMA(Th, T1Q, T34);
Chris@42 137 T1X = FNMS(T4, T1W, T1V);
Chris@42 138 }
Chris@42 139 }
Chris@42 140 {
Chris@42 141 E T1F, T11, T1e, T16, T1L, T1b, T1f, T1C, T2Z;
Chris@42 142 {
Chris@42 143 E T14, T15, TZ, T10, T19, T1a, T1B;
Chris@42 144 TZ = Ip[WS(rs, 2)];
Chris@42 145 T10 = Im[WS(rs, 2)];
Chris@42 146 T38 = T35 + T37;
Chris@42 147 T3T = T37 - T35;
Chris@42 148 T1Y = T1R + T1X;
Chris@42 149 T3P = T1X - T1R;
Chris@42 150 T1F = TZ + T10;
Chris@42 151 T11 = TZ - T10;
Chris@42 152 T14 = Rp[WS(rs, 2)];
Chris@42 153 T15 = Rm[WS(rs, 2)];
Chris@42 154 T19 = Ip[WS(rs, 6)];
Chris@42 155 T1a = Im[WS(rs, 6)];
Chris@42 156 T1e = Rp[WS(rs, 6)];
Chris@42 157 T16 = T14 + T15;
Chris@42 158 T1B = T15 - T14;
Chris@42 159 T1L = T19 + T1a;
Chris@42 160 T1b = T19 - T1a;
Chris@42 161 T1f = Rm[WS(rs, 6)];
Chris@42 162 T1C = T1A * T1B;
Chris@42 163 T2Z = T1E * T1B;
Chris@42 164 }
Chris@42 165 {
Chris@42 166 E T1J, T31, T2u, T30, T32;
Chris@42 167 {
Chris@42 168 E T12, T1g, T1I, T1c, T2w;
Chris@42 169 T12 = TY * T11;
Chris@42 170 T1g = T1e + T1f;
Chris@42 171 T1I = T1f - T1e;
Chris@42 172 T1c = T18 * T1b;
Chris@42 173 T17 = FNMS(T13, T16, T12);
Chris@42 174 T2w = T18 * T1g;
Chris@42 175 T1J = T1H * T1I;
Chris@42 176 T31 = T1K * T1I;
Chris@42 177 T1h = FNMS(T1d, T1g, T1c);
Chris@42 178 T2x = FMA(T1d, T1b, T2w);
Chris@42 179 }
Chris@42 180 T2u = TY * T16;
Chris@42 181 T30 = FMA(T1A, T1F, T2Z);
Chris@42 182 T32 = FMA(T1H, T1L, T31);
Chris@42 183 T1G = FNMS(T1E, T1F, T1C);
Chris@42 184 T2v = FMA(T13, T11, T2u);
Chris@42 185 T1M = FNMS(T1K, T1L, T1J);
Chris@42 186 T33 = T30 + T32;
Chris@42 187 T3Q = T30 - T32;
Chris@42 188 }
Chris@42 189 }
Chris@42 190 }
Chris@42 191 {
Chris@42 192 E Tl, T22, T9, T20, Tf, T2O, Ta, T21, T2A, Tm, Tr, Ts;
Chris@42 193 {
Chris@42 194 E T7, T8, Td, Te;
Chris@42 195 T7 = Ip[WS(rs, 1)];
Chris@42 196 T3S = T1G - T1M;
Chris@42 197 T1N = T1G + T1M;
Chris@42 198 T8 = Im[WS(rs, 1)];
Chris@42 199 Td = Rp[WS(rs, 1)];
Chris@42 200 Te = Rm[WS(rs, 1)];
Chris@42 201 Tl = Ip[WS(rs, 5)];
Chris@42 202 T22 = T7 + T8;
Chris@42 203 T9 = T7 - T8;
Chris@42 204 T20 = Td - Te;
Chris@42 205 Tf = Td + Te;
Chris@42 206 T2O = T2 * T22;
Chris@42 207 Ta = T6 * T9;
Chris@42 208 T21 = T2 * T20;
Chris@42 209 T2A = T6 * Tf;
Chris@42 210 Tm = Im[WS(rs, 5)];
Chris@42 211 Tr = Rp[WS(rs, 5)];
Chris@42 212 Ts = Rm[WS(rs, 5)];
Chris@42 213 }
Chris@42 214 {
Chris@42 215 E Tg, T2a, Tn, T26, T2Q, T27, T2C, T2B, Tu, Tt, To, T23, T2D, T2b;
Chris@42 216 Tg = FNMS(Tc, Tf, Ta);
Chris@42 217 T2a = Tl + Tm;
Chris@42 218 Tn = Tl - Tm;
Chris@42 219 T26 = Tr - Ts;
Chris@42 220 Tt = Tr + Ts;
Chris@42 221 T2Q = T25 * T2a;
Chris@42 222 To = Tk * Tn;
Chris@42 223 T27 = T25 * T26;
Chris@42 224 T2C = Tk * Tt;
Chris@42 225 T2B = FMA(Tc, T9, T2A);
Chris@42 226 Tu = FNMS(Tq, Tt, To);
Chris@42 227 T23 = FMA(T5, T22, T21);
Chris@42 228 T2D = FMA(Tq, Tn, T2C);
Chris@42 229 T2b = FMA(T29, T2a, T27);
Chris@42 230 Tv = Tg + Tu;
Chris@42 231 T3A = Tg - Tu;
Chris@42 232 T2P = FNMS(T5, T20, T2O);
Chris@42 233 T2E = T2B + T2D;
Chris@42 234 T3B = T2B - T2D;
Chris@42 235 T3L = T2b - T23;
Chris@42 236 T2c = T23 + T2b;
Chris@42 237 T2R = FNMS(T29, T26, T2Q);
Chris@42 238 }
Chris@42 239 }
Chris@42 240 {
Chris@42 241 E T2f, TC, T2T, TD, T2d, TI, TS, T2e, T2F, T2l, TO, TT;
Chris@42 242 {
Chris@42 243 E TG, TH, TA, TB, TM, TN;
Chris@42 244 TA = Ip[WS(rs, 7)];
Chris@42 245 TB = Im[WS(rs, 7)];
Chris@42 246 TG = Rp[WS(rs, 7)];
Chris@42 247 T3I = T2R - T2P;
Chris@42 248 T2S = T2P + T2R;
Chris@42 249 T2f = TA + TB;
Chris@42 250 TC = TA - TB;
Chris@42 251 TH = Rm[WS(rs, 7)];
Chris@42 252 TM = Ip[WS(rs, 3)];
Chris@42 253 T2T = Tw * T2f;
Chris@42 254 TD = Tz * TC;
Chris@42 255 T2d = TG - TH;
Chris@42 256 TI = TG + TH;
Chris@42 257 TN = Im[WS(rs, 3)];
Chris@42 258 TS = Rp[WS(rs, 3)];
Chris@42 259 T2e = Tw * T2d;
Chris@42 260 T2F = Tz * TI;
Chris@42 261 T2l = TM + TN;
Chris@42 262 TO = TM - TN;
Chris@42 263 TT = Rm[WS(rs, 3)];
Chris@42 264 }
Chris@42 265 {
Chris@42 266 E TJ, T2V, TP, T2i, TU, T2G;
Chris@42 267 TJ = FNMS(TF, TI, TD);
Chris@42 268 T2V = T2h * T2l;
Chris@42 269 TP = TL * TO;
Chris@42 270 T2i = TS - TT;
Chris@42 271 TU = TS + TT;
Chris@42 272 T2G = FMA(TF, TC, T2F);
Chris@42 273 {
Chris@42 274 E T2g, T2j, TV, T2H;
Chris@42 275 T2g = FMA(Ty, T2f, T2e);
Chris@42 276 T2j = T2h * T2i;
Chris@42 277 TV = FNMS(TR, TU, TP);
Chris@42 278 T2H = TL * TU;
Chris@42 279 {
Chris@42 280 E T2U, T2m, T2I, T2W;
Chris@42 281 T2U = FNMS(Ty, T2d, T2T);
Chris@42 282 T2m = FMA(T2k, T2l, T2j);
Chris@42 283 TW = TJ + TV;
Chris@42 284 T3E = TJ - TV;
Chris@42 285 T2I = FMA(TR, TO, T2H);
Chris@42 286 T2W = FNMS(T2k, T2i, T2V);
Chris@42 287 T3J = T2m - T2g;
Chris@42 288 T2n = T2g + T2m;
Chris@42 289 T3D = T2G - T2I;
Chris@42 290 T2J = T2G + T2I;
Chris@42 291 T3M = T2U - T2W;
Chris@42 292 T2X = T2U + T2W;
Chris@42 293 }
Chris@42 294 }
Chris@42 295 }
Chris@42 296 }
Chris@42 297 }
/* Every input element has been read by this point; the remainder of the
 * loop body combines the intermediate values and stores the 32 results. */
Chris@42 298 {
Chris@42 299 E T3Y, T3x, T3X, T3y, T3r, T3q, T3p, T3u;
Chris@42 300 {
Chris@42 301 E T2Y, T3o, TX, T3s, T3i, T39, T3t, T3l, T3e, T1x, T2M, T2p, T3d, T2K, T2t;
Chris@42 302 E T2y;
Chris@42 303 {
Chris@42 304 E T2o, T1Z, T3j, T3k, T1i, T1w, T3g, T3h;
Chris@42 305 T2Y = T2S + T2X;
Chris@42 306 T3g = T2X - T2S;
Chris@42 307 T3h = T2c - T2n;
Chris@42 308 T2o = T2c + T2n;
Chris@42 309 T1Z = T1N + T1Y;
Chris@42 310 T3j = T1Y - T1N;
Chris@42 311 T3o = Tv - TW;
Chris@42 312 TX = Tv + TW;
Chris@42 313 T3s = T3g - T3h;
Chris@42 314 T3i = T3g + T3h;
Chris@42 315 T3k = T38 - T33;
Chris@42 316 T39 = T33 + T38;
Chris@42 317 T3Y = T17 - T1h;
Chris@42 318 T1i = T17 + T1h;
Chris@42 319 T1w = T1s + T1v;
Chris@42 320 T3x = T1v - T1s;
Chris@42 321 T3t = T3j + T3k;
Chris@42 322 T3l = T3j - T3k;
Chris@42 323 T3e = T1w - T1i;
Chris@42 324 T1x = T1i + T1w;
Chris@42 325 T2M = T2o + T1Z;
Chris@42 326 T2p = T1Z - T2o;
Chris@42 327 T3d = T2J - T2E;
Chris@42 328 T2K = T2E + T2J;
Chris@42 329 T3X = T2q - T2s;
Chris@42 330 T2t = T2q + T2s;
Chris@42 331 T2y = T2v + T2x;
Chris@42 332 T3y = T2v - T2x;
Chris@42 333 }
Chris@42 334 {
Chris@42 335 E T2N, T3c, T3a, T3n, T3b, T2L, T2z, T1y;
Chris@42 336 T2N = T1x - TX;
Chris@42 337 T1y = TX + T1x;
Chris@42 338 T3c = T2Y + T39;
Chris@42 339 T3a = T2Y - T39;
Chris@42 340 T3n = T2t - T2y;
Chris@42 341 T2z = T2t + T2y;
Chris@42 342 Ip[0] = KP500000000 * (T1y + T2p);
Chris@42 343 Im[WS(rs, 7)] = KP500000000 * (T2p - T1y);
Chris@42 344 T3b = T2z + T2K;
Chris@42 345 T2L = T2z - T2K;
Chris@42 346 {
Chris@42 347 E T3f, T3m, T3v, T3w;
Chris@42 348 T3r = T3e - T3d;
Chris@42 349 T3f = T3d + T3e;
Chris@42 350 Im[WS(rs, 3)] = KP500000000 * (T3a - T2N);
Chris@42 351 Ip[WS(rs, 4)] = KP500000000 * (T2N + T3a);
Chris@42 352 Rp[WS(rs, 4)] = KP500000000 * (T2L + T2M);
Chris@42 353 Rm[WS(rs, 3)] = KP500000000 * (T2L - T2M);
Chris@42 354 Rp[0] = KP500000000 * (T3b + T3c);
Chris@42 355 Rm[WS(rs, 7)] = KP500000000 * (T3b - T3c);
Chris@42 356 T3m = T3i + T3l;
Chris@42 357 T3q = T3l - T3i;
Chris@42 358 T3p = T3n - T3o;
Chris@42 359 T3v = T3n + T3o;
Chris@42 360 T3w = T3s + T3t;
Chris@42 361 T3u = T3s - T3t;
Chris@42 362 Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3m, T3f)));
Chris@42 363 Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3m, T3f));
Chris@42 364 Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3w, T3v));
Chris@42 365 Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3w, T3v));
Chris@42 366 }
Chris@42 367 }
Chris@42 368 }
Chris@42 369 {
Chris@42 370 E T3R, T4b, T3z, T4q, T4g, T3U, T40, T41, T4r, T4j, T4m, T3G, T46, T3O, T4l;
Chris@42 371 E T3Z, T4c;
Chris@42 372 {
Chris@42 373 E T3K, T3N, T4h, T4i, T3C, T3F, T4e, T4f;
Chris@42 374 Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3q, T3p));
Chris@42 375 Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3q, T3p));
Chris@42 376 Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3u, T3r)));
Chris@42 377 Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3u, T3r));
Chris@42 378 T3K = T3I + T3J;
Chris@42 379 T4e = T3I - T3J;
Chris@42 380 T4f = T3M - T3L;
Chris@42 381 T3N = T3L + T3M;
Chris@42 382 T3R = T3P - T3Q;
Chris@42 383 T4h = T3Q + T3P;
Chris@42 384 T4b = T3y + T3x;
Chris@42 385 T3z = T3x - T3y;
Chris@42 386 T4q = FNMS(KP414213562, T4e, T4f);
Chris@42 387 T4g = FMA(KP414213562, T4f, T4e);
Chris@42 388 T4i = T3T - T3S;
Chris@42 389 T3U = T3S + T3T;
Chris@42 390 T40 = T3B + T3A;
Chris@42 391 T3C = T3A - T3B;
Chris@42 392 T3F = T3D + T3E;
Chris@42 393 T41 = T3D - T3E;
Chris@42 394 T4r = FNMS(KP414213562, T4h, T4i);
Chris@42 395 T4j = FMA(KP414213562, T4i, T4h);
Chris@42 396 T4m = T3C - T3F;
Chris@42 397 T3G = T3C + T3F;
Chris@42 398 T46 = FNMS(KP414213562, T3K, T3N);
Chris@42 399 T3O = FMA(KP414213562, T3N, T3K);
Chris@42 400 T4l = T3X - T3Y;
Chris@42 401 T3Z = T3X + T3Y;
Chris@42 402 }
Chris@42 403 {
Chris@42 404 E T45, T3H, T42, T47, T3V;
Chris@42 405 T45 = FNMS(KP707106781, T3G, T3z);
Chris@42 406 T3H = FMA(KP707106781, T3G, T3z);
Chris@42 407 T4c = T41 - T40;
Chris@42 408 T42 = T40 + T41;
Chris@42 409 T47 = FMA(KP414213562, T3R, T3U);
Chris@42 410 T3V = FNMS(KP414213562, T3U, T3R);
Chris@42 411 {
Chris@42 412 E T49, T43, T48, T4a, T44, T3W;
Chris@42 413 T49 = FMA(KP707106781, T42, T3Z);
Chris@42 414 T43 = FNMS(KP707106781, T42, T3Z);
Chris@42 415 T48 = T46 - T47;
Chris@42 416 T4a = T46 + T47;
Chris@42 417 T44 = T3V - T3O;
Chris@42 418 T3W = T3O + T3V;
Chris@42 419 Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T4a, T49));
Chris@42 420 Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T4a, T49));
Chris@42 421 Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T44, T43));
Chris@42 422 Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T44, T43));
Chris@42 423 Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3H)));
Chris@42 424 Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3W, T3H));
Chris@42 425 Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T48, T45));
Chris@42 426 Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T48, T45)));
Chris@42 427 }
Chris@42 428 }
Chris@42 429 {
Chris@42 430 E T4d, T4k, T4t, T4u;
Chris@42 431 T4p = FMA(KP707106781, T4c, T4b);
Chris@42 432 T4d = FNMS(KP707106781, T4c, T4b);
Chris@42 433 T4k = T4g - T4j;
Chris@42 434 T4o = T4g + T4j;
Chris@42 435 T4n = FMA(KP707106781, T4m, T4l);
Chris@42 436 T4t = FNMS(KP707106781, T4m, T4l);
Chris@42 437 T4u = T4q + T4r;
Chris@42 438 T4s = T4q - T4r;
Chris@42 439 Im[0] = -(KP500000000 * (FNMS(KP923879532, T4k, T4d)));
Chris@42 440 Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T4k, T4d));
Chris@42 441 Rm[0] = KP500000000 * (FMA(KP923879532, T4u, T4t));
Chris@42 442 Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4u, T4t));
Chris@42 443 }
Chris@42 444 }
Chris@42 445 }
Chris@42 446 }
Chris@42 447 }
/* Final four stores, using the loop-scope temporaries T4n, T4o, T4p, T4s. */
Chris@42 448 Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4o, T4n));
Chris@42 449 Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4o, T4n));
Chris@42 450 Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4s, T4p)));
Chris@42 451 Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4s, T4p));
Chris@42 452 }
Chris@42 453 }
Chris@42 454 }
Chris@42 455
/*
 * Twiddle instruction table referenced by the codelet descriptor: four
 * TW_CEXP entries whose last field is 1, 3, 9 and 15, terminated by TW_NEXT.
 * NOTE(review): presumably each TW_CEXP entry yields one complex twiddle
 * (two reals), which would match the eight W values the kernel reads per
 * iteration -- confirm against the tw_instr definition.
 */
Chris@42 456 static const tw_instr twinstr[] = {
Chris@42 457 {TW_CEXP, 1, 1},
Chris@42 458 {TW_CEXP, 1, 3},
Chris@42 459 {TW_CEXP, 1, 9},
Chris@42 460 {TW_CEXP, 1, 15},
Chris@42 461 {TW_NEXT, 1, 0}
Chris@42 462 };
Chris@42 463
/*
 * Descriptor for the HAVE_FMA variant: 16, the codelet name string, the
 * twiddle instruction table, &GENUS, and the operation counts
 * {136, 74, 92, 0}.  The first three counts equal the additions,
 * multiplications and fused multiply/add figures quoted in the generated
 * header comment for this variant.
 */
Chris@42 464 static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, {136, 74, 92, 0} };
Chris@42 465
/*
 * Entry point for the HAVE_FMA build: passes planner p, the hc2cfdft2_16
 * kernel, its descriptor and the HC2C_VIA_DFT constant to X(khc2c_register).
 */
Chris@42 466 void X(codelet_hc2cfdft2_16) (planner *p) {
Chris@42 467 X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
Chris@42 468 }
Chris@42 469 #else /* HAVE_FMA */
Chris@42 470
Chris@42 471 /* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include hc2cf.h */
Chris@42 472
Chris@42 473 /*
Chris@42 474 * This function contains 228 FP additions, 124 FP multiplications,
Chris@42 475 * (or, 188 additions, 84 multiplications, 40 fused multiply/add),
Chris@42 476 * 91 stack variables, 4 constants, and 64 memory accesses
Chris@42 477 */
Chris@42 478 #include "hc2cf.h"
Chris@42 479
/*
 * hc2cfdft2_16 -- machine-generated codelet body, variant compiled when
 * HAVE_FMA is not defined (see the generator command line above: -n 16 -dit
 * -twiddle-log3 -precompute-twiddles).
 *
 * Parameters, as they are used in this function:
 *   Rp, Ip, Rm, Im  arrays that are both read and overwritten.  Elements are
 *                   addressed as X[0] and X[WS(rs, k)] for k = 1..7.
 *   W               twiddle table.  Eight values W[0]..W[7] are read per
 *                   iteration; the pointer starts at W + (mb - 1) * 8 and
 *                   advances by 8 each iteration.
 *   rs              stride handed to WS() for indexing within each array.
 *   mb, me          loop bounds: the body runs for m = mb, ..., me - 1.
 *   ms              per-iteration step: Rp and Ip move by +ms, Rm and Im
 *                   move by -ms.
 *
 * Constants: KP500000000 is 0.5; KP353553390, KP461939766 and KP191341716
 * are numerically 0.5 * sqrt(1/2), 0.5 * cos(pi/8) and 0.5 * sin(pi/8).
 *
 * Within one iteration all 32 input elements (Rp/Ip/Rm/Im at indices 0..7)
 * are loaded before the first store, and all 32 are then overwritten.
 */
Chris@42 480 static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
Chris@42 481 {
Chris@42 482 DK(KP461939766, +0.461939766255643378064091594698394143411208313);
Chris@42 483 DK(KP191341716, +0.191341716182544885864229992015199433380672281);
Chris@42 484 DK(KP353553390, +0.353553390593273762200422181052424519642417969);
Chris@42 485 DK(KP500000000, +0.500000000000000000000000000000000000000000000);
Chris@42 486 {
Chris@42 487 INT m;
Chris@42 488 for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
Chris@42 489 E T1, T4, T2, T5, T7, Td, T12, TY, Tk, Ti, Tm, T1l, T1b, TL, T1h;
Chris@42 490 E Ts, TR, T17, Ty, Tz, TA, TE, T1L, T1Q, T1H, T1O, T24, T2d, T20, T2b;
/* Twiddle setup: load W[0]..W[7] and form the additional factors used below
 * from sums and differences of their pairwise products. */
Chris@42 491 {
Chris@42 492 E Tl, TP, Tq, TK, Tj, TQ, Tr, TJ;
Chris@42 493 {
Chris@42 494 E T3, Tc, T6, Tb;
Chris@42 495 T1 = W[0];
Chris@42 496 T4 = W[1];
Chris@42 497 T2 = W[2];
Chris@42 498 T5 = W[3];
Chris@42 499 T3 = T1 * T2;
Chris@42 500 Tc = T4 * T2;
Chris@42 501 T6 = T4 * T5;
Chris@42 502 Tb = T1 * T5;
Chris@42 503 T7 = T3 + T6;
Chris@42 504 Td = Tb - Tc;
Chris@42 505 T12 = Tb + Tc;
Chris@42 506 TY = T3 - T6;
Chris@42 507 Tk = W[5];
Chris@42 508 Tl = T4 * Tk;
Chris@42 509 TP = T2 * Tk;
Chris@42 510 Tq = T1 * Tk;
Chris@42 511 TK = T5 * Tk;
Chris@42 512 Ti = W[4];
Chris@42 513 Tj = T1 * Ti;
Chris@42 514 TQ = T5 * Ti;
Chris@42 515 Tr = T4 * Ti;
Chris@42 516 TJ = T2 * Ti;
Chris@42 517 }
Chris@42 518 Tm = Tj - Tl;
Chris@42 519 T1l = Tq - Tr;
Chris@42 520 T1b = TP + TQ;
Chris@42 521 TL = TJ + TK;
Chris@42 522 T1h = Tj + Tl;
Chris@42 523 Ts = Tq + Tr;
Chris@42 524 TR = TP - TQ;
Chris@42 525 T17 = TJ - TK;
Chris@42 526 Ty = W[6];
Chris@42 527 Tz = W[7];
Chris@42 528 TA = FMA(T1, Ty, T4 * Tz);
Chris@42 529 TE = FNMS(T4, Ty, T1 * Tz);
Chris@42 530 {
Chris@42 531 E T1J, T1K, T1F, T1G;
Chris@42 532 T1J = TY * Tk;
Chris@42 533 T1K = T12 * Ti;
Chris@42 534 T1L = T1J - T1K;
Chris@42 535 T1Q = T1J + T1K;
Chris@42 536 T1F = TY * Ti;
Chris@42 537 T1G = T12 * Tk;
Chris@42 538 T1H = T1F + T1G;
Chris@42 539 T1O = T1F - T1G;
Chris@42 540 }
Chris@42 541 {
Chris@42 542 E T22, T23, T1Y, T1Z;
Chris@42 543 T22 = T7 * Tk;
Chris@42 544 T23 = Td * Ti;
Chris@42 545 T24 = T22 + T23;
Chris@42 546 T2d = T22 - T23;
Chris@42 547 T1Y = T7 * Ti;
Chris@42 548 T1Z = Td * Tk;
Chris@42 549 T20 = T1Y - T1Z;
Chris@42 550 T2b = T1Y + T1Z;
Chris@42 551 }
Chris@42 552 }
Chris@42 553 {
Chris@42 554 E T1t, T3i, T2l, T3B, T1E, T3t, T2M, T3x, T1g, T3C, T2J, T3u, T1T, T3w, T2o;
Chris@42 555 E T3j, Tx, T3b, T2C, T3q, T27, T3m, T2s, T3c, TW, T3f, T2F, T3n, T2g, T3p;
Chris@42 556 E T2v, T3e;
/* Load and twiddle the inputs at indices 4 and 0. */
Chris@42 557 {
Chris@42 558 E T1k, T1C, T1o, T1B, T1s, T1z, T1y, T2j, T1p, T2k;
Chris@42 559 {
Chris@42 560 E T1i, T1j, T1m, T1n;
Chris@42 561 T1i = Ip[WS(rs, 4)];
Chris@42 562 T1j = Im[WS(rs, 4)];
Chris@42 563 T1k = T1i - T1j;
Chris@42 564 T1C = T1i + T1j;
Chris@42 565 T1m = Rp[WS(rs, 4)];
Chris@42 566 T1n = Rm[WS(rs, 4)];
Chris@42 567 T1o = T1m + T1n;
Chris@42 568 T1B = T1m - T1n;
Chris@42 569 }
Chris@42 570 {
Chris@42 571 E T1q, T1r, T1w, T1x;
Chris@42 572 T1q = Ip[0];
Chris@42 573 T1r = Im[0];
Chris@42 574 T1s = T1q - T1r;
Chris@42 575 T1z = T1q + T1r;
Chris@42 576 T1w = Rm[0];
Chris@42 577 T1x = Rp[0];
Chris@42 578 T1y = T1w - T1x;
Chris@42 579 T2j = T1x + T1w;
Chris@42 580 }
Chris@42 581 T1p = FNMS(T1l, T1o, T1h * T1k);
Chris@42 582 T1t = T1p + T1s;
Chris@42 583 T3i = T1s - T1p;
Chris@42 584 T2k = FMA(T1h, T1o, T1l * T1k);
Chris@42 585 T2l = T2j + T2k;
Chris@42 586 T3B = T2j - T2k;
Chris@42 587 {
Chris@42 588 E T1A, T1D, T2K, T2L;
Chris@42 589 T1A = FNMS(T4, T1z, T1 * T1y);
Chris@42 590 T1D = FMA(Ti, T1B, Tk * T1C);
Chris@42 591 T1E = T1A - T1D;
Chris@42 592 T3t = T1D + T1A;
Chris@42 593 T2K = FNMS(Tk, T1B, Ti * T1C);
Chris@42 594 T2L = FMA(T4, T1y, T1 * T1z);
Chris@42 595 T2M = T2K + T2L;
Chris@42 596 T3x = T2L - T2K;
Chris@42 597 }
Chris@42 598 }
/* Load and twiddle the inputs at indices 2 and 6. */
Chris@42 599 {
Chris@42 600 E T11, T1M, T15, T1I, T1a, T1R, T1e, T1P;
Chris@42 601 {
Chris@42 602 E TZ, T10, T13, T14;
Chris@42 603 TZ = Ip[WS(rs, 2)];
Chris@42 604 T10 = Im[WS(rs, 2)];
Chris@42 605 T11 = TZ - T10;
Chris@42 606 T1M = TZ + T10;
Chris@42 607 T13 = Rp[WS(rs, 2)];
Chris@42 608 T14 = Rm[WS(rs, 2)];
Chris@42 609 T15 = T13 + T14;
Chris@42 610 T1I = T13 - T14;
Chris@42 611 }
Chris@42 612 {
Chris@42 613 E T18, T19, T1c, T1d;
Chris@42 614 T18 = Ip[WS(rs, 6)];
Chris@42 615 T19 = Im[WS(rs, 6)];
Chris@42 616 T1a = T18 - T19;
Chris@42 617 T1R = T18 + T19;
Chris@42 618 T1c = Rp[WS(rs, 6)];
Chris@42 619 T1d = Rm[WS(rs, 6)];
Chris@42 620 T1e = T1c + T1d;
Chris@42 621 T1P = T1c - T1d;
Chris@42 622 }
Chris@42 623 {
Chris@42 624 E T16, T1f, T2H, T2I;
Chris@42 625 T16 = FNMS(T12, T15, TY * T11);
Chris@42 626 T1f = FNMS(T1b, T1e, T17 * T1a);
Chris@42 627 T1g = T16 + T1f;
Chris@42 628 T3C = T16 - T1f;
Chris@42 629 T2H = FNMS(T1L, T1I, T1H * T1M);
Chris@42 630 T2I = FNMS(T1Q, T1P, T1O * T1R);
Chris@42 631 T2J = T2H + T2I;
Chris@42 632 T3u = T2H - T2I;
Chris@42 633 }
Chris@42 634 {
Chris@42 635 E T1N, T1S, T2m, T2n;
Chris@42 636 T1N = FMA(T1H, T1I, T1L * T1M);
Chris@42 637 T1S = FMA(T1O, T1P, T1Q * T1R);
Chris@42 638 T1T = T1N + T1S;
Chris@42 639 T3w = T1S - T1N;
Chris@42 640 T2m = FMA(TY, T15, T12 * T11);
Chris@42 641 T2n = FMA(T17, T1e, T1b * T1a);
Chris@42 642 T2o = T2m + T2n;
Chris@42 643 T3j = T2m - T2n;
Chris@42 644 }
Chris@42 645 }
/* Load and twiddle the inputs at indices 1 and 5. */
Chris@42 646 {
Chris@42 647 E Ta, T1W, Tg, T1V, Tp, T25, Tv, T21;
Chris@42 648 {
Chris@42 649 E T8, T9, Te, Tf;
Chris@42 650 T8 = Ip[WS(rs, 1)];
Chris@42 651 T9 = Im[WS(rs, 1)];
Chris@42 652 Ta = T8 - T9;
Chris@42 653 T1W = T8 + T9;
Chris@42 654 Te = Rp[WS(rs, 1)];
Chris@42 655 Tf = Rm[WS(rs, 1)];
Chris@42 656 Tg = Te + Tf;
Chris@42 657 T1V = Te - Tf;
Chris@42 658 }
Chris@42 659 {
Chris@42 660 E Tn, To, Tt, Tu;
Chris@42 661 Tn = Ip[WS(rs, 5)];
Chris@42 662 To = Im[WS(rs, 5)];
Chris@42 663 Tp = Tn - To;
Chris@42 664 T25 = Tn + To;
Chris@42 665 Tt = Rp[WS(rs, 5)];
Chris@42 666 Tu = Rm[WS(rs, 5)];
Chris@42 667 Tv = Tt + Tu;
Chris@42 668 T21 = Tt - Tu;
Chris@42 669 }
Chris@42 670 {
Chris@42 671 E Th, Tw, T2A, T2B;
Chris@42 672 Th = FNMS(Td, Tg, T7 * Ta);
Chris@42 673 Tw = FNMS(Ts, Tv, Tm * Tp);
Chris@42 674 Tx = Th + Tw;
Chris@42 675 T3b = Th - Tw;
Chris@42 676 T2A = FNMS(T5, T1V, T2 * T1W);
Chris@42 677 T2B = FNMS(T24, T21, T20 * T25);
Chris@42 678 T2C = T2A + T2B;
Chris@42 679 T3q = T2A - T2B;
Chris@42 680 }
Chris@42 681 {
Chris@42 682 E T1X, T26, T2q, T2r;
Chris@42 683 T1X = FMA(T2, T1V, T5 * T1W);
Chris@42 684 T26 = FMA(T20, T21, T24 * T25);
Chris@42 685 T27 = T1X + T26;
Chris@42 686 T3m = T26 - T1X;
Chris@42 687 T2q = FMA(T7, Tg, Td * Ta);
Chris@42 688 T2r = FMA(Tm, Tv, Ts * Tp);
Chris@42 689 T2s = T2q + T2r;
Chris@42 690 T3c = T2q - T2r;
Chris@42 691 }
Chris@42 692 }
/* Load and twiddle the inputs at indices 7 and 3. */
Chris@42 693 {
Chris@42 694 E TD, T29, TH, T28, TO, T2e, TU, T2c;
Chris@42 695 {
Chris@42 696 E TB, TC, TF, TG;
Chris@42 697 TB = Ip[WS(rs, 7)];
Chris@42 698 TC = Im[WS(rs, 7)];
Chris@42 699 TD = TB - TC;
Chris@42 700 T29 = TB + TC;
Chris@42 701 TF = Rp[WS(rs, 7)];
Chris@42 702 TG = Rm[WS(rs, 7)];
Chris@42 703 TH = TF + TG;
Chris@42 704 T28 = TF - TG;
Chris@42 705 }
Chris@42 706 {
Chris@42 707 E TM, TN, TS, TT;
Chris@42 708 TM = Ip[WS(rs, 3)];
Chris@42 709 TN = Im[WS(rs, 3)];
Chris@42 710 TO = TM - TN;
Chris@42 711 T2e = TM + TN;
Chris@42 712 TS = Rp[WS(rs, 3)];
Chris@42 713 TT = Rm[WS(rs, 3)];
Chris@42 714 TU = TS + TT;
Chris@42 715 T2c = TS - TT;
Chris@42 716 }
Chris@42 717 {
Chris@42 718 E TI, TV, T2D, T2E;
Chris@42 719 TI = FNMS(TE, TH, TA * TD);
Chris@42 720 TV = FNMS(TR, TU, TL * TO);
Chris@42 721 TW = TI + TV;
Chris@42 722 T3f = TI - TV;
Chris@42 723 T2D = FNMS(Tz, T28, Ty * T29);
Chris@42 724 T2E = FNMS(T2d, T2c, T2b * T2e);
Chris@42 725 T2F = T2D + T2E;
Chris@42 726 T3n = T2D - T2E;
Chris@42 727 }
Chris@42 728 {
Chris@42 729 E T2a, T2f, T2t, T2u;
Chris@42 730 T2a = FMA(Ty, T28, Tz * T29);
Chris@42 731 T2f = FMA(T2b, T2c, T2d * T2e);
Chris@42 732 T2g = T2a + T2f;
Chris@42 733 T3p = T2f - T2a;
Chris@42 734 T2t = FMA(TA, TH, TE * TD);
Chris@42 735 T2u = FMA(TL, TU, TR * TO);
Chris@42 736 T2v = T2t + T2u;
Chris@42 737 T3e = T2t - T2u;
Chris@42 738 }
Chris@42 739 }
/* All inputs are loaded.  First group of stores: Ip[0], Rp[0], Im[7], Rm[7],
 * Rm[3], Im[3], Rp[4], Ip[4] (indices are WS(rs, k) positions). */
Chris@42 740 {
Chris@42 741 E T1v, T2z, T2O, T2Q, T2i, T2y, T2x, T2P;
Chris@42 742 {
Chris@42 743 E TX, T1u, T2G, T2N;
Chris@42 744 TX = Tx + TW;
Chris@42 745 T1u = T1g + T1t;
Chris@42 746 T1v = TX + T1u;
Chris@42 747 T2z = T1u - TX;
Chris@42 748 T2G = T2C + T2F;
Chris@42 749 T2N = T2J + T2M;
Chris@42 750 T2O = T2G - T2N;
Chris@42 751 T2Q = T2G + T2N;
Chris@42 752 }
Chris@42 753 {
Chris@42 754 E T1U, T2h, T2p, T2w;
Chris@42 755 T1U = T1E - T1T;
Chris@42 756 T2h = T27 + T2g;
Chris@42 757 T2i = T1U - T2h;
Chris@42 758 T2y = T2h + T1U;
Chris@42 759 T2p = T2l + T2o;
Chris@42 760 T2w = T2s + T2v;
Chris@42 761 T2x = T2p - T2w;
Chris@42 762 T2P = T2p + T2w;
Chris@42 763 }
Chris@42 764 Ip[0] = KP500000000 * (T1v + T2i);
Chris@42 765 Rp[0] = KP500000000 * (T2P + T2Q);
Chris@42 766 Im[WS(rs, 7)] = KP500000000 * (T2i - T1v);
Chris@42 767 Rm[WS(rs, 7)] = KP500000000 * (T2P - T2Q);
Chris@42 768 Rm[WS(rs, 3)] = KP500000000 * (T2x - T2y);
Chris@42 769 Im[WS(rs, 3)] = KP500000000 * (T2O - T2z);
Chris@42 770 Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
Chris@42 771 Ip[WS(rs, 4)] = KP500000000 * (T2z + T2O);
Chris@42 772 }
/* Second group of stores: Ip[2], Im[5], Rm[5], Rp[2], Rm[1], Rp[6], Ip[6],
 * Im[1]. */
Chris@42 773 {
Chris@42 774 E T2T, T35, T33, T39, T2W, T36, T2Z, T37;
Chris@42 775 {
Chris@42 776 E T2R, T2S, T31, T32;
Chris@42 777 T2R = T2v - T2s;
Chris@42 778 T2S = T1t - T1g;
Chris@42 779 T2T = KP500000000 * (T2R + T2S);
Chris@42 780 T35 = KP500000000 * (T2S - T2R);
Chris@42 781 T31 = T2l - T2o;
Chris@42 782 T32 = Tx - TW;
Chris@42 783 T33 = KP500000000 * (T31 - T32);
Chris@42 784 T39 = KP500000000 * (T31 + T32);
Chris@42 785 }
Chris@42 786 {
Chris@42 787 E T2U, T2V, T2X, T2Y;
Chris@42 788 T2U = T2F - T2C;
Chris@42 789 T2V = T27 - T2g;
Chris@42 790 T2W = T2U + T2V;
Chris@42 791 T36 = T2U - T2V;
Chris@42 792 T2X = T1T + T1E;
Chris@42 793 T2Y = T2M - T2J;
Chris@42 794 T2Z = T2X - T2Y;
Chris@42 795 T37 = T2X + T2Y;
Chris@42 796 }
Chris@42 797 {
Chris@42 798 E T30, T3a, T34, T38;
Chris@42 799 T30 = KP353553390 * (T2W + T2Z);
Chris@42 800 Ip[WS(rs, 2)] = T2T + T30;
Chris@42 801 Im[WS(rs, 5)] = T30 - T2T;
Chris@42 802 T3a = KP353553390 * (T36 + T37);
Chris@42 803 Rm[WS(rs, 5)] = T39 - T3a;
Chris@42 804 Rp[WS(rs, 2)] = T39 + T3a;
Chris@42 805 T34 = KP353553390 * (T2Z - T2W);
Chris@42 806 Rm[WS(rs, 1)] = T33 - T34;
Chris@42 807 Rp[WS(rs, 6)] = T33 + T34;
Chris@42 808 T38 = KP353553390 * (T36 - T37);
Chris@42 809 Ip[WS(rs, 6)] = T35 + T38;
Chris@42 810 Im[WS(rs, 1)] = T38 - T35;
Chris@42 811 }
Chris@42 812 }
/* Remaining sixteen stores (the outputs not written by the two groups
 * above); these use the KP461939766 / KP191341716 constants. */
Chris@42 813 {
Chris@42 814 E T3k, T3Q, T3Z, T3D, T3h, T40, T3X, T45, T3G, T3P, T3s, T3K, T3U, T44, T3z;
Chris@42 815 E T3L;
Chris@42 816 {
Chris@42 817 E T3d, T3g, T3o, T3r;
Chris@42 818 T3k = KP500000000 * (T3i - T3j);
Chris@42 819 T3Q = KP500000000 * (T3j + T3i);
Chris@42 820 T3Z = KP500000000 * (T3B - T3C);
Chris@42 821 T3D = KP500000000 * (T3B + T3C);
Chris@42 822 T3d = T3b - T3c;
Chris@42 823 T3g = T3e + T3f;
Chris@42 824 T3h = KP353553390 * (T3d + T3g);
Chris@42 825 T40 = KP353553390 * (T3d - T3g);
Chris@42 826 {
Chris@42 827 E T3V, T3W, T3E, T3F;
Chris@42 828 T3V = T3u + T3t;
Chris@42 829 T3W = T3x - T3w;
Chris@42 830 T3X = FNMS(KP461939766, T3W, KP191341716 * T3V);
Chris@42 831 T45 = FMA(KP461939766, T3V, KP191341716 * T3W);
Chris@42 832 T3E = T3c + T3b;
Chris@42 833 T3F = T3e - T3f;
Chris@42 834 T3G = KP353553390 * (T3E + T3F);
Chris@42 835 T3P = KP353553390 * (T3F - T3E);
Chris@42 836 }
Chris@42 837 T3o = T3m + T3n;
Chris@42 838 T3r = T3p - T3q;
Chris@42 839 T3s = FMA(KP191341716, T3o, KP461939766 * T3r);
Chris@42 840 T3K = FNMS(KP191341716, T3r, KP461939766 * T3o);
Chris@42 841 {
Chris@42 842 E T3S, T3T, T3v, T3y;
Chris@42 843 T3S = T3n - T3m;
Chris@42 844 T3T = T3q + T3p;
Chris@42 845 T3U = FMA(KP461939766, T3S, KP191341716 * T3T);
Chris@42 846 T44 = FNMS(KP461939766, T3T, KP191341716 * T3S);
Chris@42 847 T3v = T3t - T3u;
Chris@42 848 T3y = T3w + T3x;
Chris@42 849 T3z = FNMS(KP191341716, T3y, KP461939766 * T3v);
Chris@42 850 T3L = FMA(KP191341716, T3v, KP461939766 * T3y);
Chris@42 851 }
Chris@42 852 }
Chris@42 853 {
Chris@42 854 E T3l, T3A, T3N, T3O;
Chris@42 855 T3l = T3h + T3k;
Chris@42 856 T3A = T3s + T3z;
Chris@42 857 Ip[WS(rs, 1)] = T3l + T3A;
Chris@42 858 Im[WS(rs, 6)] = T3A - T3l;
Chris@42 859 T3N = T3D + T3G;
Chris@42 860 T3O = T3K + T3L;
Chris@42 861 Rm[WS(rs, 6)] = T3N - T3O;
Chris@42 862 Rp[WS(rs, 1)] = T3N + T3O;
Chris@42 863 }
Chris@42 864 {
Chris@42 865 E T3H, T3I, T3J, T3M;
Chris@42 866 T3H = T3D - T3G;
Chris@42 867 T3I = T3z - T3s;
Chris@42 868 Rm[WS(rs, 2)] = T3H - T3I;
Chris@42 869 Rp[WS(rs, 5)] = T3H + T3I;
Chris@42 870 T3J = T3k - T3h;
Chris@42 871 T3M = T3K - T3L;
Chris@42 872 Ip[WS(rs, 5)] = T3J + T3M;
Chris@42 873 Im[WS(rs, 2)] = T3M - T3J;
Chris@42 874 }
Chris@42 875 {
Chris@42 876 E T3R, T3Y, T47, T48;
Chris@42 877 T3R = T3P + T3Q;
Chris@42 878 T3Y = T3U + T3X;
Chris@42 879 Ip[WS(rs, 3)] = T3R + T3Y;
Chris@42 880 Im[WS(rs, 4)] = T3Y - T3R;
Chris@42 881 T47 = T3Z + T40;
Chris@42 882 T48 = T44 + T45;
Chris@42 883 Rm[WS(rs, 4)] = T47 - T48;
Chris@42 884 Rp[WS(rs, 3)] = T47 + T48;
Chris@42 885 }
Chris@42 886 {
Chris@42 887 E T41, T42, T43, T46;
Chris@42 888 T41 = T3Z - T40;
Chris@42 889 T42 = T3X - T3U;
Chris@42 890 Rm[0] = T41 - T42;
Chris@42 891 Rp[WS(rs, 7)] = T41 + T42;
Chris@42 892 T43 = T3Q - T3P;
Chris@42 893 T46 = T44 - T45;
Chris@42 894 Ip[WS(rs, 7)] = T43 + T46;
Chris@42 895 Im[0] = T46 - T43;
Chris@42 896 }
Chris@42 897 }
Chris@42 898 }
Chris@42 899 }
Chris@42 900 }
Chris@42 901 }
Chris@42 902
/*
 * Twiddle instruction table referenced by the codelet descriptor: four
 * TW_CEXP entries whose last field is 1, 3, 9 and 15, terminated by TW_NEXT.
 * NOTE(review): presumably each TW_CEXP entry yields one complex twiddle
 * (two reals), which would match the eight W values the kernel reads per
 * iteration -- confirm against the tw_instr definition.
 */
Chris@42 903 static const tw_instr twinstr[] = {
Chris@42 904 {TW_CEXP, 1, 1},
Chris@42 905 {TW_CEXP, 1, 3},
Chris@42 906 {TW_CEXP, 1, 9},
Chris@42 907 {TW_CEXP, 1, 15},
Chris@42 908 {TW_NEXT, 1, 0}
Chris@42 909 };
Chris@42 910
/*
 * Descriptor for the non-FMA variant: 16, the codelet name string, the
 * twiddle instruction table, &GENUS, and the operation counts
 * {188, 84, 40, 0}.  The first three counts equal the additions,
 * multiplications and fused multiply/add figures quoted in the generated
 * header comment for this variant.
 */
Chris@42 911 static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, {188, 84, 40, 0} };
Chris@42 912
/*
 * Entry point for the non-FMA build: passes planner p, the hc2cfdft2_16
 * kernel, its descriptor and the HC2C_VIA_DFT constant to X(khc2c_register).
 */
Chris@42 913 void X(codelet_hc2cfdft2_16) (planner *p) {
Chris@42 914 X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
Chris@42 915 }
Chris@42 916 #endif /* HAVE_FMA */